HiveBrain v1.2.0
Get Started
← Back to all entries
patternMinor

Extracting data from a Word document is too slow

Submitted by: @import:stackexchange-codereview··
0
Viewed 0 times
tooslowworddocumentextractingfromdata

Problem

I have a 100page long .docx format document. I'm using a macro written in VBS to extract some information and then just generate a table from them. I iterate through the paragraphs and store the found strings in 3 separate arrays.

However, the loop is unreasonably slow. It takes 3 min to complete on a relatively fast computer. Can you take a look at it and tell me what causes this slowness?

```
'TODO : Add checks, exception handling, dynamic user options, probes, style checks, and fix slowness.

Sub genTable()

Dim objDoc

'''''''''''''''''''''Modify these''''''''''''''''
Dim ColumnName1, ColumnName2, ColumnName3, magicString
ColumnName1 = "foo1"
ColumnName2 = "foo2"
ColumnName3 = "foo3"
magicString = "ASD321: " ' we search for this string

Set objDoc = ActiveDocument ' Because we run inside of word as macro
''''''''''''''''''''''''''''''''''''''''''''''''''

Const MAX = 200 ' using fixed sized arrays, mod this if you'll have more than 200 entries.
Dim vulnerabilityArr(MAX) ' initializing the arrays
Dim severityArr(MAX)
Dim paragraphArr(MAX)

Dim counter ' will count the processed entries in this
counter = 0

Dim currParagraph ' will be set in loop
Dim tmpArray() As String ' for splitting
For pIndex = 1 To objDoc.Paragraphs.Count ' THIS LOOP IS SLOW
currParagraph = objDoc.Paragraphs(pIndex)
currParagraph = Left(currParagraph, Len(currParagraph) - 1) 'remove junk character
If InStr(1, currParagraph, magicString) > 0 Then ' assuming this string is always present and the other two target data is near it
tmpArray = Split(currParagraph) ' extract level
currParagraph = objDoc.Paragraphs(pIndex - 1) 'assuming the previous paragraph is the vuln. name

'Storing the 3 extracted data
vulnerabilityArr(counter) = currParagraph
severityArr(counter) = tmpArray(1)
paragraphArr(counter) = objDoc.Paragraphs(pIndex - 1).Range.ListFormat.ListString ' for some weird reason I cant use c

Solution

Maybe I'm deeply wrong, but please test:

-- Almost all your variables are of type Variant, even the ones which could be Long, like counter. Define each variable with the correct type (it will be faster), in the form:

Option Explicit
  Dim counter as Long
  Dim ColumnName1 as String, ColumnName2 as String


-- You dont really use pIndex. Try:

Dim currParagraph As Paragraph               ' will be set in loop
For Each currParagraph In objDoc.Paragraphs  ' THIS LOOP IS SLOW


-- I dont see why you make a copy of the paragraph only to 'remove junk character ?
for some weird reason I cant use currParagraph here: I have always heard that this information is with this junk character stick

All will:

'TODO : Add checks, exception handling, dynamic user options, probes, style checks, and fix slowness.
Option Explicit
Sub genTable()

Dim objDoc

'''''''''''''''''''''Modify these''''''''''''''''
Dim ColumnName1 As String, ColumnName2 As String, ColumnName3 As String, magicString As String
ColumnName1 = "foo1"
ColumnName2 = "foo2"
ColumnName3 = "foo3"
magicString = "ASD321: " ' we search for this string

Set objDoc = ActiveDocument ' Because we run inside of word as macro
''''''''''''''''''''''''''''''''''''''''''''''''''

Const MAX = 200 ' using fixed sized arrays, mod this if you'll have more than 200 entries.
Dim vulnerabilityArr(MAX) As Paragraph ' initializing the arrays
Dim severityArr(MAX) As String
Dim paragraphArr(MAX) As String

Dim counter As Long ' will count the processed entries in this
counter = 0

Dim tmpArray() As String ' for splitting
Dim currParagraph As Paragraph, prevParagraph As Paragraph ' will be set in loop
For Each currParagraph In objDoc.Paragraphs  ' THIS LOOP IS SLOW
                                                  ' currParagraph = Left(currParagraph, Len(currParagraph) - 1) 'remove junk character
    If InStr(currParagraph, magicString) > 0 Then ' assuming this string is always present and the other two target data is near it
        tmpArray = Split(currParagraph.Range.Text)             ' extract level
        Set prevParagraph = currParagraph.Previous    ' assuming the previous paragraph is the vuln. name

            'Storing the 3 extracted data
        Set vulnerabilityArr(counter) = prevParagraph
        severityArr(counter) = tmpArray(1)
        paragraphArr(counter) = prevParagraph.Range.ListFormat.ListString ' for some weird reason I cant use currParagraph here

        counter = counter + 1 ' adjusting index
    End If
Next currParagraph

Dim objTable As Table
Set objTable = objDoc.Tables.Add(objDoc.Paragraphs(objDoc.Paragraphs.Count).Range, counter + 1, 3, True, True)
                                 'Set objTable = objDoc.Tables(objDoc.Tables.Count) 'select last table

objTable.Cell(1, 1).Range.Text = ColumnName1
objTable.Cell(1, 2).Range.Text = ColumnName2
objTable.Cell(1, 3).Range.Text = ColumnName3

Dim RowIndex As Long, RowIndexy As Long

For RowIndex = 0 To counter - 1
    objTable.Cell(RowIndex + 2, 1).Range.Text = paragraphArr(RowIndex)
    objTable.Cell(RowIndex + 2, 2).Range.Text = vulnerabilityArr(RowIndex).Range.Text
    objTable.Cell(RowIndex + 2, 3).Range.Text = severityArr(RowIndex)
Next
RowIndexy = RowIndex + 1

objTable.AutoFormat (25)

End Sub

Code Snippets

Option Explicit
  Dim counter as Long
  Dim ColumnName1 as String, ColumnName2 as String
Dim currParagraph As Paragraph               ' will be set in loop
For Each currParagraph In objDoc.Paragraphs  ' THIS LOOP IS SLOW
'TODO : Add checks, exception handling, dynamic user options, probes, style checks, and fix slowness.
Option Explicit
Sub genTable()

Dim objDoc

'''''''''''''''''''''Modify these''''''''''''''''
Dim ColumnName1 As String, ColumnName2 As String, ColumnName3 As String, magicString As String
ColumnName1 = "foo1"
ColumnName2 = "foo2"
ColumnName3 = "foo3"
magicString = "ASD321: " ' we search for this string

Set objDoc = ActiveDocument ' Because we run inside of word as macro
''''''''''''''''''''''''''''''''''''''''''''''''''

Const MAX = 200 ' using fixed sized arrays, mod this if you'll have more than 200 entries.
Dim vulnerabilityArr(MAX) As Paragraph ' initializing the arrays
Dim severityArr(MAX) As String
Dim paragraphArr(MAX) As String

Dim counter As Long ' will count the processed entries in this
counter = 0

Dim tmpArray() As String ' for splitting
Dim currParagraph As Paragraph, prevParagraph As Paragraph ' will be set in loop
For Each currParagraph In objDoc.Paragraphs  ' THIS LOOP IS SLOW
                                                  ' currParagraph = Left(currParagraph, Len(currParagraph) - 1) 'remove junk character
    If InStr(currParagraph, magicString) > 0 Then ' assuming this string is always present and the other two target data is near it
        tmpArray = Split(currParagraph.Range.Text)             ' extract level
        Set prevParagraph = currParagraph.Previous    ' assuming the previous paragraph is the vuln. name

            'Storing the 3 extracted data
        Set vulnerabilityArr(counter) = prevParagraph
        severityArr(counter) = tmpArray(1)
        paragraphArr(counter) = prevParagraph.Range.ListFormat.ListString ' for some weird reason I cant use currParagraph here

        counter = counter + 1 ' adjusting index
    End If
Next currParagraph

Dim objTable As Table
Set objTable = objDoc.Tables.Add(objDoc.Paragraphs(objDoc.Paragraphs.Count).Range, counter + 1, 3, True, True)
                                 'Set objTable = objDoc.Tables(objDoc.Tables.Count) 'select last table

objTable.Cell(1, 1).Range.Text = ColumnName1
objTable.Cell(1, 2).Range.Text = ColumnName2
objTable.Cell(1, 3).Range.Text = ColumnName3

Dim RowIndex As Long, RowIndexy As Long

For RowIndex = 0 To counter - 1
    objTable.Cell(RowIndex + 2, 1).Range.Text = paragraphArr(RowIndex)
    objTable.Cell(RowIndex + 2, 2).Range.Text = vulnerabilityArr(RowIndex).Range.Text
    objTable.Cell(RowIndex + 2, 3).Range.Text = severityArr(RowIndex)
Next
RowIndexy = RowIndex + 1

objTable.AutoFormat (25)

End Sub

Context

StackExchange Code Review Q#25123, answer score: 7

Revisions (0)

No revisions yet.