patterncsharpMinor
Reading large file, splitting by new line
Viewed 0 times
readingfilenewlinesplittinglarge
Problem
I am reading a ~120 MB log file (~300 Million lines of text).
Each line is created programatically, so the
MakeLogLine
ExtractRouteId
An example of how I extract the text from the line.
GetAllLines
```
'''
''' Gets all the lines in the file at the path, puts them in the list of lines.
'''
''' True if successful, false if not.
Public Function GetAllLines() As Boolean
If Not CheckFile() Then Return Nothing
Con
...
30/05/2014 15:07:18 0000030-0000738 Net SourceID 11000006
30/05/2014 15:07:18 0000030-0000738 Communication RouteID 108
30/05/2014 15:14:44 0000030-0002027 Communication Route_Complete Already Completed
30/05/2014 15:07:40 0000030-0000739 Communication RouteID Selected 108
...Each line is created programatically, so the
MakeLogLine function can extract the exact information needed.MakeLogLine
Private Function MakeLogLine(ByVal l As String, Optional ByVal lineNum As Integer = 0) As LogLine
' Trim any leading / trailing whitespace.'
l = l.Trim()
' Extracts route Id separately to keep track of the ones that have been tagged.'
Dim rid As String = ExtractRouteId(l)
If rid IsNot Nothing Then
Call Me.linesWithRouteIds.Add(New rIdLines With {.line = line, .rid = rid})
End If
Return New LogLine With {.lstr = l,
.threadId = ExtractThreadId(l),
.requestId = ExtractRequestId(l),
.lineNum = lineNum,
.sourceId = ExtractSourceId(l),
.performanceId = ExtractPerformanceId(l),
.routeId = rid,
.methodName = ExtractMethodName(l),
.time = ExtractTime(l)}
End FunctionExtractRouteId
An example of how I extract the text from the line.
Private Function ExtractRouteId(ByVal l As String) As String
Dim rid As String
If l.Length 8 AndAlso rid(0) = "S"c Then Return rid.Substring(9, rid.Length - 9)
Catch ex As Exception
Return rid
End Try
Return rid
End FunctionGetAllLines
```
'''
''' Gets all the lines in the file at the path, puts them in the list of lines.
'''
''' True if successful, false if not.
Public Function GetAllLines() As Boolean
If Not CheckFile() Then Return Nothing
Con
Solution
Good
The shortcircuit condition aka
The
Magic numbers are hidden behind const.
Bad
Using of Call, which is kept only for backward compatibility.
Returning
Returning
Using of single character parameters.
Inconsistent parameter naming
Sometimes
Refactoring
Let us start with the
So for testing purpose we should change the return type to a
The variable
As we use
We should also rename the array
Removing the
Last the
Now let us take a look at the
We will for readability change the name of the inputparameter to
For the same purpose we will rename
As we could count the length of
We are only interested in the part of the string which starts at an specified index, so we can use the overloaded
As I need to leave the office now, I will just place the
```
Private Function MakeLogLine(ByVal currentLine As String, Optional lineNumber As Integer = 0) As LogLine
currentLine = currentLine.Trim()
' Extracts route Id separately to keep track of the ones that have been tagged.'
Dim routeId As String = ExtractRouteId(currentLine)
If String.IsNullOrEmpty(routeId) Then
Me.linesWithRouteIds.Add(New rIdLines With {.line = line, .rid = routeId})
End If
Return New LogLine With {.lstr = currentLine,
.threadId = ExtractThreadId(currentLine),
.requestId = ExtractRequestId(currentLine),
.lineNum = lineNumber,
.sourceId = ExtractSourceId(currentLine),
.performanceId = ExtractPerformanceId(currentLine),
.routeId = routeId ,
.methodName = ExtractM
The shortcircuit condition aka
AndAlso is used.The
StreamReader is enclosed inside a using statement.Magic numbers are hidden behind const.
StringBuilder is constructed using a initial size.Bad
StringBuilder is constructed using a initial size which is not high enough.Using of Call, which is kept only for backward compatibility.
Returning
Nothing instead of False (GetAllLines).Returning
Nothing instead of String.Empty (ExtractRouteId).Using of single character parameters.
Inconsistent parameter naming
Sometimes
camelCased which is the right choice regarding the naming convention, but sometime PascalCased which is wrong for parameters/variables. Refactoring
Let us start with the
GetAllLines() method. If we read the methodname we would expect that the method will return lines, but instead it returns True or Nothing.So for testing purpose we should change the return type to a
List(Of LogLine) and add an inputparameter fileName As String. In this way the method won't be coupled tightly to the Me object what as a guess will be a form.The variable
lineNum is only used to count the added LogLines which is useless, so let us remove it.As we use
Split = .ToString.Split(CChar(Environment.NewLine)) at each iteration of the Do until loop, we should extract CChar(Environment.NewLine) somehow outside the loop.We should also rename the array
Split to something more meaningful and something which isn't a known methodname.Removing the
With builder .. End With will save us one indentionlevel which makes our code more readable.Last the
CheckFile() method should be renamed to IsFileValid() and should have an inputparameter fileName As String. Public Function GetAllLines(fileName As String) As List(Of LogLine)
Dim logLines As New List(Of LogLine)()
If Not CheckFile(fileName) Then Return logLines
Const MaxLineSize As Integer = 4096
Const stringBuilderCapacity As Integer = 8192
Dim buffer(MaxLineSize - 1) As Char
Dim charCount As Integer
Dim builder As New System.Text.StringBuilder(stringBuilderCapacity)
Dim singleLines As String()
Dim newLineChars As Char() = Environment.NewLine.ToCharArray()
Dim lineNumber As Integer = 0
Using reader As New System.IO.StreamReader(fileName)
Do Until reader.EndOfStream
charCount = reader.Read(buffer, 0, MaxLineSize)
builder.Append(buffer, 0, charCount)
If reader.Peek() >= 0 Then
builder.Append(reader.ReadLine())
End If
singleLines = builder.ToString().Split(newLineChars)
For Each singleLine As String In singleLines
logLines.Add(MakeLogLine(singleLine, lineNumber))
lineNumber += 1
Next
builder.Clear()
Loop
End Using
Return logLines
End FunctionNow let us take a look at the
ExtractRouteId() method.We will for readability change the name of the inputparameter to
currentLine.For the same purpose we will rename
rid to routeId.As we could count the length of
RouteID X by hand, we should create a const out of our result and use it, but as we return String.Empty if the Length < 59 we won't need to check if rid.Length < "RouteID X" at all as this will be true. We are only interested in the part of the string which starts at an specified index, so we can use the overloaded
SubString() method which takes only the startindex and returns the string starting at this index till the end.Private Const minRouteIdLength As Integer = 9
Private Function ExtractRouteId(ByVal currentLine As String) As String
Dim routeId As String = String.Empty
If currentLine.Length minRouteIdLength AndAlso routeId(0) = "S"c Then
routeId = routeId.Substring(minRouteIdLength)
End If
Return routeId
End FunctionAs I need to leave the office now, I will just place the
MakeLogLine() method here and will add some explanation tomorrow. ```
Private Function MakeLogLine(ByVal currentLine As String, Optional lineNumber As Integer = 0) As LogLine
currentLine = currentLine.Trim()
' Extracts route Id separately to keep track of the ones that have been tagged.'
Dim routeId As String = ExtractRouteId(currentLine)
If String.IsNullOrEmpty(routeId) Then
Me.linesWithRouteIds.Add(New rIdLines With {.line = line, .rid = routeId})
End If
Return New LogLine With {.lstr = currentLine,
.threadId = ExtractThreadId(currentLine),
.requestId = ExtractRequestId(currentLine),
.lineNum = lineNumber,
.sourceId = ExtractSourceId(currentLine),
.performanceId = ExtractPerformanceId(currentLine),
.routeId = routeId ,
.methodName = ExtractM
Code Snippets
Public Function GetAllLines(fileName As String) As List(Of LogLine)
Dim logLines As New List(Of LogLine)()
If Not CheckFile(fileName) Then Return logLines
Const MaxLineSize As Integer = 4096
Const stringBuilderCapacity As Integer = 8192
Dim buffer(MaxLineSize - 1) As Char
Dim charCount As Integer
Dim builder As New System.Text.StringBuilder(stringBuilderCapacity)
Dim singleLines As String()
Dim newLineChars As Char() = Environment.NewLine.ToCharArray()
Dim lineNumber As Integer = 0
Using reader As New System.IO.StreamReader(fileName)
Do Until reader.EndOfStream
charCount = reader.Read(buffer, 0, MaxLineSize)
builder.Append(buffer, 0, charCount)
If reader.Peek() >= 0 Then
builder.Append(reader.ReadLine())
End If
singleLines = builder.ToString().Split(newLineChars)
For Each singleLine As String In singleLines
logLines.Add(MakeLogLine(singleLine, lineNumber))
lineNumber += 1
Next
builder.Clear()
Loop
End Using
Return logLines
End FunctionPrivate Const minRouteIdLength As Integer = 9
Private Function ExtractRouteId(ByVal currentLine As String) As String
Dim routeId As String = String.Empty
If currentLine.Length < 59 OrElse _
Not (currentLine(50) = "R"c) OrElse _
Not (currentLine(55) = "I"c) Then
Return routeId
End If
routeId = currentLine.Substring(58)
If routeId.Length > minRouteIdLength AndAlso routeId(0) = "S"c Then
routeId = routeId.Substring(minRouteIdLength)
End If
Return routeId
End FunctionPrivate Function MakeLogLine(ByVal currentLine As String, Optional lineNumber As Integer = 0) As LogLine
currentLine = currentLine.Trim()
' Extracts route Id separately to keep track of the ones that have been tagged.'
Dim routeId As String = ExtractRouteId(currentLine)
If String.IsNullOrEmpty(routeId) Then
Me.linesWithRouteIds.Add(New rIdLines With {.line = line, .rid = routeId})
End If
Return New LogLine With {.lstr = currentLine,
.threadId = ExtractThreadId(currentLine),
.requestId = ExtractRequestId(currentLine),
.lineNum = lineNumber,
.sourceId = ExtractSourceId(currentLine),
.performanceId = ExtractPerformanceId(currentLine),
.routeId = routeId ,
.methodName = ExtractMethodName(currentLine),
.time = ExtractTime(currentLine)}
End FunctionContext
StackExchange Code Review Q#63659, answer score: 7
Revisions (0)
No revisions yet.