patternjavaMajor
Fast way of searching for a string in a text file
Viewed 0 times
fastfilesearchingtextwayforstring
Problem
I have written this code to search for a string in a
I don't want to use regex.
.txt file. Is it possible to optimize the code so that it searches for the string in fastest manner possible? Assuming the text file would be a large one (500MB-1GB)I don't want to use regex.
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
public class StringFinder {
public static void main(String[] args)
{
double count = 0,countBuffer=0,countLine=0;
String lineNumber = "";
String filePath = "C:\\Users\\allen\\Desktop\\TestText.txt";
BufferedReader br;
String inputSearch = "are";
String line = "";
try {
br = new BufferedReader(new FileReader(filePath));
try {
while((line = br.readLine()) != null)
{
countLine++;
//System.out.println(line);
String[] words = line.split(" ");
for (String word : words) {
if (word.equals(inputSearch)) {
count++;
countBuffer++;
}
}
if(countBuffer > 0)
{
countBuffer = 0;
lineNumber += countLine + ",";
}
}
br.close();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
System.out.println("Times found at--"+count);
System.out.println("Word found at--"+lineNumber);
}
}Solution
Fast comes at a price.... code complexity and perhaps readability.
Assuming that your code produces the right results now.... and it is a big assumption because:
OK, a much faster way (keeping it as Java), is to do the following:
If the file is larger than your memory, you will have to re-position the byte-buffer occasionally. I recommend using a emopry-mapped size of about 4MB plus the size of the search-string. That way you can search the 4MB window, and then start the next window at the next 4mb boundary.
Once you get in to it, it will make sense.
This system will be fast because you will never have to copy the file's data in to Java. Everything will actually happen in the native side of things.
There is a lot to read to make it work.
I would start with a tutorial....
of course, if you want really fast, use grep.
Here is some example code that may get you started:
Assuming that your code produces the right results now.... and it is a big assumption because:
- it expects the word to be at the beginning/end of the line, or surrounded by spaces (not commas, punctuation, etc.)
- it does not look for the word inside another string, it will match 'are', but not 'bare'.
OK, a much faster way (keeping it as Java), is to do the following:
- Convert the search string ('are') to a byte-array in the same encoding as the file.
- Open a memory-mapped byte-buffer from a File-Channel on the file.
- Scan the ByteBuffer, looking for matches to the search byte-array
- count the newlines as you go.
- close the ByteBuffer
If the file is larger than your memory, you will have to re-position the byte-buffer occasionally. I recommend using a emopry-mapped size of about 4MB plus the size of the search-string. That way you can search the 4MB window, and then start the next window at the next 4mb boundary.
Once you get in to it, it will make sense.
This system will be fast because you will never have to copy the file's data in to Java. Everything will actually happen in the native side of things.
There is a lot to read to make it work.
I would start with a tutorial....
- StudyTrails
- Yaldix
- LinuxTopia
of course, if you want really fast, use grep.
Here is some example code that may get you started:
import java.io.IOException;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.FileChannel.MapMode;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
public class NIOGrep {
public static void main(String[] args) throws IOException {
if (args.length != 2) {
throw new IllegalArgumentException();
}
String grepfor = args[0];
Path path = Paths.get(args[1]);
String report = searchFor(grepfor, path);
System.out.println(report);
}
private static final int MAPSIZE = 4 * 1024 ; // 4K - make this * 1024 to 4MB in a real system.
private static String searchFor(String grepfor, Path path) throws IOException {
final byte[] tosearch = grepfor.getBytes(StandardCharsets.UTF_8);
StringBuilder report = new StringBuilder();
int padding = 1; // need to scan 1 character ahead in case it is a word boundary.
int linecount = 0;
int matches = 0;
boolean inword = false;
boolean scantolineend = false;
try (FileChannel channel = FileChannel.open(path, StandardOpenOption.READ)) {
final long length = channel.size();
int pos = 0;
while (pos 0) {
report.append(", ");
}
report.append(linecount);
scantolineend = true;
} else {
inword = true;
}
}
}
}
}
return "Times found at--" + matches + "\nWord found at--" + report;
}
private static boolean wordMatch(MappedByteBuffer buffer, int pos, int tomap, byte[] tosearch) {
//assume at valid word start.
for (int i = 0; i < tosearch.length; i++) {
if (tosearch[i] != buffer.get(pos + i)) {
return false;
}
}
byte nxt = (pos + tosearch.length) == tomap ? (byte)' ' : buffer.get(pos + tosearch.length);
return nxt == ' ' || nxt == '\n' || nxt == '\r';
}
}Code Snippets
import java.io.IOException;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.FileChannel.MapMode;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
public class NIOGrep {
public static void main(String[] args) throws IOException {
if (args.length != 2) {
throw new IllegalArgumentException();
}
String grepfor = args[0];
Path path = Paths.get(args[1]);
String report = searchFor(grepfor, path);
System.out.println(report);
}
private static final int MAPSIZE = 4 * 1024 ; // 4K - make this * 1024 to 4MB in a real system.
private static String searchFor(String grepfor, Path path) throws IOException {
final byte[] tosearch = grepfor.getBytes(StandardCharsets.UTF_8);
StringBuilder report = new StringBuilder();
int padding = 1; // need to scan 1 character ahead in case it is a word boundary.
int linecount = 0;
int matches = 0;
boolean inword = false;
boolean scantolineend = false;
try (FileChannel channel = FileChannel.open(path, StandardOpenOption.READ)) {
final long length = channel.size();
int pos = 0;
while (pos < length) {
long remaining = length - pos;
// int conversion is safe because of a safe MAPSIZE.. Assume a reaosnably sized tosearch.
int trymap = MAPSIZE + tosearch.length + padding;
int tomap = (int)Math.min(trymap, remaining);
// different limits depending on whether we are the last mapped segment.
int limit = trymap == tomap ? MAPSIZE : (tomap - tosearch.length);
MappedByteBuffer buffer = channel.map(MapMode.READ_ONLY, pos, tomap);
System.out.println("Mapped from " + pos + " for " + tomap);
pos += (trymap == tomap) ? MAPSIZE : tomap;
for (int i = 0; i < limit; i++) {
final byte b = buffer.get(i);
if (scantolineend) {
if (b == '\n') {
scantolineend = false;
inword = false;
linecount ++;
}
} else if (b == '\n') {
linecount++;
inword = false;
} else if (b == '\r' || b == ' ') {
inword = false;
} else if (!inword) {
if (wordMatch(buffer, i, tomap, tosearch)) {
matches++;
i += tosearch.length - 1;
if (report.length() > 0) {
report.append(", ");
}
report.append(linecount);
Context
StackExchange Code Review Q#44021, answer score: 22
Revisions (0)
No revisions yet.