patternjavaMinor
Creating large CSV file
Viewed 0 times
creatingfilecsvlarge
Problem
I have a performance problem when trying to create a csv file starting from another csv file.
This is how the original file looks:
Potentially there could be millions of lines like this; I detected the problem with 1.280.000 lines.
This is the algorithm:
```
// csv_path is defined outside this excerpt.
File csvInputFile = new File(csv_path);
// Buffer size handed to the BufferedReader below.
int blockSize = 409600;
// brCsvInputFile and frCsvInputFile are declared outside this excerpt.
brCsvInputFile = new BufferedReader(frCsvInputFile, blockSize);
String line = null;
// One builder is passed to createIntermediateStringBuffer for every row,
// so it keeps growing for as long as input lines are read.
StringBuilder sbIntermediate = new StringBuilder();
// Reads the header row and builds the column indexes from it.
skipFirstLine(brCsvInputFile);
// Process the remaining rows one at a time until readLine() reports end of input.
while ((line = brCsvInputFile.readLine()) != null) {
createIntermediateStringBuffer(sbIntermediate, line.split(REGEX_COMMA));
}
/**
 * Reads the header row from {@code br} and builds the column indexes from it.
 *
 * <p>Despite the name, the row is not discarded: it is split on
 * {@code REGEX_COMMA} and handed to {@code createIndex} and
 * {@code createIntermediateIndex}.
 *
 * @param br reader to take the header row from
 * @throws IOException if reading fails or the input contains no header row
 */
private static void skipFirstLine(BufferedReader br) throws IOException {
    String headerLine = br.readLine();
    if (headerLine == null) {
        // readLine() returns null at end of stream; report the empty input
        // explicitly instead of failing with a NullPointerException in split().
        throw new IOException("CSV input is empty: no header line found");
    }
    String[] headerColumns = headerLine.split(REGEX_COMMA);
    LOGGER.debug("First line detected! ");
    createIndex(headerColumns);
    createIntermediateIndex(headerColumns);
}
/**
 * Stores the position of every header column in {@code headerIndex}, keyed by
 * column name, and then hands the map to {@code printMap}.
 *
 * @param splitLine header row already split into column names
 */
private static void createIndex(String[] splitLine) {
    LOGGER.debug("START method createIndex.");
    int column = 0;
    for (String columnName : splitLine) {
        headerIndex.put(columnName, column);
        column++;
    }
    printMap(headerIndex);
    LOGGER.debug("COMPLETED method createIndex.");
}
private static void createIntermediateIndex(String[] splitLine) {
LOGGER.debug("START method createIntermediateIndex.");
com.tekcomms.c2d.xml.model.v2.Metadata_element[] metadata_element = null;
String[] servicePath = newTopology.getElement().getEntity().getService_path().getLevel();
This is how the original file looks:
country,state,co,olt,olu,splitter,ont,cpe,cpe.latitude,cpe.longitude,cpe.customer_class,cpe.phone,cpe.ip,cpe.subscriber_id
COUNTRY-0001,STATE-0001,CO-0001,OLT-0001,OLU0001,SPLITTER-0001,ONT-0001,CPE-0001,28.21487,77.451775,ALL,SIP:+674100002743@IMS.COMCAST.NET,SIP:E28EDADA06B2@IMS.COMCAST.NET,CPE_SUBSCRIBER_ID-QHLHW4
COUNTRY-0001,STATE-0002,CO-0002,OLT-0002,OLU0002,SPLITTER-0002,ONT-0002,CPE-0002,28.294018,77.068924,ALL,SIP:+796107443092@IMS.COMCAST.NET,SIP:58DD999D6466@IMS.COMCAST.NET,CPE_SUBSCRIBER_ID-AH8NJQ
Potentially there could be millions of lines like this; I detected the problem with 1.280.000 lines.
This is the algorithm:
```
// csv_path is defined outside this excerpt.
File csvInputFile = new File(csv_path);
// Buffer size handed to the BufferedReader below.
int blockSize = 409600;
// brCsvInputFile and frCsvInputFile are declared outside this excerpt.
brCsvInputFile = new BufferedReader(frCsvInputFile, blockSize);
String line = null;
// One builder is passed to createIntermediateStringBuffer for every row,
// so it keeps growing for as long as input lines are read.
StringBuilder sbIntermediate = new StringBuilder();
// Reads the header row and builds the column indexes from it.
skipFirstLine(brCsvInputFile);
// Process the remaining rows one at a time until readLine() reports end of input.
while ((line = brCsvInputFile.readLine()) != null) {
createIntermediateStringBuffer(sbIntermediate, line.split(REGEX_COMMA));
}
/**
 * Reads the header row from {@code br} and builds the column indexes from it.
 *
 * <p>Despite the name, the row is not discarded: it is split on
 * {@code REGEX_COMMA} and handed to {@code createIndex} and
 * {@code createIntermediateIndex}.
 *
 * @param br reader to take the header row from
 * @throws IOException if reading fails or the input contains no header row
 */
private static void skipFirstLine(BufferedReader br) throws IOException {
    String headerLine = br.readLine();
    if (headerLine == null) {
        // readLine() returns null at end of stream; report the empty input
        // explicitly instead of failing with a NullPointerException in split().
        throw new IOException("CSV input is empty: no header line found");
    }
    String[] headerColumns = headerLine.split(REGEX_COMMA);
    LOGGER.debug("First line detected! ");
    createIndex(headerColumns);
    createIntermediateIndex(headerColumns);
}
/**
 * Stores the position of every header column in {@code headerIndex}, keyed by
 * column name, and then hands the map to {@code printMap}.
 *
 * @param splitLine header row already split into column names
 */
private static void createIndex(String[] splitLine) {
    LOGGER.debug("START method createIndex.");
    int column = 0;
    for (String columnName : splitLine) {
        headerIndex.put(columnName, column);
        column++;
    }
    printMap(headerIndex);
    LOGGER.debug("COMPLETED method createIndex.");
}
private static void createIntermediateIndex(String[] splitLine) {
LOGGER.debug("START method createIntermediateIndex.");
com.tekcomms.c2d.xml.model.v2.Metadata_element[] metadata_element = null;
String[] servicePath = newTopology.getElement().getEntity().getService_path().getLevel();
Solution
In addition to @rolfl's answer
To soften the effect mentioned by rolfl, you could use the
Also by calling often the
You say this loop iterates over 8 elements
Hence the
This loop could use some facelifting by removing the Logger stuff and also simplifying the logic.
Then you have this loop also
at least it is called once more if
Also this could really need a refactoring.
Let us add braces
As the content of the StringBuilder is later written to a file, we can just return a
And the whole method refactored by extracting the code duplication to a separate method
```
private static String createIntermediateStringBuffer (String[] splitLine) throws ClassCastException,
NullPointerException {
StringBuilder sbIntermediate = new StringBuilder(1024);
ArrayList hashes = new ArrayList();
com.tekcomms.c2d.xml.model.v2.Metadata_element[] metadata_element = null;
String[] servicePath = newTopology.getElement().getEntity().getService_path().getLevel();
for (int i = 0; i position) {
String name = splitLine[position];
hashes.add(name);
sbIntermediate
.append(name)
.append(REGEX_COMMA)
.append(HashUtils.calculateHash(hashes))
.append(REGEX_COMMA);
}
}
String labelLatitude = newTopology.getElement().getEntity().getLatitude();
addByValue(sbIntermediate, splitLine, labelLatitude);
String labelLongitude = newTopology.getElement().getEntity().getLongitude();
addByValue(sbIntermediate, splitLine, labelLongitude);
String labelCustomerClass = newTopology.getElement().getCustomer_class();
addByValue(labelCustomerClass, splitLine, labelLongitude);
if (newTopology.getElement().getMetadata() != null) {
metadata_element = newTopology.getElement().getMetadata().getMetadata_element();
for (Metadata_element element:metadata_element) {
String label = metadata_element[j].getLabel();
String actualValue = "";
if (splitLine.length > getPositionFromIndex(label)) {
actualValue = splitLine[getPositionFromIndex(label)];
}
sbIntermediate.append("").append(REGEX_COMMA);
}
}
sbIntermediate.append("\n");
return sbIntermediate.toString();
}
private stati
To soften the effect mentioned by rolfl, you could also use the
blockSize when constructing the StringBuilder, like StringBuilder sbIntermediate = new StringBuilder(blockSize);
Also, calling
.toString() on the StringBuilder often inside the createIntermediateStringBuffer() method will slow things down as the StringBuilder's content grows. You say this loop iterates over 8 elements
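for (int i = 0; i < servicePath.length; i++) {
String level = servicePath[i];
LOGGER.debug("level is: " + level);
if (splitLine.length > getPositionFromIndex(level)) {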
String name = splitLine[getPositionFromIndex(level)];
sbIntermediate.append(name);
hashes.add(name);
sbIntermediate.append(REGEX_COMMA).append(HashUtils.calculateHash(hashes)).append(REGEX_COMMA);
LOGGER.debug(" ==sbIntermediate: " + sbIntermediate.toString());
}
}
Hence
sbIntermediate.toString() is also called 8 times. The toString() method of the StringBuilder calls this constructor of the String class. While "only" doing an array copy, doing this 8 times for a large char[] will slow the process. This loop could use some facelifting by removing the Logger stuff and also simplifying the logic.
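for (int i = 0; i < servicePath.length; i++) {
int position = getPositionFromIndex(servicePath[i]);
if (splitLine.length > position ) {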
String name = splitLine[position];
hashes.add(name);
sbIntermediate
.append(name)
.append(REGEX_COMMA)
.append(HashUtils.calculateHash(hashes))
.append(REGEX_COMMA);
}
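}
Then you have this loop as well
if (metadata_element != null && metadata_element.length != 0)
for (int j = 0; j < metadata_element.length; j++) {
String label = metadata_element[j].getLabel();
LOGGER.debug(" ==label: " + label + " index_pos: " + j);
if (splitLine.length > getPositionFromIndex(label)) {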
String actualValue = splitLine[getPositionFromIndex(label)];
if (!"".equals(actualValue))
sbIntermediate.append(actualValue).append(REGEX_COMMA);
else
sbIntermediate.append("").append(REGEX_COMMA);
} else
sbIntermediate.append("").append(REGEX_COMMA);
LOGGER.debug(" ==sbIntermediate: " + sbIntermediate.toString());
}//for
Here toString() is called at least once more if
metadata_element.length > 0. This could also really use a refactoring.
Let us add braces
{} for single if statements, remove the logger stuff and reduce the code itself.
if (metadata_element != null && metadata_element.length != 0) {
for (int j = 0; j < metadata_element.length; j++) {
    String label = metadata_element[j].getLabel();
    // Look the column position up once instead of twice per element.
    int position = getPositionFromIndex(label);
    String actualValue = "";
    if (splitLine.length > position) {
        actualValue = splitLine[position];
    }
    // Append the looked-up value (empty when the row has no such column),
    // not a constant empty string.
    sbIntermediate.append(actualValue).append(REGEX_COMMA);
}
}
As the content of the StringBuilder is later written to a file, we can just return a
String from createIntermediateStringBuffer, which we then write to the file.
File csvInputFile = new File(csv_path);
int blockSize = 409600;
brCsvInputFile = new BufferedReader(frCsvInputFile, blockSize);
Writer writer = new BufferedWriter(new OutputStreamWriter(
new FileOutputStream(outputFileName), "utf-8"));
String line = null;
skipFirstLine(brCsvInputFile);
while ((line = brCsvInputFile.readLine()) != null) {
writer.write(createIntermediateStringBuffer(line.split(REGEX_COMMA)));
}
writer.close();
And the whole method refactored by extracting the code duplication to a separate method
```
private static String createIntermediateStringBuffer (String[] splitLine) throws ClassCastException,
NullPointerException {
StringBuilder sbIntermediate = new StringBuilder(1024);
ArrayList hashes = new ArrayList();
com.tekcomms.c2d.xml.model.v2.Metadata_element[] metadata_element = null;
String[] servicePath = newTopology.getElement().getEntity().getService_path().getLevel();
for (int i = 0; i position) {
String name = splitLine[position];
hashes.add(name);
sbIntermediate
.append(name)
.append(REGEX_COMMA)
.append(HashUtils.calculateHash(hashes))
.append(REGEX_COMMA);
}
}
String labelLatitude = newTopology.getElement().getEntity().getLatitude();
addByValue(sbIntermediate, splitLine, labelLatitude);
String labelLongitude = newTopology.getElement().getEntity().getLongitude();
addByValue(sbIntermediate, splitLine, labelLongitude);
String labelCustomerClass = newTopology.getElement().getCustomer_class();
addByValue(labelCustomerClass, splitLine, labelLongitude);
if (newTopology.getElement().getMetadata() != null) {
metadata_element = newTopology.getElement().getMetadata().getMetadata_element();
for (Metadata_element element:metadata_element) {
String label = metadata_element[j].getLabel();
String actualValue = "";
if (splitLine.length > getPositionFromIndex(label)) {
actualValue = splitLine[getPositionFromIndex(label)];
}
sbIntermediate.append("").append(REGEX_COMMA);
}
}
sbIntermediate.append("\n");
return sbIntermediate.toString();
}
private stati
Code Snippets
StringBuilder sbIntermediate = new StringBuilder(blockSize);
for (int i = 0; i < servicePath.length; i++) {
String level = servicePath[i];
LOGGER.debug("level is: " + level);
if (splitLine.length > getPositionFromIndex(level)) {
String name = splitLine[getPositionFromIndex(level)];
sbIntermediate.append(name);
hashes.add(name);
sbIntermediate.append(REGEX_COMMA).append(HashUtils.calculateHash(hashes)).append(REGEX_COMMA);
LOGGER.debug(" ==sbIntermediate: " + sbIntermediate.toString());
}
}
for (int i = 0; i < servicePath.length; i++) {
int position = getPositionFromIndex(servicePath[i]);
if (splitLine.length > position ) {
String name = splitLine[position];
hashes.add(name);
sbIntermediate
.append(name)
.append(REGEX_COMMA)
.append(HashUtils.calculateHash(hashes))
.append(REGEX_COMMA);
}
}
if (metadata_element != null && metadata_element.length != 0)
for (int j = 0; j < metadata_element.length; j++) {
String label = metadata_element[j].getLabel();
LOGGER.debug(" ==label: " + label + " index_pos: " + j);
if (splitLine.length > getPositionFromIndex(label)) {
String actualValue = splitLine[getPositionFromIndex(label)];
if (!"".equals(actualValue))
sbIntermediate.append(actualValue).append(REGEX_COMMA);
else
sbIntermediate.append("").append(REGEX_COMMA);
} else
sbIntermediate.append("").append(REGEX_COMMA);
LOGGER.debug(" ==sbIntermediate: " + sbIntermediate.toString());
}//for
if (metadata_element != null && metadata_element.length != 0) {
for (int j = 0; j < metadata_element.length; j++) {
    String label = metadata_element[j].getLabel();
    // Look the column position up once instead of twice per element.
    int position = getPositionFromIndex(label);
    String actualValue = "";
    if (splitLine.length > position) {
        actualValue = splitLine[position];
    }
    // Append the looked-up value (empty when the row has no such column),
    // not a constant empty string.
    sbIntermediate.append(actualValue).append(REGEX_COMMA);
}
}
Context
StackExchange Code Review Q#71650, answer score: 3
Revisions (0)
No revisions yet.