patterncppMinor
Reading string from a text file and returning the number of occurrences of each substring of length k
Viewed 0 times
readingfilethenumberoccurrenceseachsubstringtextlengthreturning
Problem
This program takes a simple nucleotide sequence and finds the most common "k-mers" in the sequence, as determined by the supplied dataset (see below). The goal of the program is to find the origin of replication of a DNA sequence, as it is presumably the most common sequence, or in other words, the substring with the most occurrences.
```
#include
#include
#include
#include
#include
//-------------------------------------------
//FUNCTION PROTYPES
//Dataset files include k in them
//void chooseK();
//Open file and read sequence
void enterFileToOpen();
void openFile();
void readFile();
//Generate substrings and save them to file
void enterOFileName();
void openOFile();
void printSubstrToFile();
//Count substrings and determine modal substrings
//Step 1: Open substring output file
//Step 2: Read and aggregate substrings
void setOutPutReadFiletoOutputFile();
void openOutputFile();
void countSubStrings();
//---------------------------------
//VARIABLES
std::string sequence;
int k;
int kmerCount;
int i=0;
std::string subStrPrint;
std::ifstream inFile;
std::string fileName;
std::ofstream outFile;
std::string ofileName;
std::ifstream outputReadFile;
std::string outputReadFileName;
std::map subSequenceCountMap;
std::string stringMapElement;
//------------------------------------------
int main()
{
enterFileToOpen();
openFile();
readFile();
enterOFileName();
openOFile();
printSubstrToFile();
setOutPutReadFiletoOutputFile();
openOutputFile();
countSubStrings();
inFile.close();
outFile.close();
return 0;
}
void enterFileToOpen()
{
std::cout > fileName;
std::cout > sequence >> k;
std::cout > ofileName;
std::cout > stringMapElement;
if(subSequenceCountMap.find(stringMapElement)!=subSequenceCountMap.end())
{
subSequenceCountMap[stringMapElement]++;
}
else
{
subSequenceCountMap[stringMapElement]=1;
}
}
for (std::m
```
#include
#include
#include
#include
#include
//-------------------------------------------
//FUNCTION PROTYPES
//Dataset files include k in them
//void chooseK();
//Open file and read sequence
void enterFileToOpen();
void openFile();
void readFile();
//Generate substrings and save them to file
void enterOFileName();
void openOFile();
void printSubstrToFile();
//Count substrings and determine modal substrings
//Step 1: Open substring output file
//Step 2: Read and aggregate substrings
void setOutPutReadFiletoOutputFile();
void openOutputFile();
void countSubStrings();
//---------------------------------
//VARIABLES
std::string sequence;
int k;
int kmerCount;
int i=0;
std::string subStrPrint;
std::ifstream inFile;
std::string fileName;
std::ofstream outFile;
std::string ofileName;
std::ifstream outputReadFile;
std::string outputReadFileName;
std::map subSequenceCountMap;
std::string stringMapElement;
//------------------------------------------
int main()
{
enterFileToOpen();
openFile();
readFile();
enterOFileName();
openOFile();
printSubstrToFile();
setOutPutReadFiletoOutputFile();
openOutputFile();
countSubStrings();
inFile.close();
outFile.close();
return 0;
}
void enterFileToOpen()
{
std::cout > fileName;
std::cout > sequence >> k;
std::cout > ofileName;
std::cout > stringMapElement;
if(subSequenceCountMap.find(stringMapElement)!=subSequenceCountMap.end())
{
subSequenceCountMap[stringMapElement]++;
}
else
{
subSequenceCountMap[stringMapElement]=1;
}
}
for (std::m
Solution
Code Review
Replace:
With:
This is because
I don't see the point of using an intermediate file to store all the sub strings. Just put them dirextly into the map.
Replace:
With:
Not sure why you need this test:
Both sides of the
Ohh. I see it now. The extra
Design:
Don't use global variables.
Have local variables and pass them as parameters to functions and get results as a return value from the function.
Loading the whole gene into memory seems like it could be very expensive. Especially since you only need the
Error
You have an off by one error here:
Your code reports:
But if you manually count it. You will find that it happens 9 times.
Writting in C++ style
I would have used an object to encapsulate the gene fragment.
Replace:
if(subSequenceCountMap.find(stringMapElement)!=subSequenceCountMap.end())
{
subSequenceCountMap[stringMapElement]++;
}
else
{
subSequenceCountMap[stringMapElement]=1;
}With:
subSequenceCountMap[stringMapElement]++;This is because
operator[] will automatically insert the element 0 if it does not already exist before returning a reference. Thus if you just increment the value you get the correct result.I don't see the point of using an intermediate file to store all the sub strings. Just put them dirextly into the map.
Replace:
for(i=0; i<kmerCount; i++)
{
subStrPrint = sequence.substr(i,k);
outFile << subStrPrint << std::endl;
std::cout << "Iteration " << i+1 << ": sub-sequence " << subStrPrint << std::endl;
}With:
for(i=0; i<kmerCount; i++)
{
subSequenceCountMap[sequence.substr(i,k)]++;
}Not sure why you need this test:
if(itr->second >1)
{
std::cout first second first second << " time" << std::endl;
}Both sides of the
if statement are exactly the same. Remove all the ones that have only happened once by removing the whole else block!Ohh. I see it now. The extra
s on the end of time. Yea. If this were a million dollar project maybe I would go to the extra effort of pluralization. But I would do it simpler.std::cout first second second > 1)?"s":"")
<< "\n";Design:
Don't use global variables.
//---------------------------------
//VARIABLES
std::string sequence;
int k;
int kmerCount;
int i=0;
std::string subStrPrint;
std::ifstream inFile;
std::string fileName;
std::ofstream outFile;
std::string ofileName;
std::ifstream outputReadFile;
std::string outputReadFileName;
std::map subSequenceCountMap;
std::string stringMapElement;Have local variables and pass them as parameters to functions and get results as a return value from the function.
Loading the whole gene into memory seems like it could be very expensive. Especially since you only need the
k characters at a time. Why not load the first k characters (that's your first string). Then move everything down once then read one more character to the end.Error
You have an off by one error here:
kmerCount = sequence.length()-k;Your code reports:
TTGGAATCCCAA occurrs 8 timesBut if you manually count it. You will find that it happens 9 times.
Writting in C++ style
I would have used an object to encapsulate the gene fragment.
#include
#include
#include
#include
class Gene
{
std::istream& in;
std::string section;
public:
Gene(std::istream& in, std::size_t size)
: in(in)
{
section.resize(size);
in.read(§ion[0], size);
}
bool more()
{
return in.good();
}
void next()
{
std::move(std::begin(section) + 1, std::end(section), std::begin(section));
in.read(§ion.back(), 1);
}
operator std::string const&()
{
return section;
}
};
int main()
{
std::string fileName;
std::size_t size;
std::cout > fileName >> size))
{
std::cerr count;
for(Gene gene(file, size);gene.more();gene.next())
{
++count[gene];
}
for(auto const& item: count)
{
if (item.second > 1)
{
std::cout << item.first << " " << item.second << "\n";
}
}
}Code Snippets
if(subSequenceCountMap.find(stringMapElement)!=subSequenceCountMap.end())
{
subSequenceCountMap[stringMapElement]++;
}
else
{
subSequenceCountMap[stringMapElement]=1;
}subSequenceCountMap[stringMapElement]++;for(i=0; i<kmerCount; i++)
{
subStrPrint = sequence.substr(i,k);
outFile << subStrPrint << std::endl;
std::cout << "Iteration " << i+1 << ": sub-sequence " << subStrPrint << std::endl;
}for(i=0; i<kmerCount; i++)
{
subSequenceCountMap[sequence.substr(i,k)]++;
}if(itr->second >1)
{
std::cout << itr->first << " occurs " << itr->second << " times" << std::endl;
}
else
{
std::cout << itr->first << " occurs " << itr->second << " time" << std::endl;
}Context
StackExchange Code Review Q#148500, answer score: 6
Revisions (0)
No revisions yet.