patterncsharpMinor
Finding the most frequent character in a string and repeat this for millions of strings quickly
Viewed 0 times
thisquicklytherepeatmillionscharacterstringsforfindingand
Problem
I need to iterate through a couple million essays that are about 500 words long and for every essay & I have to return the character is repeated the most.
I have the following code but to iterate through 1 million essay (I have the same essay hard coded). It takes almost three minutes before it has a collection of all the popular characters from each essay.
How can I optimize this to be faster? If you have another method besides my
I have the following code but to iterate through 1 million essay (I have the same essay hard coded). It takes almost three minutes before it has a collection of all the popular characters from each essay.
How can I optimize this to be faster? If you have another method besides my
GetMostFrequentCharacter(), please suggest something w/ sample.class Program
{
static void Main(string[] args)
{
var popularCharacters = new List();
var EssayCount = 0;
var start = DateTime.Now;
const int noOfEssays = 1000000;
Console.WriteLine("Looking for most popular characters in {0} articles: ", noOfEssays);
// Gen 1,000,000 essays
for (int i = 0; i 1)
{
Console.WriteLine("Essay {0}: Most popular chr & equal in freq: ", EssayCount);
foreach (char c in popularCharacter)
Console.Write(c + ", ");
}
else
{
Console.WriteLine();
Console.Write("Essay {0}: Most frequent chr: ", EssayCount);
Console.WriteLine(popularCharacter);
}
}
Console.WriteLine();
}
}
static class Utilities
{
public static IEnumerable GetMostFrequentCharacter(this IEnumerable input)
{
var dict = input.ToLookup(x => x);
if (dict.Count == 0)
return Enumerable.Empty();
var maxCount = dict.Max(x => x.Count());
return dict.Where(x => x.Count() == maxCount).Select(x => x.Key);
}
}Solution
Some thoughts
After considering these your method looks like
If it is possible to limt the charset to
This is about 4 times faster than your solution.
Timed on my computer for 1.000.000 calls
Yours: 90 seconds
This: 24 seconds
- The method should not be generic because the methods name
GetMostFrequentCharacterimplies that it returns some kind ofChar
- The method should be renamed to
GetMostFrequentCharactersas it returns anIEnumerable.
- The variable
dictshould be renamed tolookup
- The check if
inputcontains any elements should be done before the call ofToLookup()
After considering these your method looks like
public static IEnumerable GetMostFrequentCharacters(this IEnumerable input)
{
if (!input.Any()) { return Enumerable.Empty(); }
var lookup = input.ToLookup(x => x);
var maxCount = lookup.Max(x => x.Count());
return lookup.Where(x => x.Count() == maxCount).Select(x => x.Key);
}If it is possible to limt the charset to
ASCII you could gain big speed improvement by using raporttech97 idea from the comments public static IEnumerable GetMostFrequentCharacters(this IEnumerable input)
{
if (!input.Any()) { return Enumerable.Empty(); }
const int maxChar = 128;
int[] histogram = new int[maxChar];
foreach (char c in input)
{
if (c > maxChar) { continue; }
histogram[c] = histogram[c] + 1;
}
IList mostFrequentChars = new List();
int max = int.MinValue;
for (int i = 33; i max)
{
max = histogram[i];
mostFrequentChars.Clear();
mostFrequentChars.Add((char)i);
}
else if (histogram[i] == max)
{
mostFrequentChars.Add((char)i);
}
}
return mostFrequentChars;
}This is about 4 times faster than your solution.
Timed on my computer for 1.000.000 calls
Yours: 90 seconds
This: 24 seconds
Code Snippets
public static IEnumerable<Char> GetMostFrequentCharacters(this IEnumerable<Char> input)
{
if (!input.Any()) { return Enumerable.Empty<Char>(); }
var lookup = input.ToLookup(x => x);
var maxCount = lookup.Max(x => x.Count<Char>());
return lookup.Where(x => x.Count<Char>() == maxCount).Select(x => x.Key);
}public static IEnumerable<Char> GetMostFrequentCharacters(this IEnumerable<Char> input)
{
if (!input.Any()) { return Enumerable.Empty<Char>(); }
const int maxChar = 128;
int[] histogram = new int[maxChar];
foreach (char c in input)
{
if (c > maxChar) { continue; }
histogram[c] = histogram[c] + 1;
}
IList<Char> mostFrequentChars = new List<Char>();
int max = int.MinValue;
for (int i = 33; i < histogram.Length; i++)
{
if (histogram[i] > max)
{
max = histogram[i];
mostFrequentChars.Clear();
mostFrequentChars.Add((char)i);
}
else if (histogram[i] == max)
{
mostFrequentChars.Add((char)i);
}
}
return mostFrequentChars;
}Context
StackExchange Code Review Q#69296, answer score: 3
Revisions (0)
No revisions yet.