patterncMinor
Loading dictionary using trie in C very slow with large data set
Viewed 0 times
withslowlargeusingloadingtriedictionarydatasetvery
Problem
The code below works for very small dictionary files I used to test (two words). However when I run this using a more typical dictionary file, my machine slows to the point of near crash. I would like a review of what the potential bottleneck may be which I believe to be in the
```
typedef struct node
{
bool is_word;
struct node* children[27];
}
node;
node* root;
int nwords = 0;
/**
* Returns true if word is in dictionary else false.
*/
bool check(const char* word)
{
node* current = root;
int i = 0;
while(word[i] != '\0')
{
int letter = tolower(word[i]);
if (letter == '\'')
{
return false;
}
if (current->children[letter-'a'] != NULL)
{
current = current->children[letter-'a'];
i++;
}
else
{
return false;
}
}
if (current->is_word == true)
{
return true;
}
return false;
}
/**
* Loads dictionary into memory. Returns true if successful else false.
*/
bool load(const char* dictionary)
{
FILE* dict = fopen(dictionary, "r");
if(dict == false)
{
return false;
}
//initialize nodes
root = calloc(27,sizeof(node));
node* current = NULL;
int c = 0;
while(fgetc(dict) != EOF)
{
fseek(dict, -1, SEEK_CUR); //go back one byte to beginning of word
current = root; //set index to root
for(c = fgetc(dict); c != '\n'; c = fgetc(dict)) //iterate through word
{
if(current->children[c-'a'] == NULL)
{
current->children[c-'a'] = calloc(27,sizeof(node));
current = current->children[c-'a'];
}
else
{
current = current->children[c-'a'];
}
}
current->i
load() function. Of course, general comments are also welcomed if they may help with optimization and overall performance.```
typedef struct node
{
bool is_word;
struct node* children[27];
}
node;
node* root;
int nwords = 0;
/**
* Returns true if word is in dictionary else false.
*/
bool check(const char* word)
{
node* current = root;
int i = 0;
while(word[i] != '\0')
{
int letter = tolower(word[i]);
if (letter == '\'')
{
return false;
}
if (current->children[letter-'a'] != NULL)
{
current = current->children[letter-'a'];
i++;
}
else
{
return false;
}
}
if (current->is_word == true)
{
return true;
}
return false;
}
/**
* Loads dictionary into memory. Returns true if successful else false.
*/
bool load(const char* dictionary)
{
FILE* dict = fopen(dictionary, "r");
if(dict == false)
{
return false;
}
//initialize nodes
root = calloc(27,sizeof(node));
node* current = NULL;
int c = 0;
while(fgetc(dict) != EOF)
{
fseek(dict, -1, SEEK_CUR); //go back one byte to beginning of word
current = root; //set index to root
for(c = fgetc(dict); c != '\n'; c = fgetc(dict)) //iterate through word
{
if(current->children[c-'a'] == NULL)
{
current->children[c-'a'] = calloc(27,sizeof(node));
current = current->children[c-'a'];
}
else
{
current = current->children[c-'a'];
}
}
current->i
Solution
A few notes:
-
-
Initialize your variables on the lowest scope as possible. Therefore, declare your variables within your
Also, I feel as if you should be checking
-
The statements inside of your
No matter what, you are going to set
-
You can remove the
-
Avoid global variables:
-
Avoid magic numbers.
Use a
But where does the number 27 even come from? Documentation would go a long way here to help developers understand what it is.
-
Set your
-
typedef struct names are commonly written in PascalCase today.Node* current = NULL;-
Initialize your variables on the lowest scope as possible. Therefore, declare your variables within your
for loops. (C99)for(int c = fgetc(dict); c != '\n'; c = fgetc(dict)) //iterate through wordAlso, I feel as if you should be checking
c for EOF, and then check for \n within the for loop.-
The statements inside of your
for loop look fishy to me:if(current->children[c-'a'] == NULL)
{
current->children[c-'a'] = calloc(27,sizeof(node))
current = current->children[c-'a'];
}
else
{
current = current->children[c-'a'];
}No matter what, you are going to set
children equal to current->children[c-'a'], so you can move that statement out of both branches of the conditional (DRY). Also, you should not be allocating 27 elements for every single Node. Here is how I would write it:if (!current->children[c-'a']) current->children[c-'a'] = calloc(1, sizeof(node));
current = current->children[c-'a'];-
You can remove the
!= NULL part in your if conditional statements.-
Avoid global variables:
root and nwords.-
Avoid magic numbers.
for (int i = 0; i < 27; i++)Use a
#define instead at the top of your code.#define NUM_NODES 27But where does the number 27 even come from? Documentation would go a long way here to help developers understand what it is.
-
Set your
Node values to NULL after you free them. This is so that you do not try to access the data again after it has been freed.Code Snippets
Node* current = NULL;for(int c = fgetc(dict); c != '\n'; c = fgetc(dict)) //iterate through wordif(current->children[c-'a'] == NULL)
{
current->children[c-'a'] = calloc(27,sizeof(node))
current = current->children[c-'a'];
}
else
{
current = current->children[c-'a'];
}if (!current->children[c-'a']) current->children[c-'a'] = calloc(1, sizeof(node));
current = current->children[c-'a'];for (int i = 0; i < 27; i++)Context
StackExchange Code Review Q#58699, answer score: 8
Revisions (0)
No revisions yet.