patterncsharpMinor
Char Trie and scanner in one
Viewed 0 times
charonescannertrieand
Problem
This is from the Resin codebase. It is fast at lookup but I could use some help making it faster at buildup. Loaded with 210K words, it looks up pretty much any node and its descendants in zero time. Buildup takes 3000 ms.
```
using System;
using System.Collections.Generic;
using System.IO;
namespace Resin
{
public class UseTrie
{
public void Main()
{
var words = new[]{"pre", "prefix"};
var trie = new Trie(words);
// Print "pre" and "prefix"
foreach(var word in trie.GetTokens("pr"))
{
Console.WriteLine(word);
}
}
}
public class Trie
{
public char Value { get; set; }
public bool Eow { get; set; }
public IDictionary Children { get; set; }
public bool Root { get; set; }
public Trie(bool isRoot)
{
Root = isRoot;
Children = new Dictionary();
}
public Trie(IList words)
{
if (words == null) throw new ArgumentNullException("words");
Root = true;
Children = new Dictionary();
foreach (var word in words)
{
AppendToDescendants(word);
}
}
public Trie(string text)
{
if (string.IsNullOrWhiteSpace(text))
{
throw new ArgumentException("text");
}
Value = text[0];
Children = new Dictionary();
if (text.Length > 1)
{
var overflow = text.Substring(1);
if (overflow.Length > 0)
{
AppendToDescendants(overflow);
}
}
else
{
Eow = true;
}
}
public IEnumerable GetTokens(string prefix)
{
var words = new List();
Trie child;
if (Children.TryGetVal
```
using System;
using System.Collections.Generic;
using System.IO;
namespace Resin
{
public class UseTrie
{
public void Main()
{
var words = new[]{"pre", "prefix"};
var trie = new Trie(words);
// Print "pre" and "prefix"
foreach(var word in trie.GetTokens("pr"))
{
Console.WriteLine(word);
}
}
}
public class Trie
{
public char Value { get; set; }
public bool Eow { get; set; }
public IDictionary Children { get; set; }
public bool Root { get; set; }
public Trie(bool isRoot)
{
Root = isRoot;
Children = new Dictionary();
}
public Trie(IList words)
{
if (words == null) throw new ArgumentNullException("words");
Root = true;
Children = new Dictionary();
foreach (var word in words)
{
AppendToDescendants(word);
}
}
public Trie(string text)
{
if (string.IsNullOrWhiteSpace(text))
{
throw new ArgumentException("text");
}
Value = text[0];
Children = new Dictionary();
if (text.Length > 1)
{
var overflow = text.Substring(1);
if (overflow.Length > 0)
{
AppendToDescendants(overflow);
}
}
else
{
Eow = true;
}
}
public IEnumerable GetTokens(string prefix)
{
var words = new List();
Trie child;
if (Children.TryGetVal
Solution
single-expression
I do not know if there is an official stance on formatting, but I find placing the
This is especially true if the boolean expression is long.
The safest formatting is to put the expression on the next line and enclose it in braces, even if it is a single line:
Personally, I err on the side of the above, but I generally find it acceptable to omit the braces when the expression leaves the current block (e.g.,
encapsulation
Most of the members in
With those use cases in mind, none of the properties need to be public, and in fact, none of them need to be properties. Instead, I would change them to private fields. By convention, you should then rename them to _camelCase (e.g.,
Also, since a caller is only going to be interested in the root, the third constructor should be private.
Additionally, the
This also allows you to remove the root flag altogether, since it was only there to prevent invalid access of
None of the properties (now fields) are modified outside a constructor, and given the behavior of
constructor chaining
The three constructors repeat assigning the children dictionary. A better way to do this is with constructor chaining. Move the dictionary assignment to a single constructor and call it from the other constructors:
With the addition of the
becomes
The benefit is that if you ever rename text, your refactor tools will update the second example, while commonly, they will miss the string in the first example.
As a general rule, I seal classes unless it has been specifically designed for extension. I could go into explanations why, but smarter people than I already have.
Final code:
```
public sealed class Trie
{
private readonly char _value;
private readonly bool _eow;
private readonly IDictionary _children;
public Trie()
{
_children = new Dictionary();
}
public Trie(IEnumerable words)
: this()
{
if (words == null)
throw new ArgumentNullException(nameof(words));
foreach (var word in words)
{
AppendToDescendants(word);
}
}
private Trie(string text)
: this()
{
if (string.IsNullOrWhiteSpace(text))
throw new ArgumentException(nameof(text));
_value = text[0];
if (text.Length > 1)
{
var overflow = text.Substring(1);
if (overflow.Length > 0)
{
AppendToDescendants(overflow);
}
}
else
{
_eow = true;
}
}
public IEnumerable GetTokens(string prefix)
{
var words = new List();
Trie child;
if (_children.TryGetValue(prefix[0], out child))
{
child.Scan(prefix, prefix, ref words);
}
return words;
}
private void Scan(string originalPrefix, string prefix, ref List words)
{
if (string.IsNullOrWhiteSpace(prefix))
throw new ArgumentException(nameof(prefix));
if (prefix.Length == 1 && prefix[0] == _value)
{
// The scan has reached its destination. Find words derived from this node.
if (_eow)
words.Add(originalPrefix);
foreach (var node in _children.Values)
{
node.Scan(
if statementsI do not know if there is an official stance on formatting, but I find placing the
if and the expression it contains side-by-side is difficult to read in most cases. It is also generally non-standard, and invites buggy code like the following:if(is true) do stuff;
do other stuff;This is especially true if the boolean expression is long.
The safest formatting is to put the expression on the next line and enclose it in braces, even if it is a single line:
if(is true)
{
do stuff;
}Personally, I err on the side of the above, but I generally find it acceptable to omit the braces when the expression leaves the current block (e.g.,
return, throw, continue, break):if(is true)
throw new SomeSpecificException("badness happened");encapsulation
Most of the members in
Trie are currently public. Many do not need to be. The general interaction with the trie is going to be creating it, adding words to it (currently part of the creation step), and fetching data from it.With those use cases in mind, none of the properties need to be public, and in fact, none of them need to be properties. Instead, I would change them to private fields. By convention, you should then rename them to _camelCase (e.g.,
_children).Also, since a caller is only going to be interested in the root, the third constructor should be private.
Additionally, the
Append method is dangerous to call on the root, but the above use cases dictate that callers only care about dealing with the root. Since all the children of Trie happen to also be Trie objects, you can make it private.This also allows you to remove the root flag altogether, since it was only there to prevent invalid access of
Append, and callers can no longer directly access children.readonly for great justiceNone of the properties (now fields) are modified outside a constructor, and given the behavior of
Trie, none of them should change. As such, it is best to make them explicitly read-only:private readonly char _value;
private readonly bool _eow;
private readonly IDictionary _children;constructor chaining
The three constructors repeat assigning the children dictionary. A better way to do this is with constructor chaining. Move the dictionary assignment to a single constructor and call it from the other constructors:
public Trie()
{
_children = new Dictionary();
}
public Trie(IEnumerable words)
: this()
{
...
}
public Trie(string text)
: this()
{
...
}nameof all the thingsWith the addition of the
nameof operator in C#6, I use it wherever possible when referencing the name of a member. In Trie, this is useful for all your ArgumentExceptions:if (string.IsNullOrWhiteSpace(text))
throw new ArgumentException("text");becomes
if (string.IsNullOrWhiteSpace(text))
throw new ArgumentException(nameof(text));The benefit is that if you ever rename text, your refactor tools will update the second example, while commonly, they will miss the string in the first example.
sealed by defaultAs a general rule, I seal classes unless it has been specifically designed for extension. I could go into explanations why, but smarter people than I already have.
Final code:
```
public sealed class Trie
{
private readonly char _value;
private readonly bool _eow;
private readonly IDictionary _children;
public Trie()
{
_children = new Dictionary();
}
public Trie(IEnumerable words)
: this()
{
if (words == null)
throw new ArgumentNullException(nameof(words));
foreach (var word in words)
{
AppendToDescendants(word);
}
}
private Trie(string text)
: this()
{
if (string.IsNullOrWhiteSpace(text))
throw new ArgumentException(nameof(text));
_value = text[0];
if (text.Length > 1)
{
var overflow = text.Substring(1);
if (overflow.Length > 0)
{
AppendToDescendants(overflow);
}
}
else
{
_eow = true;
}
}
public IEnumerable GetTokens(string prefix)
{
var words = new List();
Trie child;
if (_children.TryGetValue(prefix[0], out child))
{
child.Scan(prefix, prefix, ref words);
}
return words;
}
private void Scan(string originalPrefix, string prefix, ref List words)
{
if (string.IsNullOrWhiteSpace(prefix))
throw new ArgumentException(nameof(prefix));
if (prefix.Length == 1 && prefix[0] == _value)
{
// The scan has reached its destination. Find words derived from this node.
if (_eow)
words.Add(originalPrefix);
foreach (var node in _children.Values)
{
node.Scan(
Code Snippets
if(is true) do stuff;
do other stuff;if(is true)
{
do stuff;
}if(is true)
throw new SomeSpecificException("badness happened");private readonly char _value;
private readonly bool _eow;
private readonly IDictionary<char, Trie> _children;public Trie()
{
_children = new Dictionary<char, Trie>();
}
public Trie(IEnumerable<string> words)
: this()
{
...
}
public Trie(string text)
: this()
{
...
}Context
StackExchange Code Review Q#123674, answer score: 4
Revisions (0)
No revisions yet.