patterncsharpMinor
InfixDictionary: Data structure for Infix string lookup
Viewed 0 times
infixforstructurelookupinfixdictionarydatastring
Problem
I needed a data structure to quickly find all previously inserted values, that have the given string as key or substring (full text search).
At first, I tried out some tree structures (infix trie/tree) and was shocked by the slow insertion time and memory usage.
That's when I developed the
I previously used something like it at work and it worked flawlessly, now I rewrote it at home and wanted to get it reviewed.
I am especially afraid of synchronization/volatility issues. Is it enough to synchronize on the
An easy example of what exactly I want:
Imagine you have the contact 'Dr. Richard Stallman' with the phone number '\$+123456\$' who works for the 'fsf'.
Now, you can only remember his first name: 'richard', that he works for the 'fsf', and he is a 'Dr.'. Your search-string to find him, would be, for example:
You'd also find him searching for:
```
using System;
using System.Collections.Generic;
using System.Linq;
namespace at.loup.CSUtilities.Collections
{
public class InfixDictionary
{
private readonly string[] splitStrings = null;
private readonly Dictionary> data = new Dictionary>();
private bool normalizeKeysToUpper;
public InfixDictionary(bool normalizeKeysToUpper = true, params string[] splitStrings)
{
this.splitStrings = splitStrings;
this.normalizeKeysToUpper = normalizeKeysToUpper;
}
public InfixDictionary(bool normalizeKeysToUpper = true)
{
splitStrings = splitStrings = new[] { " " };
this.normalizeKeysToUpper = normalizeKeysToUpper;
}
#region diagnostics
public int SubKeyCount
{
get { return data.Count; }
}
At first, I tried out some tree structures (infix trie/tree) and was shocked by the slow insertion time and memory usage.
That's when I developed the
InfixDictionary, which is internally based on a Dictionary>.I previously used something like it at work and it worked flawlessly, now I rewrote it at home and wanted to get it reviewed.
I am especially afraid of synchronization/volatility issues. Is it enough to synchronize on the
Dictionary itself to guarantee access without trouble? I know the internals of the Dictionary are not volatile.An easy example of what exactly I want:
Imagine you have the contact 'Dr. Richard Stallman' with the phone number '\$+123456\$' who works for the 'fsf'.
Now, you can only remember his first name: 'richard', that he works for the 'fsf', and he is a 'Dr.'. Your search-string to find him, would be, for example:
"fsf richard Dr."You'd also find him searching for:
"ri ic ch ar ric chard f + 123"```
using System;
using System.Collections.Generic;
using System.Linq;
namespace at.loup.CSUtilities.Collections
{
public class InfixDictionary
{
private readonly string[] splitStrings = null;
private readonly Dictionary> data = new Dictionary>();
private bool normalizeKeysToUpper;
public InfixDictionary(bool normalizeKeysToUpper = true, params string[] splitStrings)
{
this.splitStrings = splitStrings;
this.normalizeKeysToUpper = normalizeKeysToUpper;
}
public InfixDictionary(bool normalizeKeysToUpper = true)
{
splitStrings = splitStrings = new[] { " " };
this.normalizeKeysToUpper = normalizeKeysToUpper;
}
#region diagnostics
public int SubKeyCount
{
get { return data.Count; }
}
Solution
lock (first) // lock HashSet element during duplication
first = new List(first);This will result in compiler warning
CS0728 C# Possibly incorrect assignment to local '' which is the argument to a using or lock statement. The Dispose call or unlocking will happen on the original value of the local.
Related to locking: https://stackoverflow.com/a/11775353/2655508
And basically I quite don't understand what you want to prevent with this. The
hashSets variable is a method scoped variable. There won't happen any threading issues here.A much easier way would be
IEnumerable first = new List(hashSetsList.First());but if you really are concerned about thread safety, you really should use the
ConcurrentDictionary.foreach (string subKey in key.Split(splitStrings, StringSplitOptions.RemoveEmptyEntries))
{
if (!data.TryGetValue(subKey, out tmp))
return new List();
hashSets.Add(tmp);
}
if (hashSets.Count == 0)
return new List();
if (hashSets.Count > 1)
{
IEnumerable> hashSetsList = hashSets.ToList();I don't see a reason to check for
hashSets.Count > 1. What else could Count be at this place ?A better way to write this would be to store the result of the
Split() in a variable and return a new List if Length == 0. In this way neither of these checks about the Count property would be needed.The
else if as itself isn't needed at all because if splitStrings != null it won't be reached. So changing it to a simple if will make it more obvious that the condition is about the TryGetValue.Like so
public IEnumerable Lookup(string key)
{
if (key == null || key.Length == 0)
{
return new List();
}
if (normalizeKeysToUpper)
{
key = key.ToUpper();
}
HashSet tmp = null;
if (splitStrings != null)
{
HashSet> hashSets = new HashSet>();
string[] subkeys = key.Split(splitStrings, StringSplitOptions.RemoveEmptyEntries);
if (subkeys.Length == 0)
{
return new List();
}
foreach (string subKey in subkeys)
{
if (!data.TryGetValue(subKey, out tmp))
{
return new List();
}
hashSets.Add(tmp);
}
IEnumerable> hashSetsList = hashSets.ToList();
// first element is duplicated so we can later use a generic way ...
// to aggregate all collected hashsets with only one lock per ...
// aggregation
// it's necessary to explicitly acquire the first element to lock it
IEnumerable first = new List(hashSetsList.First());
return hashSetsList
.Skip(1) // first element is the aggregation seed
.Aggregate(first,
(o, n) =>
{
lock (n)
{
return o.Intersect(n);
}
});
}
// if the key is never split we can simply lookup with it
if (!data.TryGetValue(key, out tmp))
{
return new List();
}
lock (tmp)
{
return tmp.ToList();
}
}Using braces
{} would make your code easier to read and also less error prone.public void Add(T value, params string[] keys)
{
if (keys == null)
return;
if (keys.Length == 1)
{
if (splitStrings == null)
AddInternalThreadsafe(keys[0], value);
else
foreach (string subKey in keys[0].Split(splitStrings, StringSplitOptions.RemoveEmptyEntries))
AddInternalThreadsafe(subKey, value);
}
else
{
if (splitStrings == null)
foreach (string key in keys)
AddInternalThreadsafe(key, value);
else
foreach (string key in keys)
foreach (string subKey in key.Split(splitStrings, StringSplitOptions.RemoveEmptyEntries))
AddInternalThreadsafe(subKey, value);
}
return;
}here you have duplicated code which can be easily removed like so
public void Add(T value, params string[] keys)
{
if (keys == null)
{
return;
}
if (splitStrings == null)
{
foreach (string key in keys)
{
AddInternalThreadsafe(key, value);
}
return;
}
foreach (string key in keys)
{
foreach (string subKey in key.Split(splitStrings, StringSplitOptions.RemoveEmptyEntries))
{
AddInternalThreadsafe(subKey, value);
}
}
}Regions
Please read are-regions-an-antipattern-or-code-smell
Is there a good use for regions?
No. There was a legacy use: generated code. Still, code generation tools just have to use partial classes instead. If C# has regions support, it's mostly because this legacy use, and because now that
Code Snippets
lock (first) // lock HashSet element during duplication
first = new List<T>(first);IEnumerable<T> first = new List<T>(hashSetsList.First());foreach (string subKey in key.Split(splitStrings, StringSplitOptions.RemoveEmptyEntries))
{
if (!data.TryGetValue(subKey, out tmp))
return new List<T>();
hashSets.Add(tmp);
}
if (hashSets.Count == 0)
return new List<T>();
if (hashSets.Count > 1)
{
IEnumerable<IEnumerable<T>> hashSetsList = hashSets.ToList();public IEnumerable<T> Lookup(string key)
{
if (key == null || key.Length == 0)
{
return new List<T>();
}
if (normalizeKeysToUpper)
{
key = key.ToUpper();
}
HashSet<T> tmp = null;
if (splitStrings != null)
{
HashSet<IEnumerable<T>> hashSets = new HashSet<IEnumerable<T>>();
string[] subkeys = key.Split(splitStrings, StringSplitOptions.RemoveEmptyEntries);
if (subkeys.Length == 0)
{
return new List<T>();
}
foreach (string subKey in subkeys)
{
if (!data.TryGetValue(subKey, out tmp))
{
return new List<T>();
}
hashSets.Add(tmp);
}
IEnumerable<IEnumerable<T>> hashSetsList = hashSets.ToList();
// first element is duplicated so we can later use a generic way ...
// to aggregate all collected hashsets with only one lock per ...
// aggregation
// it's necessary to explicitly acquire the first element to lock it
IEnumerable<T> first = new List<T>(hashSetsList.First());
return hashSetsList
.Skip(1) // first element is the aggregation seed
.Aggregate(first,
(o, n) =>
{
lock (n)
{
return o.Intersect(n);
}
});
}
// if the key is never split we can simply lookup with it
if (!data.TryGetValue(key, out tmp))
{
return new List<T>();
}
lock (tmp)
{
return tmp.ToList();
}
}public void Add(T value, params string[] keys)
{
if (keys == null)
return;
if (keys.Length == 1)
{
if (splitStrings == null)
AddInternalThreadsafe(keys[0], value);
else
foreach (string subKey in keys[0].Split(splitStrings, StringSplitOptions.RemoveEmptyEntries))
AddInternalThreadsafe(subKey, value);
}
else
{
if (splitStrings == null)
foreach (string key in keys)
AddInternalThreadsafe(key, value);
else
foreach (string key in keys)
foreach (string subKey in key.Split(splitStrings, StringSplitOptions.RemoveEmptyEntries))
AddInternalThreadsafe(subKey, value);
}
return;
}Context
StackExchange Code Review Q#102084, answer score: 2
Revisions (0)
No revisions yet.