patternsqlMinor
Fuzzy Matching with Postgresql 9.3
Viewed 0 times
withpostgresqlmatchingfuzzy
Problem
Right now I am using ts_rank (with ts_vector and ts_query) to search strings for relevance. While this is doing a great job for me so far. I was hoping to work in a bit of fuzzieness into the searching.
As it stands the search is done by a user entering their terms as a string, that I then parse into tokens. These tokens are then compared against a string. What I would like to do is use something similar to a levenshtein in these queries to allow for minor misspellings to be matched as well.
For example if the user typed in rubico instead of rubicon I would still get a match on it (because rubico only has a distance of 1 from rubicon).
Is this possible to do with Postgres' ts_rank functionality, or are there other options to allow for text searches to work with levenstheins?
As it stands the search is done by a user entering their terms as a string, that I then parse into tokens. These tokens are then compared against a string. What I would like to do is use something similar to a levenshtein in these queries to allow for minor misspellings to be matched as well.
For example if the user typed in rubico instead of rubicon I would still get a match on it (because rubico only has a distance of 1 from rubicon).
Is this possible to do with Postgres' ts_rank functionality, or are there other options to allow for text searches to work with levenstheins?
Solution
It can be done, but it's not necessarily fun, or fast.
Your best bet for fuzzy matching is "Soft TFIDF" (pdf), probably using Jaro Winkler similarity. Jaro Winkler is similar to Levenshtein but weights letters more heavily at the beginning of a string. It's available in the pg_similarity extension.
Here's a function I wrote a while ago, which I haven't tested rigorously. It's basically a port from here https://github.com/TeamCohen/secondstring
array_unique() is from the smlar extension. tokenize() just splits on non-alphanums. The table smlar_stats is like this:
You'll probably want to create such a table dynamically though.
Your best bet for fuzzy matching is "Soft TFIDF" (pdf), probably using Jaro Winkler similarity. Jaro Winkler is similar to Levenshtein but weights letters more heavily at the beginning of a string. It's available in the pg_similarity extension.
Here's a function I wrote a while ago, which I haven't tested rigorously. It's basically a port from here https://github.com/TeamCohen/secondstring
array_unique() is from the smlar extension. tokenize() just splits on non-alphanums. The table smlar_stats is like this:
create table smlar_stats (
value text unique,
ndoc int not null
);You'll probably want to create such a table dynamically though.
CREATE OR REPLACE FUNCTION soft_tfidf(s text, t text, debug boolean DEFAULT false)
RETURNS real AS
$BODY$
declare
s_bag text[] = array_unique( tokenize(s) );
t_bag text[] = array_unique( tokenize(t) );
tok text;
tok_j text;
sim real = 0.0;
jw real;
threshold real = 0.78;
match_score real;
match_tok text;
begin
if debug then raise notice 's_bag: % t_bag: %', s_bag, t_bag; end if;
foreach tok in array s_bag loop
if inarray(t_bag, tok) then
sim = sim + bag_weight(s_bag, tok) * bag_weight(t_bag, tok);
else
match_score = threshold;
match_tok = null;
foreach tok_j in array t_bag loop
jw = jarowinkler(tok, tok_j);
if jw >= match_score then
match_tok = tok_j;
match_score = jw; if debug then raise notice 'jw(%,%)=%', tok, tok_j, jw; end if;
end if;
end loop;
if match_tok is not null then
sim = sim + bag_weight(s_bag, tok) * bag_weight(t_bag, match_tok) * match_score;
end if;
end if;
end loop;
return sim;
end $BODY$
LANGUAGE plpgsql VOLATILE STRICT
CREATE OR REPLACE FUNCTION bag_weight(text[], text)
RETURNS real AS
$BODY$
declare
w real;
normalizer real = 0.0;
token text;
df real;
weights real[] = '{}';
begin
foreach token in array $1 loop
select ndoc from smlar_stats where value = token into df;
if df is null then df = 1.0; end if;
w = log(2) * log(301.1 / df);
weights = array_append(weights, w);
normalizer = normalizer + w*w;
end loop;
normalizer = sqrt(normalizer);
for i in 1..array_length($1, 1) loop
if $1[i] = $2 then
return weights[i] / normalizer;
end if;
end loop;
end $BODY$
LANGUAGE plpgsql VOLATILE STRICTCode Snippets
create table smlar_stats (
value text unique,
ndoc int not null
);CREATE OR REPLACE FUNCTION soft_tfidf(s text, t text, debug boolean DEFAULT false)
RETURNS real AS
$BODY$
declare
s_bag text[] = array_unique( tokenize(s) );
t_bag text[] = array_unique( tokenize(t) );
tok text;
tok_j text;
sim real = 0.0;
jw real;
threshold real = 0.78;
match_score real;
match_tok text;
begin
if debug then raise notice 's_bag: % t_bag: %', s_bag, t_bag; end if;
foreach tok in array s_bag loop
if inarray(t_bag, tok) then
sim = sim + bag_weight(s_bag, tok) * bag_weight(t_bag, tok);
else
match_score = threshold;
match_tok = null;
foreach tok_j in array t_bag loop
jw = jarowinkler(tok, tok_j);
if jw >= match_score then
match_tok = tok_j;
match_score = jw; if debug then raise notice 'jw(%,%)=%', tok, tok_j, jw; end if;
end if;
end loop;
if match_tok is not null then
sim = sim + bag_weight(s_bag, tok) * bag_weight(t_bag, match_tok) * match_score;
end if;
end if;
end loop;
return sim;
end $BODY$
LANGUAGE plpgsql VOLATILE STRICT
CREATE OR REPLACE FUNCTION bag_weight(text[], text)
RETURNS real AS
$BODY$
declare
w real;
normalizer real = 0.0;
token text;
df real;
weights real[] = '{}';
begin
foreach token in array $1 loop
select ndoc from smlar_stats where value = token into df;
if df is null then df = 1.0; end if;
w = log(2) * log(301.1 / df);
weights = array_append(weights, w);
normalizer = normalizer + w*w;
end loop;
normalizer = sqrt(normalizer);
for i in 1..array_length($1, 1) loop
if $1[i] = $2 then
return weights[i] / normalizer;
end if;
end loop;
end $BODY$
LANGUAGE plpgsql VOLATILE STRICTContext
StackExchange Database Administrators Q#80357, answer score: 3
Revisions (0)
No revisions yet.