patternjavascriptMinor
Reduce duplicate sequences
Viewed 0 times
duplicatereducesequences
Problem
I am writing a copy paste detector using esprima.js and Hmm.
This particular function checks a number of sequences and reduces them.
Due to the parsing I might have 2 sequences with the same piece of text, if so I need to merge the 2 sequences and add the start/ends ( both are source token indexes ) and occurrences ( contains location of the dupes in the original source ).
Obviously I could apply some DRY to this code, but I feel that every time I start I actually make the code less readable. Also I am not too happy with the naming, is there some good vocabulary out there for comparing sequences what is a good name for
Anyway, all feedback is welcome, though I know that I am on a lonely crusade that
```
Tokens.prototype.reduceSequences = function()
{
var sequences = this.sequences,
tokens = this.raw();
if( sequences.length < 2 )
return;
for( var outer = 1 ; outer < sequences.length ; outer++)
{
for( var inner = 0 ; inner < outer ; inner ++)
{
if( sequences[inner].content == sequences[outer].content )
{
if( !~sequences[inner].starts.indexOf( sequences[outer].starts[0] ) )
{
sequences[inner].starts.push( sequences[outer].starts[0] );
sequences[inner].ends.push( sequences[outer].starts[0] + sequences[inner].tokenCount );
sequences[inner].occurences.push( sequences[outer].occurences[0] );
}
if( !~sequences[inner].starts.indexOf( sequences[outer].starts[1] ) )
{
sequences[inner].starts.push( sequences[outer].starts[1] );
sequences[inner].ends.push( sequences[outer].starts[1] + sequences[inner].tokenCount );
sequences[inner].occurences.push( sequences[outer].occurences[1] );
}
sequences.splice(outer, 1);
This particular function checks a number of sequences and reduces them.
Due to the parsing I might have 2 sequences with the same piece of text, if so I need to merge the 2 sequences and add the start/ends ( both are source token indexes ) and occurrences ( contains location of the dupes in the original source ).
Obviously I could apply some DRY to this code, but I feel that every time I start I actually make the code less readable. Also I am not too happy with the naming, is there some good vocabulary out there for comparing sequences what is a good name for
sequences[inner] and sequences[outer].Anyway, all feedback is welcome, though I know that I am on a lonely crusade that
~ is so superior to -1 that I am sticking it to it indefinitely.```
Tokens.prototype.reduceSequences = function()
{
var sequences = this.sequences,
tokens = this.raw();
if( sequences.length < 2 )
return;
for( var outer = 1 ; outer < sequences.length ; outer++)
{
for( var inner = 0 ; inner < outer ; inner ++)
{
if( sequences[inner].content == sequences[outer].content )
{
if( !~sequences[inner].starts.indexOf( sequences[outer].starts[0] ) )
{
sequences[inner].starts.push( sequences[outer].starts[0] );
sequences[inner].ends.push( sequences[outer].starts[0] + sequences[inner].tokenCount );
sequences[inner].occurences.push( sequences[outer].occurences[0] );
}
if( !~sequences[inner].starts.indexOf( sequences[outer].starts[1] ) )
{
sequences[inner].starts.push( sequences[outer].starts[1] );
sequences[inner].ends.push( sequences[outer].starts[1] + sequences[inner].tokenCount );
sequences[inner].occurences.push( sequences[outer].occurences[1] );
}
sequences.splice(outer, 1);
Solution
It looks like you could use
I think that should work.
Oh, and yes, I've undone your brace-on-new-line style - sorry, couldn't help myself. But I've left out braces I'd usually include, so I've met you halfway! :)
If
It's not brilliant, but either way it feels a tiny bit cleaner to encapsulate the "merge" step in a function.
By the way,
It might be more beneficial to simply add the
reduce to your advantage, and extracting a function to merge two sequences. (Edit: messed up 1st version; this should be better)Tokens.prototype.reduceSequences = function () {
// source and destination seem like slightly better names
function merge(source, destination) {
// can't merge, so return false
if( !~destination.content.indexOf( source.content ) )
return false;
// no need to actually merge (source included in destination),
// so just return true
if( destination.content != source.content )
return true;
// otherwise, merge
for( var i = 0, l = source.starts.length ; i < l ; i++ ) {
if( ~destination.starts.indexOf(source.starts[i]) ) continue;
destination.starts.push( source.starts[i] );
destination.ends.push( source.starts[i] + destination.tokenCount );
destination.occurences.push( source.occurences[i] );
}
return true;
}
if(this.sequences.length < 2) return;
this.sequences = this.sequences.reduce(function (sequences, current) {
var merged, i, l;
for( i = 0, l = sequences.length ; i < l && !merged ; i++ )
merged = merge(current, sequences[i]);
if( !merged )
sequences.push(current);
return sequences;
}, []);
}I think that should work.
Oh, and yes, I've undone your brace-on-new-line style - sorry, couldn't help myself. But I've left out braces I'd usually include, so I've met you halfway! :)
If
reduce is not an option, I can't really come up with anything besides extracting a merge function (basically a "lighter" refactoring). Like soTokens.prototype.reduceSequences = function () {
// source and destination seem like slightly better names to use *here*
// but they don't make as much sense in the loops below, IMO
function merge(source, destination) {
for( var i = 0, l = source.starts.length ; i < l ; i++ ) {
if( ~destination.starts.indexOf(source.starts[i]) ) continue;
destination.starts.push( source.starts[i] );
destination.ends.push( source.starts[i] + destination.tokenCount );
destination.occurences.push( source.occurences[i] );
}
}
var sequences = this.sequences;
for( var outer = 1 ; outer < sequences.length ; outer++) {
for( var inner = 0 ; inner < outer ; inner ++) {
if( sequences[inner].content == sequences[outer].content ) {
merge(sequences[outer], sequences[inner]);
sequences.splice(outer, 1);
outer = 0;
break;
}
if( ~sequences[inner].content.indexOf( sequences[outer].content ) ) {
sequences.splice(outer, 1);
outer = 0;
break;
}
}
}
}It's not brilliant, but either way it feels a tiny bit cleaner to encapsulate the "merge" step in a function.
By the way,
var tokens = this.raw() didn't seem to be used for anything.It might be more beneficial to simply add the
merge function to your sequence objects, though. It'd clean up this particular code nicely. But you know better than I if something like that would make sense in your project.Code Snippets
Tokens.prototype.reduceSequences = function () {
// source and destination seem like slightly better names
function merge(source, destination) {
// can't merge, so return false
if( !~destination.content.indexOf( source.content ) )
return false;
// no need to actually merge (source included in destination),
// so just return true
if( destination.content != source.content )
return true;
// otherwise, merge
for( var i = 0, l = source.starts.length ; i < l ; i++ ) {
if( ~destination.starts.indexOf(source.starts[i]) ) continue;
destination.starts.push( source.starts[i] );
destination.ends.push( source.starts[i] + destination.tokenCount );
destination.occurences.push( source.occurences[i] );
}
return true;
}
if(this.sequences.length < 2) return;
this.sequences = this.sequences.reduce(function (sequences, current) {
var merged, i, l;
for( i = 0, l = sequences.length ; i < l && !merged ; i++ )
merged = merge(current, sequences[i]);
if( !merged )
sequences.push(current);
return sequences;
}, []);
}Tokens.prototype.reduceSequences = function () {
// source and destination seem like slightly better names to use *here*
// but they don't make as much sense in the loops below, IMO
function merge(source, destination) {
for( var i = 0, l = source.starts.length ; i < l ; i++ ) {
if( ~destination.starts.indexOf(source.starts[i]) ) continue;
destination.starts.push( source.starts[i] );
destination.ends.push( source.starts[i] + destination.tokenCount );
destination.occurences.push( source.occurences[i] );
}
}
var sequences = this.sequences;
for( var outer = 1 ; outer < sequences.length ; outer++) {
for( var inner = 0 ; inner < outer ; inner ++) {
if( sequences[inner].content == sequences[outer].content ) {
merge(sequences[outer], sequences[inner]);
sequences.splice(outer, 1);
outer = 0;
break;
}
if( ~sequences[inner].content.indexOf( sequences[outer].content ) ) {
sequences.splice(outer, 1);
outer = 0;
break;
}
}
}
}Context
StackExchange Code Review Q#55934, answer score: 2
Revisions (0)
No revisions yet.