HiveBrain v1.2.0
Get Started
← Back to all entries
patternjavascriptMinor

Reduce duplicate sequences

Submitted by: @import:stackexchange-codereview··
0
Viewed 0 times
duplicatereducesequences

Problem

I am writing a copy paste detector using esprima.js and Hmm.

This particular function checks a number of sequences and reduces them.
Due to the parsing I might have 2 sequences with the same piece of text, if so I need to merge the 2 sequences and add the start/ends ( both are source token indexes ) and occurrences ( contains location of the dupes in the original source ).

Obviously I could apply some DRY to this code, but I feel that every time I start I actually make the code less readable. Also I am not too happy with the naming, is there some good vocabulary out there for comparing sequences what is a good name for sequences[inner] and sequences[outer].

Anyway, all feedback is welcome, though I know that I am on a lonely crusade that ~ is so superior to -1 that I am sticking it to it indefinitely.

```
Tokens.prototype.reduceSequences = function()
{
var sequences = this.sequences,
tokens = this.raw();

if( sequences.length < 2 )
return;

for( var outer = 1 ; outer < sequences.length ; outer++)
{
for( var inner = 0 ; inner < outer ; inner ++)
{
if( sequences[inner].content == sequences[outer].content )
{

if( !~sequences[inner].starts.indexOf( sequences[outer].starts[0] ) )
{
sequences[inner].starts.push( sequences[outer].starts[0] );
sequences[inner].ends.push( sequences[outer].starts[0] + sequences[inner].tokenCount );
sequences[inner].occurences.push( sequences[outer].occurences[0] );
}

if( !~sequences[inner].starts.indexOf( sequences[outer].starts[1] ) )
{
sequences[inner].starts.push( sequences[outer].starts[1] );
sequences[inner].ends.push( sequences[outer].starts[1] + sequences[inner].tokenCount );
sequences[inner].occurences.push( sequences[outer].occurences[1] );
}

sequences.splice(outer, 1);

Solution

It looks like you could use reduce to your advantage, and extracting a function to merge two sequences. (Edit: messed up 1st version; this should be better)

Tokens.prototype.reduceSequences = function () {
  // source and destination seem like slightly better names
  function merge(source, destination) {
    // can't merge, so return false
    if( !~destination.content.indexOf( source.content ) )
      return false;

    // no need to actually merge (source included in destination),
    // so just return true
    if( destination.content != source.content )
      return true;

    // otherwise, merge
    for( var i = 0, l = source.starts.length ; i < l ; i++ ) {
      if( ~destination.starts.indexOf(source.starts[i]) ) continue;
      destination.starts.push( source.starts[i] );
      destination.ends.push( source.starts[i] + destination.tokenCount );
      destination.occurences.push( source.occurences[i] );
    }

    return true;
  }

  if(this.sequences.length < 2) return;

  this.sequences = this.sequences.reduce(function (sequences, current) {
    var merged, i, l;

    for( i = 0, l = sequences.length ; i < l && !merged ; i++ )
      merged = merge(current, sequences[i]);

    if( !merged )
      sequences.push(current);

    return sequences;
  }, []);
}


I think that should work.

Oh, and yes, I've undone your brace-on-new-line style - sorry, couldn't help myself. But I've left out braces I'd usually include, so I've met you halfway! :)

If reduce is not an option, I can't really come up with anything besides extracting a merge function (basically a "lighter" refactoring). Like so

Tokens.prototype.reduceSequences = function () {
  // source and destination seem like slightly better names to use *here*
  // but they don't make as much sense in the loops below, IMO
  function merge(source, destination) {
    for( var i = 0, l = source.starts.length ; i < l ; i++ ) {
      if( ~destination.starts.indexOf(source.starts[i]) ) continue;
      destination.starts.push( source.starts[i] );
      destination.ends.push( source.starts[i] + destination.tokenCount );
      destination.occurences.push( source.occurences[i] );
    }
  }

  var sequences = this.sequences;

  for( var outer = 1 ; outer < sequences.length ; outer++) {
    for( var inner = 0 ; inner < outer ; inner ++) {
      if( sequences[inner].content == sequences[outer].content ) {
        merge(sequences[outer], sequences[inner]);
        sequences.splice(outer, 1);
        outer = 0;
        break;
      }

      if( ~sequences[inner].content.indexOf( sequences[outer].content )  ) {
        sequences.splice(outer, 1);
        outer = 0;
        break;
      }
    }
  }
}


It's not brilliant, but either way it feels a tiny bit cleaner to encapsulate the "merge" step in a function.

By the way, var tokens = this.raw() didn't seem to be used for anything.

It might be more beneficial to simply add the merge function to your sequence objects, though. It'd clean up this particular code nicely. But you know better than I if something like that would make sense in your project.

Code Snippets

Tokens.prototype.reduceSequences = function () {
  // source and destination seem like slightly better names
  function merge(source, destination) {
    // can't merge, so return false
    if( !~destination.content.indexOf( source.content ) )
      return false;

    // no need to actually merge (source included in destination),
    // so just return true
    if( destination.content != source.content )
      return true;

    // otherwise, merge
    for( var i = 0, l = source.starts.length ; i < l ; i++ ) {
      if( ~destination.starts.indexOf(source.starts[i]) ) continue;
      destination.starts.push( source.starts[i] );
      destination.ends.push( source.starts[i] + destination.tokenCount );
      destination.occurences.push( source.occurences[i] );
    }

    return true;
  }

  if(this.sequences.length < 2) return;

  this.sequences = this.sequences.reduce(function (sequences, current) {
    var merged, i, l;

    for( i = 0, l = sequences.length ; i < l && !merged ; i++ )
      merged = merge(current, sequences[i]);

    if( !merged )
      sequences.push(current);

    return sequences;
  }, []);
}
Tokens.prototype.reduceSequences = function () {
  // source and destination seem like slightly better names to use *here*
  // but they don't make as much sense in the loops below, IMO
  function merge(source, destination) {
    for( var i = 0, l = source.starts.length ; i < l ; i++ ) {
      if( ~destination.starts.indexOf(source.starts[i]) ) continue;
      destination.starts.push( source.starts[i] );
      destination.ends.push( source.starts[i] + destination.tokenCount );
      destination.occurences.push( source.occurences[i] );
    }
  }

  var sequences = this.sequences;

  for( var outer = 1 ; outer < sequences.length ; outer++) {
    for( var inner = 0 ; inner < outer ; inner ++) {
      if( sequences[inner].content == sequences[outer].content ) {
        merge(sequences[outer], sequences[inner]);
        sequences.splice(outer, 1);
        outer = 0;
        break;
      }

      if( ~sequences[inner].content.indexOf( sequences[outer].content )  ) {
        sequences.splice(outer, 1);
        outer = 0;
        break;
      }
    }
  }
}

Context

StackExchange Code Review Q#55934, answer score: 2

Revisions (0)

No revisions yet.