summaryrefslogtreecommitdiffstats
path: root/pintos-progos/tests/Algorithm/Diff.pm
diff options
context:
space:
mode:
Diffstat (limited to 'pintos-progos/tests/Algorithm/Diff.pm')
-rw-r--r--pintos-progos/tests/Algorithm/Diff.pm1713
1 files changed, 1713 insertions, 0 deletions
diff --git a/pintos-progos/tests/Algorithm/Diff.pm b/pintos-progos/tests/Algorithm/Diff.pm
new file mode 100644
index 0000000..904c530
--- /dev/null
+++ b/pintos-progos/tests/Algorithm/Diff.pm
@@ -0,0 +1,1713 @@
1package Algorithm::Diff;
2# Skip to first "=head" line for documentation.
3use strict;
4
5use integer; # see below in _replaceNextLargerWith() for mod to make
6 # if you don't use this
7use vars qw( $VERSION @EXPORT_OK );
8$VERSION = 1.19_01;
9# ^ ^^ ^^-- Incremented at will
10# | \+----- Incremented for non-trivial changes to features
11# \-------- Incremented for fundamental changes
12require Exporter;
13*import = \&Exporter::import;
14@EXPORT_OK = qw(
15 prepare LCS LCDidx LCS_length
16 diff sdiff compact_diff
17 traverse_sequences traverse_balanced
18);
19
20# McIlroy-Hunt diff algorithm
21# Adapted from the Smalltalk code of Mario I. Wolczko, <mario@wolczko.com>
22# by Ned Konz, perl@bike-nomad.com
23# Updates by Tye McQueen, http://perlmonks.org/?node=tye
24
25# Create a hash that maps each element of $aCollection to the set of
26# positions it occupies in $aCollection, restricted to the elements
27# within the range of indexes specified by $start and $end.
28# The fourth parameter is a subroutine reference that will be called to
29# generate a string to use as a key.
30# Additional parameters, if any, will be passed to this subroutine.
31#
32# my $hashRef = _withPositionsOfInInterval( \@array, $start, $end, $keyGen );
33
34sub _withPositionsOfInInterval
35{
36 my $aCollection = shift; # array ref
37 my $start = shift;
38 my $end = shift;
39 my $keyGen = shift;
40 my %d;
41 my $index;
42 for ( $index = $start ; $index <= $end ; $index++ )
43 {
44 my $element = $aCollection->[$index];
45 my $key = &$keyGen( $element, @_ );
46 if ( exists( $d{$key} ) )
47 {
48 unshift ( @{ $d{$key} }, $index );
49 }
50 else
51 {
52 $d{$key} = [$index];
53 }
54 }
55 return wantarray ? %d : \%d;
56}
57
58# Find the place at which aValue would normally be inserted into the
59# array. If that place is already occupied by aValue, do nothing, and
60# return undef. If the place does not exist (i.e., it is off the end of
61# the array), add it to the end, otherwise replace the element at that
62# point with aValue. It is assumed that the array's values are numeric.
63# This is where the bulk (75%) of the time is spent in this module, so
64# try to make it fast!
65
66sub _replaceNextLargerWith
67{
68 my ( $array, $aValue, $high ) = @_;
69 $high ||= $#$array;
70
71 # off the end?
72 if ( $high == -1 || $aValue > $array->[-1] )
73 {
74 push ( @$array, $aValue );
75 return $high + 1;
76 }
77
78 # binary search for insertion point...
79 my $low = 0;
80 my $index;
81 my $found;
82 while ( $low <= $high )
83 {
84 $index = ( $high + $low ) / 2;
85
86 # $index = int(( $high + $low ) / 2); # without 'use integer'
87 $found = $array->[$index];
88
89 if ( $aValue == $found )
90 {
91 return undef;
92 }
93 elsif ( $aValue > $found )
94 {
95 $low = $index + 1;
96 }
97 else
98 {
99 $high = $index - 1;
100 }
101 }
102
103 # now insertion point is in $low.
104 $array->[$low] = $aValue; # overwrite next larger
105 return $low;
106}
107
108# This method computes the longest common subsequence in $a and $b.
109
110# Result is array or ref, whose contents is such that
111# $a->[ $i ] == $b->[ $result[ $i ] ]
112# foreach $i in ( 0 .. $#result ) if $result[ $i ] is defined.
113
114# An additional argument may be passed; this is a hash or key generating
115# function that should return a string that uniquely identifies the given
116# element. It should be the case that if the key is the same, the elements
117# will compare the same. If this parameter is undef or missing, the key
118# will be the element as a string.
119
120# By default, comparisons will use "eq" and elements will be turned into keys
121# using the default stringizing operator '""'.
122
123# Additional parameters, if any, will be passed to the key generation
124# routine.
125
126sub _longestCommonSubsequence
127{
128 my $a = shift; # array ref or hash ref
129 my $b = shift; # array ref or hash ref
130 my $counting = shift; # scalar
131 my $keyGen = shift; # code ref
132 my $compare; # code ref
133
134 if ( ref($a) eq 'HASH' )
135 { # prepared hash must be in $b
136 my $tmp = $b;
137 $b = $a;
138 $a = $tmp;
139 }
140
141 # Check for bogus (non-ref) argument values
142 if ( !ref($a) || !ref($b) )
143 {
144 my @callerInfo = caller(1);
145 die 'error: must pass array or hash references to ' . $callerInfo[3];
146 }
147
148 # set up code refs
149 # Note that these are optimized.
150 if ( !defined($keyGen) ) # optimize for strings
151 {
152 $keyGen = sub { $_[0] };
153 $compare = sub { my ( $a, $b ) = @_; $a eq $b };
154 }
155 else
156 {
157 $compare = sub {
158 my $a = shift;
159 my $b = shift;
160 &$keyGen( $a, @_ ) eq &$keyGen( $b, @_ );
161 };
162 }
163
164 my ( $aStart, $aFinish, $matchVector ) = ( 0, $#$a, [] );
165 my ( $prunedCount, $bMatches ) = ( 0, {} );
166
167 if ( ref($b) eq 'HASH' ) # was $bMatches prepared for us?
168 {
169 $bMatches = $b;
170 }
171 else
172 {
173 my ( $bStart, $bFinish ) = ( 0, $#$b );
174
175 # First we prune off any common elements at the beginning
176 while ( $aStart <= $aFinish
177 and $bStart <= $bFinish
178 and &$compare( $a->[$aStart], $b->[$bStart], @_ ) )
179 {
180 $matchVector->[ $aStart++ ] = $bStart++;
181 $prunedCount++;
182 }
183
184 # now the end
185 while ( $aStart <= $aFinish
186 and $bStart <= $bFinish
187 and &$compare( $a->[$aFinish], $b->[$bFinish], @_ ) )
188 {
189 $matchVector->[ $aFinish-- ] = $bFinish--;
190 $prunedCount++;
191 }
192
193 # Now compute the equivalence classes of positions of elements
194 $bMatches =
195 _withPositionsOfInInterval( $b, $bStart, $bFinish, $keyGen, @_ );
196 }
197 my $thresh = [];
198 my $links = [];
199
200 my ( $i, $ai, $j, $k );
201 for ( $i = $aStart ; $i <= $aFinish ; $i++ )
202 {
203 $ai = &$keyGen( $a->[$i], @_ );
204 if ( exists( $bMatches->{$ai} ) )
205 {
206 $k = 0;
207 for $j ( @{ $bMatches->{$ai} } )
208 {
209
210 # optimization: most of the time this will be true
211 if ( $k and $thresh->[$k] > $j and $thresh->[ $k - 1 ] < $j )
212 {
213 $thresh->[$k] = $j;
214 }
215 else
216 {
217 $k = _replaceNextLargerWith( $thresh, $j, $k );
218 }
219
220 # oddly, it's faster to always test this (CPU cache?).
221 if ( defined($k) )
222 {
223 $links->[$k] =
224 [ ( $k ? $links->[ $k - 1 ] : undef ), $i, $j ];
225 }
226 }
227 }
228 }
229
230 if (@$thresh)
231 {
232 return $prunedCount + @$thresh if $counting;
233 for ( my $link = $links->[$#$thresh] ; $link ; $link = $link->[0] )
234 {
235 $matchVector->[ $link->[1] ] = $link->[2];
236 }
237 }
238 elsif ($counting)
239 {
240 return $prunedCount;
241 }
242
243 return wantarray ? @$matchVector : $matchVector;
244}
245
246sub traverse_sequences
247{
248 my $a = shift; # array ref
249 my $b = shift; # array ref
250 my $callbacks = shift || {};
251 my $keyGen = shift;
252 my $matchCallback = $callbacks->{'MATCH'} || sub { };
253 my $discardACallback = $callbacks->{'DISCARD_A'} || sub { };
254 my $finishedACallback = $callbacks->{'A_FINISHED'};
255 my $discardBCallback = $callbacks->{'DISCARD_B'} || sub { };
256 my $finishedBCallback = $callbacks->{'B_FINISHED'};
257 my $matchVector = _longestCommonSubsequence( $a, $b, 0, $keyGen, @_ );
258
259 # Process all the lines in @$matchVector
260 my $lastA = $#$a;
261 my $lastB = $#$b;
262 my $bi = 0;
263 my $ai;
264
265 for ( $ai = 0 ; $ai <= $#$matchVector ; $ai++ )
266 {
267 my $bLine = $matchVector->[$ai];
268 if ( defined($bLine) ) # matched
269 {
270 &$discardBCallback( $ai, $bi++, @_ ) while $bi < $bLine;
271 &$matchCallback( $ai, $bi++, @_ );
272 }
273 else
274 {
275 &$discardACallback( $ai, $bi, @_ );
276 }
277 }
278
279 # The last entry (if any) processed was a match.
280 # $ai and $bi point just past the last matching lines in their sequences.
281
282 while ( $ai <= $lastA or $bi <= $lastB )
283 {
284
285 # last A?
286 if ( $ai == $lastA + 1 and $bi <= $lastB )
287 {
288 if ( defined($finishedACallback) )
289 {
290 &$finishedACallback( $lastA, @_ );
291 $finishedACallback = undef;
292 }
293 else
294 {
295 &$discardBCallback( $ai, $bi++, @_ ) while $bi <= $lastB;
296 }
297 }
298
299 # last B?
300 if ( $bi == $lastB + 1 and $ai <= $lastA )
301 {
302 if ( defined($finishedBCallback) )
303 {
304 &$finishedBCallback( $lastB, @_ );
305 $finishedBCallback = undef;
306 }
307 else
308 {
309 &$discardACallback( $ai++, $bi, @_ ) while $ai <= $lastA;
310 }
311 }
312
313 &$discardACallback( $ai++, $bi, @_ ) if $ai <= $lastA;
314 &$discardBCallback( $ai, $bi++, @_ ) if $bi <= $lastB;
315 }
316
317 return 1;
318}
319
320sub traverse_balanced
321{
322 my $a = shift; # array ref
323 my $b = shift; # array ref
324 my $callbacks = shift || {};
325 my $keyGen = shift;
326 my $matchCallback = $callbacks->{'MATCH'} || sub { };
327 my $discardACallback = $callbacks->{'DISCARD_A'} || sub { };
328 my $discardBCallback = $callbacks->{'DISCARD_B'} || sub { };
329 my $changeCallback = $callbacks->{'CHANGE'};
330 my $matchVector = _longestCommonSubsequence( $a, $b, 0, $keyGen, @_ );
331
332 # Process all the lines in match vector
333 my $lastA = $#$a;
334 my $lastB = $#$b;
335 my $bi = 0;
336 my $ai = 0;
337 my $ma = -1;
338 my $mb;
339
340 while (1)
341 {
342
343 # Find next match indices $ma and $mb
344 do {
345 $ma++;
346 } while(
347 $ma <= $#$matchVector
348 && !defined $matchVector->[$ma]
349 );
350
351 last if $ma > $#$matchVector; # end of matchVector?
352 $mb = $matchVector->[$ma];
353
354 # Proceed with discard a/b or change events until
355 # next match
356 while ( $ai < $ma || $bi < $mb )
357 {
358
359 if ( $ai < $ma && $bi < $mb )
360 {
361
362 # Change
363 if ( defined $changeCallback )
364 {
365 &$changeCallback( $ai++, $bi++, @_ );
366 }
367 else
368 {
369 &$discardACallback( $ai++, $bi, @_ );
370 &$discardBCallback( $ai, $bi++, @_ );
371 }
372 }
373 elsif ( $ai < $ma )
374 {
375 &$discardACallback( $ai++, $bi, @_ );
376 }
377 else
378 {
379
380 # $bi < $mb
381 &$discardBCallback( $ai, $bi++, @_ );
382 }
383 }
384
385 # Match
386 &$matchCallback( $ai++, $bi++, @_ );
387 }
388
389 while ( $ai <= $lastA || $bi <= $lastB )
390 {
391 if ( $ai <= $lastA && $bi <= $lastB )
392 {
393
394 # Change
395 if ( defined $changeCallback )
396 {
397 &$changeCallback( $ai++, $bi++, @_ );
398 }
399 else
400 {
401 &$discardACallback( $ai++, $bi, @_ );
402 &$discardBCallback( $ai, $bi++, @_ );
403 }
404 }
405 elsif ( $ai <= $lastA )
406 {
407 &$discardACallback( $ai++, $bi, @_ );
408 }
409 else
410 {
411
412 # $bi <= $lastB
413 &$discardBCallback( $ai, $bi++, @_ );
414 }
415 }
416
417 return 1;
418}
419
420sub prepare
421{
422 my $a = shift; # array ref
423 my $keyGen = shift; # code ref
424
425 # set up code ref
426 $keyGen = sub { $_[0] } unless defined($keyGen);
427
428 return scalar _withPositionsOfInInterval( $a, 0, $#$a, $keyGen, @_ );
429}
430
431sub LCS
432{
433 my $a = shift; # array ref
434 my $b = shift; # array ref or hash ref
435 my $matchVector = _longestCommonSubsequence( $a, $b, 0, @_ );
436 my @retval;
437 my $i;
438 for ( $i = 0 ; $i <= $#$matchVector ; $i++ )
439 {
440 if ( defined( $matchVector->[$i] ) )
441 {
442 push ( @retval, $a->[$i] );
443 }
444 }
445 return wantarray ? @retval : \@retval;
446}
447
448sub LCS_length
449{
450 my $a = shift; # array ref
451 my $b = shift; # array ref or hash ref
452 return _longestCommonSubsequence( $a, $b, 1, @_ );
453}
454
455sub LCSidx
456{
457 my $a= shift @_;
458 my $b= shift @_;
459 my $match= _longestCommonSubsequence( $a, $b, 0, @_ );
460 my @am= grep defined $match->[$_], 0..$#$match;
461 my @bm= @{$match}[@am];
462 return \@am, \@bm;
463}
464
465sub compact_diff
466{
467 my $a= shift @_;
468 my $b= shift @_;
469 my( $am, $bm )= LCSidx( $a, $b, @_ );
470 my @cdiff;
471 my( $ai, $bi )= ( 0, 0 );
472 push @cdiff, $ai, $bi;
473 while( 1 ) {
474 while( @$am && $ai == $am->[0] && $bi == $bm->[0] ) {
475 shift @$am;
476 shift @$bm;
477 ++$ai, ++$bi;
478 }
479 push @cdiff, $ai, $bi;
480 last if ! @$am;
481 $ai = $am->[0];
482 $bi = $bm->[0];
483 push @cdiff, $ai, $bi;
484 }
485 push @cdiff, 0+@$a, 0+@$b
486 if $ai < @$a || $bi < @$b;
487 return wantarray ? @cdiff : \@cdiff;
488}
489
490sub diff
491{
492 my $a = shift; # array ref
493 my $b = shift; # array ref
494 my $retval = [];
495 my $hunk = [];
496 my $discard = sub {
497 push @$hunk, [ '-', $_[0], $a->[ $_[0] ] ];
498 };
499 my $add = sub {
500 push @$hunk, [ '+', $_[1], $b->[ $_[1] ] ];
501 };
502 my $match = sub {
503 push @$retval, $hunk
504 if 0 < @$hunk;
505 $hunk = []
506 };
507 traverse_sequences( $a, $b,
508 { MATCH => $match, DISCARD_A => $discard, DISCARD_B => $add }, @_ );
509 &$match();
510 return wantarray ? @$retval : $retval;
511}
512
513sub sdiff
514{
515 my $a = shift; # array ref
516 my $b = shift; # array ref
517 my $retval = [];
518 my $discard = sub { push ( @$retval, [ '-', $a->[ $_[0] ], "" ] ) };
519 my $add = sub { push ( @$retval, [ '+', "", $b->[ $_[1] ] ] ) };
520 my $change = sub {
521 push ( @$retval, [ 'c', $a->[ $_[0] ], $b->[ $_[1] ] ] );
522 };
523 my $match = sub {
524 push ( @$retval, [ 'u', $a->[ $_[0] ], $b->[ $_[1] ] ] );
525 };
526 traverse_balanced(
527 $a,
528 $b,
529 {
530 MATCH => $match,
531 DISCARD_A => $discard,
532 DISCARD_B => $add,
533 CHANGE => $change,
534 },
535 @_
536 );
537 return wantarray ? @$retval : $retval;
538}
539
540########################################
541my $Root= __PACKAGE__;
542package Algorithm::Diff::_impl;
543use strict;
544
545sub _Idx() { 0 } # $me->[_Idx]: Ref to array of hunk indices
546 # 1 # $me->[1]: Ref to first sequence
547 # 2 # $me->[2]: Ref to second sequence
548sub _End() { 3 } # $me->[_End]: Diff between forward and reverse pos
549sub _Same() { 4 } # $me->[_Same]: 1 if pos 1 contains unchanged items
550sub _Base() { 5 } # $me->[_Base]: Added to range's min and max
551sub _Pos() { 6 } # $me->[_Pos]: Which hunk is currently selected
552sub _Off() { 7 } # $me->[_Off]: Offset into _Idx for current position
553sub _Min() { -2 } # Added to _Off to get min instead of max+1
554
555sub Die
556{
557 require Carp;
558 Carp::confess( @_ );
559}
560
561sub _ChkPos
562{
563 my( $me )= @_;
564 return if $me->[_Pos];
565 my $meth= ( caller(1) )[3];
566 Die( "Called $meth on 'reset' object" );
567}
568
569sub _ChkSeq
570{
571 my( $me, $seq )= @_;
572 return $seq + $me->[_Off]
573 if 1 == $seq || 2 == $seq;
574 my $meth= ( caller(1) )[3];
575 Die( "$meth: Invalid sequence number ($seq); must be 1 or 2" );
576}
577
578sub getObjPkg
579{
580 my( $us )= @_;
581 return ref $us if ref $us;
582 return $us . "::_obj";
583}
584
585sub new
586{
587 my( $us, $seq1, $seq2, $opts ) = @_;
588 my @args;
589 for( $opts->{keyGen} ) {
590 push @args, $_ if $_;
591 }
592 for( $opts->{keyGenArgs} ) {
593 push @args, @$_ if $_;
594 }
595 my $cdif= Algorithm::Diff::compact_diff( $seq1, $seq2, @args );
596 my $same= 1;
597 if( 0 == $cdif->[2] && 0 == $cdif->[3] ) {
598 $same= 0;
599 splice @$cdif, 0, 2;
600 }
601 my @obj= ( $cdif, $seq1, $seq2 );
602 $obj[_End] = (1+@$cdif)/2;
603 $obj[_Same] = $same;
604 $obj[_Base] = 0;
605 my $me = bless \@obj, $us->getObjPkg();
606 $me->Reset( 0 );
607 return $me;
608}
609
610sub Reset
611{
612 my( $me, $pos )= @_;
613 $pos= int( $pos || 0 );
614 $pos += $me->[_End]
615 if $pos < 0;
616 $pos= 0
617 if $pos < 0 || $me->[_End] <= $pos;
618 $me->[_Pos]= $pos || !1;
619 $me->[_Off]= 2*$pos - 1;
620 return $me;
621}
622
623sub Base
624{
625 my( $me, $base )= @_;
626 my $oldBase= $me->[_Base];
627 $me->[_Base]= 0+$base if defined $base;
628 return $oldBase;
629}
630
631sub Copy
632{
633 my( $me, $pos, $base )= @_;
634 my @obj= @$me;
635 my $you= bless \@obj, ref($me);
636 $you->Reset( $pos ) if defined $pos;
637 $you->Base( $base );
638 return $you;
639}
640
641sub Next {
642 my( $me, $steps )= @_;
643 $steps= 1 if ! defined $steps;
644 if( $steps ) {
645 my $pos= $me->[_Pos];
646 my $new= $pos + $steps;
647 $new= 0 if $pos && $new < 0;
648 $me->Reset( $new )
649 }
650 return $me->[_Pos];
651}
652
653sub Prev {
654 my( $me, $steps )= @_;
655 $steps= 1 if ! defined $steps;
656 my $pos= $me->Next(-$steps);
657 $pos -= $me->[_End] if $pos;
658 return $pos;
659}
660
661sub Diff {
662 my( $me )= @_;
663 $me->_ChkPos();
664 return 0 if $me->[_Same] == ( 1 & $me->[_Pos] );
665 my $ret= 0;
666 my $off= $me->[_Off];
667 for my $seq ( 1, 2 ) {
668 $ret |= $seq
669 if $me->[_Idx][ $off + $seq + _Min ]
670 < $me->[_Idx][ $off + $seq ];
671 }
672 return $ret;
673}
674
675sub Min {
676 my( $me, $seq, $base )= @_;
677 $me->_ChkPos();
678 my $off= $me->_ChkSeq($seq);
679 $base= $me->[_Base] if !defined $base;
680 return $base + $me->[_Idx][ $off + _Min ];
681}
682
683sub Max {
684 my( $me, $seq, $base )= @_;
685 $me->_ChkPos();
686 my $off= $me->_ChkSeq($seq);
687 $base= $me->[_Base] if !defined $base;
688 return $base + $me->[_Idx][ $off ] -1;
689}
690
691sub Range {
692 my( $me, $seq, $base )= @_;
693 $me->_ChkPos();
694 my $off = $me->_ChkSeq($seq);
695 if( !wantarray ) {
696 return $me->[_Idx][ $off ]
697 - $me->[_Idx][ $off + _Min ];
698 }
699 $base= $me->[_Base] if !defined $base;
700 return ( $base + $me->[_Idx][ $off + _Min ] )
701 .. ( $base + $me->[_Idx][ $off ] - 1 );
702}
703
704sub Items {
705 my( $me, $seq )= @_;
706 $me->_ChkPos();
707 my $off = $me->_ChkSeq($seq);
708 if( !wantarray ) {
709 return $me->[_Idx][ $off ]
710 - $me->[_Idx][ $off + _Min ];
711 }
712 return
713 @{$me->[$seq]}[
714 $me->[_Idx][ $off + _Min ]
715 .. ( $me->[_Idx][ $off ] - 1 )
716 ];
717}
718
719sub Same {
720 my( $me )= @_;
721 $me->_ChkPos();
722 return wantarray ? () : 0
723 if $me->[_Same] != ( 1 & $me->[_Pos] );
724 return $me->Items(1);
725}
726
727my %getName;
728BEGIN {
729 %getName= (
730 same => \&Same,
731 diff => \&Diff,
732 base => \&Base,
733 min => \&Min,
734 max => \&Max,
735 range=> \&Range,
736 items=> \&Items, # same thing
737 );
738}
739
740sub Get
741{
742 my $me= shift @_;
743 $me->_ChkPos();
744 my @value;
745 for my $arg ( @_ ) {
746 for my $word ( split ' ', $arg ) {
747 my $meth;
748 if( $word !~ /^(-?\d+)?([a-zA-Z]+)([12])?$/
749 || not $meth= $getName{ lc $2 }
750 ) {
751 Die( $Root, ", Get: Invalid request ($word)" );
752 }
753 my( $base, $name, $seq )= ( $1, $2, $3 );
754 push @value, scalar(
755 4 == length($name)
756 ? $meth->( $me )
757 : $meth->( $me, $seq, $base )
758 );
759 }
760 }
761 if( wantarray ) {
762 return @value;
763 } elsif( 1 == @value ) {
764 return $value[0];
765 }
766 Die( 0+@value, " values requested from ",
767 $Root, "'s Get in scalar context" );
768}
769
770
771my $Obj= getObjPkg($Root);
772no strict 'refs';
773
774for my $meth ( qw( new getObjPkg ) ) {
775 *{$Root."::".$meth} = \&{$meth};
776 *{$Obj ."::".$meth} = \&{$meth};
777}
778for my $meth ( qw(
779 Next Prev Reset Copy Base Diff
780 Same Items Range Min Max Get
781 _ChkPos _ChkSeq
782) ) {
783 *{$Obj."::".$meth} = \&{$meth};
784}
785
7861;
787__END__
788
789=head1 NAME
790
791Algorithm::Diff - Compute `intelligent' differences between two files / lists
792
793=head1 SYNOPSIS
794
795 require Algorithm::Diff;
796
797 # This example produces traditional 'diff' output:
798
799 my $diff = Algorithm::Diff->new( \@seq1, \@seq2 );
800
801 $diff->Base( 1 ); # Return line numbers, not indices
802 while( $diff->Next() ) {
803 next if $diff->Same();
804 my $sep = '';
805 if( ! $diff->Items(2) ) {
806 sprintf "%d,%dd%d\n",
807 $diff->Get(qw( Min1 Max1 Max2 ));
808 } elsif( ! $diff->Items(1) ) {
809 sprint "%da%d,%d\n",
810 $diff->Get(qw( Max1 Min2 Max2 ));
811 } else {
812 $sep = "---\n";
813 sprintf "%d,%dc%d,%d\n",
814 $diff->Get(qw( Min1 Max1 Min2 Max2 ));
815 }
816 print "< $_" for $diff->Items(1);
817 print $sep;
818 print "> $_" for $diff->Items(2);
819 }
820
821
822 # Alternate interfaces:
823
824 use Algorithm::Diff qw(
825 LCS LCS_length LCSidx
826 diff sdiff compact_diff
827 traverse_sequences traverse_balanced );
828
829 @lcs = LCS( \@seq1, \@seq2 );
830 $lcsref = LCS( \@seq1, \@seq2 );
831 $count = LCS_length( \@seq1, \@seq2 );
832
833 ( $seq1idxref, $seq2idxref ) = LCSidx( \@seq1, \@seq2 );
834
835
836 # Complicated interfaces:
837
838 @diffs = diff( \@seq1, \@seq2 );
839
840 @sdiffs = sdiff( \@seq1, \@seq2 );
841
842 @cdiffs = compact_diff( \@seq1, \@seq2 );
843
844 traverse_sequences(
845 \@seq1,
846 \@seq2,
847 { MATCH => \&callback1,
848 DISCARD_A => \&callback2,
849 DISCARD_B => \&callback3,
850 },
851 \&key_generator,
852 @extra_args,
853 );
854
855 traverse_balanced(
856 \@seq1,
857 \@seq2,
858 { MATCH => \&callback1,
859 DISCARD_A => \&callback2,
860 DISCARD_B => \&callback3,
861 CHANGE => \&callback4,
862 },
863 \&key_generator,
864 @extra_args,
865 );
866
867
868=head1 INTRODUCTION
869
870(by Mark-Jason Dominus)
871
872I once read an article written by the authors of C<diff>; they said
873that they worked very hard on the algorithm until they found the
874right one.
875
876I think what they ended up using (and I hope someone will correct me,
877because I am not very confident about this) was the `longest common
878subsequence' method. In the LCS problem, you have two sequences of
879items:
880
881 a b c d f g h j q z
882
883 a b c d e f g i j k r x y z
884
885and you want to find the longest sequence of items that is present in
886both original sequences in the same order. That is, you want to find
887a new sequence I<S> which can be obtained from the first sequence by
888deleting some items, and from the secend sequence by deleting other
889items. You also want I<S> to be as long as possible. In this case I<S>
890is
891
892 a b c d f g j z
893
894From there it's only a small step to get diff-like output:
895
896 e h i k q r x y
897 + - + + - + + +
898
899This module solves the LCS problem. It also includes a canned function
900to generate C<diff>-like output.
901
902It might seem from the example above that the LCS of two sequences is
903always pretty obvious, but that's not always the case, especially when
904the two sequences have many repeated elements. For example, consider
905
906 a x b y c z p d q
907 a b c a x b y c z
908
909A naive approach might start by matching up the C<a> and C<b> that
910appear at the beginning of each sequence, like this:
911
912 a x b y c z p d q
913 a b c a b y c z
914
915This finds the common subsequence C<a b c z>. But actually, the LCS
916is C<a x b y c z>:
917
918 a x b y c z p d q
919 a b c a x b y c z
920
921or
922
923 a x b y c z p d q
924 a b c a x b y c z
925
926=head1 USAGE
927
928(See also the README file and several example
929scripts include with this module.)
930
931This module now provides an object-oriented interface that uses less
932memory and is easier to use than most of the previous procedural
933interfaces. It also still provides several exportable functions. We'll
934deal with these in ascending order of difficulty: C<LCS>,
935C<LCS_length>, C<LCSidx>, OO interface, C<prepare>, C<diff>, C<sdiff>,
936C<traverse_sequences>, and C<traverse_balanced>.
937
938=head2 C<LCS>
939
940Given references to two lists of items, LCS returns an array containing
941their longest common subsequence. In scalar context, it returns a
942reference to such a list.
943
944 @lcs = LCS( \@seq1, \@seq2 );
945 $lcsref = LCS( \@seq1, \@seq2 );
946
947C<LCS> may be passed an optional third parameter; this is a CODE
948reference to a key generation function. See L</KEY GENERATION
949FUNCTIONS>.
950
951 @lcs = LCS( \@seq1, \@seq2, \&keyGen, @args );
952 $lcsref = LCS( \@seq1, \@seq2, \&keyGen, @args );
953
954Additional parameters, if any, will be passed to the key generation
955routine.
956
957=head2 C<LCS_length>
958
959This is just like C<LCS> except it only returns the length of the
960longest common subsequence. This provides a performance gain of about
9619% compared to C<LCS>.
962
963=head2 C<LCSidx>
964
965Like C<LCS> except it returns references to two arrays. The first array
966contains the indices into @seq1 where the LCS items are located. The
967second array contains the indices into @seq2 where the LCS items are located.
968
969Therefore, the following three lists will contain the same values:
970
971 my( $idx1, $idx2 ) = LCSidx( \@seq1, \@seq2 );
972 my @list1 = @seq1[ @$idx1 ];
973 my @list2 = @seq2[ @$idx2 ];
974 my @list3 = LCS( \@seq1, \@seq2 );
975
976=head2 C<new>
977
978 $diff = Algorithm::Diffs->new( \@seq1, \@seq2 );
979 $diff = Algorithm::Diffs->new( \@seq1, \@seq2, \%opts );
980
981C<new> computes the smallest set of additions and deletions necessary
982to turn the first sequence into the second and compactly records them
983in the object.
984
985You use the object to iterate over I<hunks>, where each hunk represents
986a contiguous section of items which should be added, deleted, replaced,
987or left unchanged.
988
989=over 4
990
991The following summary of all of the methods looks a lot like Perl code
992but some of the symbols have different meanings:
993
994 [ ] Encloses optional arguments
995 : Is followed by the default value for an optional argument
996 | Separates alternate return results
997
998Method summary:
999
1000 $obj = Algorithm::Diff->new( \@seq1, \@seq2, [ \%opts ] );
1001 $pos = $obj->Next( [ $count : 1 ] );
1002 $revPos = $obj->Prev( [ $count : 1 ] );
1003 $obj = $obj->Reset( [ $pos : 0 ] );
1004 $copy = $obj->Copy( [ $pos, [ $newBase ] ] );
1005 $oldBase = $obj->Base( [ $newBase ] );
1006
1007Note that all of the following methods C<die> if used on an object that
1008is "reset" (not currently pointing at any hunk).
1009
1010 $bits = $obj->Diff( );
1011 @items|$cnt = $obj->Same( );
1012 @items|$cnt = $obj->Items( $seqNum );
1013 @idxs |$cnt = $obj->Range( $seqNum, [ $base ] );
1014 $minIdx = $obj->Min( $seqNum, [ $base ] );
1015 $maxIdx = $obj->Max( $seqNum, [ $base ] );
1016 @values = $obj->Get( @names );
1017
1018Passing in C<undef> for an optional argument is always treated the same
1019as if no argument were passed in.
1020
1021=item C<Next>
1022
1023 $pos = $diff->Next(); # Move forward 1 hunk
1024 $pos = $diff->Next( 2 ); # Move forward 2 hunks
1025 $pos = $diff->Next(-5); # Move backward 5 hunks
1026
1027C<Next> moves the object to point at the next hunk. The object starts
1028out "reset", which means it isn't pointing at any hunk. If the object
1029is reset, then C<Next()> moves to the first hunk.
1030
1031C<Next> returns a true value iff the move didn't go past the last hunk.
1032So C<Next(0)> will return true iff the object is not reset.
1033
1034Actually, C<Next> returns the object's new position, which is a number
1035between 1 and the number of hunks (inclusive), or returns a false value.
1036
1037=item C<Prev>
1038
1039C<Prev($N)> is almost identical to C<Next(-$N)>; it moves to the $Nth
1040previous hunk. On a 'reset' object, C<Prev()> [and C<Next(-1)>] move
1041to the last hunk.
1042
1043The position returned by C<Prev> is relative to the I<end> of the
1044hunks; -1 for the last hunk, -2 for the second-to-last, etc.
1045
1046=item C<Reset>
1047
1048 $diff->Reset(); # Reset the object's position
1049 $diff->Reset($pos); # Move to the specified hunk
1050 $diff->Reset(1); # Move to the first hunk
1051 $diff->Reset(-1); # Move to the last hunk
1052
1053C<Reset> returns the object, so, for example, you could use
1054C<< $diff->Reset()->Next(-1) >> to get the number of hunks.
1055
1056=item C<Copy>
1057
1058 $copy = $diff->Copy( $newPos, $newBase );
1059
1060C<Copy> returns a copy of the object. The copy and the orignal object
1061share most of their data, so making copies takes very little memory.
1062The copy maintains its own position (separate from the original), which
1063is the main purpose of copies. It also maintains its own base.
1064
1065By default, the copy's position starts out the same as the original
1066object's position. But C<Copy> takes an optional first argument to set the
1067new position, so the following three snippets are equivalent:
1068
1069 $copy = $diff->Copy($pos);
1070
1071 $copy = $diff->Copy();
1072 $copy->Reset($pos);
1073
1074 $copy = $diff->Copy()->Reset($pos);
1075
1076C<Copy> takes an optional second argument to set the base for
1077the copy. If you wish to change the base of the copy but leave
1078the position the same as in the original, here are two
1079equivalent ways:
1080
1081 $copy = $diff->Copy();
1082 $copy->Base( 0 );
1083
1084 $copy = $diff->Copy(undef,0);
1085
1086Here are two equivalent way to get a "reset" copy:
1087
1088 $copy = $diff->Copy(0);
1089
1090 $copy = $diff->Copy()->Reset();
1091
1092=item C<Diff>
1093
1094 $bits = $obj->Diff();
1095
1096C<Diff> returns a true value iff the current hunk contains items that are
1097different between the two sequences. It actually returns one of the
1098follow 4 values:
1099
1100=over 4
1101
1102=item 3
1103
1104C<3==(1|2)>. This hunk contains items from @seq1 and the items
1105from @seq2 that should replace them. Both sequence 1 and 2
1106contain changed items so both the 1 and 2 bits are set.
1107
1108=item 2
1109
1110This hunk only contains items from @seq2 that should be inserted (not
1111items from @seq1). Only sequence 2 contains changed items so only the 2
1112bit is set.
1113
1114=item 1
1115
1116This hunk only contains items from @seq1 that should be deleted (not
1117items from @seq2). Only sequence 1 contains changed items so only the 1
1118bit is set.
1119
1120=item 0
1121
1122This means that the items in this hunk are the same in both sequences.
1123Neither sequence 1 nor 2 contain changed items so neither the 1 nor the
11242 bits are set.
1125
1126=back
1127
1128=item C<Same>
1129
1130C<Same> returns a true value iff the current hunk contains items that
1131are the same in both sequences. It actually returns the list of items
1132if they are the same or an emty list if they aren't. In a scalar
1133context, it returns the size of the list.
1134
1135=item C<Items>
1136
1137 $count = $diff->Items(2);
1138 @items = $diff->Items($seqNum);
1139
1140C<Items> returns the (number of) items from the specified sequence that
1141are part of the current hunk.
1142
1143If the current hunk contains only insertions, then
1144C<< $diff->Items(1) >> will return an empty list (0 in a scalar conext).
1145If the current hunk contains only deletions, then C<< $diff->Items(2) >>
1146will return an empty list (0 in a scalar conext).
1147
1148If the hunk contains replacements, then both C<< $diff->Items(1) >> and
1149C<< $diff->Items(2) >> will return different, non-empty lists.
1150
1151Otherwise, the hunk contains identical items and all of the following
1152will return the same lists:
1153
1154 @items = $diff->Items(1);
1155 @items = $diff->Items(2);
1156 @items = $diff->Same();
1157
1158=item C<Range>
1159
1160 $count = $diff->Range( $seqNum );
1161 @indices = $diff->Range( $seqNum );
1162 @indices = $diff->Range( $seqNum, $base );
1163
1164C<Range> is like C<Items> except that it returns a list of I<indices> to
1165the items rather than the items themselves. By default, the index of
1166the first item (in each sequence) is 0 but this can be changed by
1167calling the C<Base> method. So, by default, the following two snippets
1168return the same lists:
1169
1170 @list = $diff->Items(2);
1171 @list = @seq2[ $diff->Range(2) ];
1172
1173You can also specify the base to use as the second argument. So the
1174following two snippets I<always> return the same lists:
1175
1176 @list = $diff->Items(1);
1177 @list = @seq1[ $diff->Range(1,0) ];
1178
1179=item C<Base>
1180
1181 $curBase = $diff->Base();
1182 $oldBase = $diff->Base($newBase);
1183
1184C<Base> sets and/or returns the current base (usually 0 or 1) that is
1185used when you request range information. The base defaults to 0 so
1186that range information is returned as array indices. You can set the
1187base to 1 if you want to report traditional line numbers instead.
1188
1189=item C<Min>
1190
1191 $min1 = $diff->Min(1);
1192 $min = $diff->Min( $seqNum, $base );
1193
1194C<Min> returns the first value that C<Range> would return (given the
1195same arguments) or returns C<undef> if C<Range> would return an empty
1196list.
1197
1198=item C<Max>
1199
1200C<Max> returns the last value that C<Range> would return or C<undef>.
1201
1202=item C<Get>
1203
1204 ( $n, $x, $r ) = $diff->Get(qw( min1 max1 range1 ));
1205 @values = $diff->Get(qw( 0min2 1max2 range2 same base ));
1206
1207C<Get> returns one or more scalar values. You pass in a list of the
1208names of the values you want returned. Each name must match one of the
1209following regexes:
1210
1211 /^(-?\d+)?(min|max)[12]$/i
1212 /^(range[12]|same|diff|base)$/i
1213
1214The 1 or 2 after a name says which sequence you want the information
1215for (and where allowed, it is required). The optional number before
1216"min" or "max" is the base to use. So the following equalities hold:
1217
1218 $diff->Get('min1') == $diff->Min(1)
1219 $diff->Get('0min2') == $diff->Min(2,0)
1220
1221Using C<Get> in a scalar context when you've passed in more than one
1222name is a fatal error (C<die> is called).
1223
1224=back
1225
1226=head2 C<prepare>
1227
1228Given a reference to a list of items, C<prepare> returns a reference
1229to a hash which can be used when comparing this sequence to other
1230sequences with C<LCS> or C<LCS_length>.
1231
1232 $prep = prepare( \@seq1 );
1233 for $i ( 0 .. 10_000 )
1234 {
1235 @lcs = LCS( $prep, $seq[$i] );
1236 # do something useful with @lcs
1237 }
1238
1239C<prepare> may be passed an optional third parameter; this is a CODE
1240reference to a key generation function. See L</KEY GENERATION
1241FUNCTIONS>.
1242
1243 $prep = prepare( \@seq1, \&keyGen );
1244 for $i ( 0 .. 10_000 )
1245 {
1246 @lcs = LCS( $seq[$i], $prep, \&keyGen );
1247 # do something useful with @lcs
1248 }
1249
1250Using C<prepare> provides a performance gain of about 50% when calling LCS
1251many times compared with not preparing.
1252
1253=head2 C<diff>
1254
1255 @diffs = diff( \@seq1, \@seq2 );
1256 $diffs_ref = diff( \@seq1, \@seq2 );
1257
1258C<diff> computes the smallest set of additions and deletions necessary
1259to turn the first sequence into the second, and returns a description
1260of these changes. The description is a list of I<hunks>; each hunk
1261represents a contiguous section of items which should be added,
1262deleted, or replaced. (Hunks containing unchanged items are not
1263included.)
1264
1265The return value of C<diff> is a list of hunks, or, in scalar context, a
1266reference to such a list. If there are no differences, the list will be
1267empty.
1268
1269Here is an example. Calling C<diff> for the following two sequences:
1270
1271 a b c e h j l m n p
1272 b c d e f j k l m r s t
1273
1274would produce the following list:
1275
1276 (
1277 [ [ '-', 0, 'a' ] ],
1278
1279 [ [ '+', 2, 'd' ] ],
1280
1281 [ [ '-', 4, 'h' ],
1282 [ '+', 4, 'f' ] ],
1283
1284 [ [ '+', 6, 'k' ] ],
1285
1286 [ [ '-', 8, 'n' ],
1287 [ '-', 9, 'p' ],
1288 [ '+', 9, 'r' ],
1289 [ '+', 10, 's' ],
1290 [ '+', 11, 't' ] ],
1291 )
1292
1293There are five hunks here. The first hunk says that the C<a> at
1294position 0 of the first sequence should be deleted (C<->). The second
1295hunk says that the C<d> at position 2 of the second sequence should
1296be inserted (C<+>). The third hunk says that the C<h> at position 4
1297of the first sequence should be removed and replaced with the C<f>
1298from position 4 of the second sequence. And so on.
1299
1300C<diff> may be passed an optional third parameter; this is a CODE
1301reference to a key generation function. See L</KEY GENERATION
1302FUNCTIONS>.
1303
1304Additional parameters, if any, will be passed to the key generation
1305routine.
1306
1307=head2 C<sdiff>
1308
1309 @sdiffs = sdiff( \@seq1, \@seq2 );
1310 $sdiffs_ref = sdiff( \@seq1, \@seq2 );
1311
1312C<sdiff> computes all necessary components to show two sequences
1313and their minimized differences side by side, just like the
1314Unix-utility I<sdiff> does:
1315
1316 same same
1317 before | after
1318 old < -
1319 - > new
1320
1321It returns a list of array refs, each pointing to an array of
1322display instructions. In scalar context it returns a reference
1323to such a list. If there are no differences, the list will have one
1324entry per item, each indicating that the item was unchanged.
1325
1326Display instructions consist of three elements: A modifier indicator
1327(C<+>: Element added, C<->: Element removed, C<u>: Element unmodified,
1328C<c>: Element changed) and the value of the old and new elements, to
1329be displayed side-by-side.
1330
1331An C<sdiff> of the following two sequences:
1332
1333 a b c e h j l m n p
1334 b c d e f j k l m r s t
1335
1336results in
1337
1338 ( [ '-', 'a', '' ],
1339 [ 'u', 'b', 'b' ],
1340 [ 'u', 'c', 'c' ],
1341 [ '+', '', 'd' ],
1342 [ 'u', 'e', 'e' ],
1343 [ 'c', 'h', 'f' ],
1344 [ 'u', 'j', 'j' ],
1345 [ '+', '', 'k' ],
1346 [ 'u', 'l', 'l' ],
1347 [ 'u', 'm', 'm' ],
1348 [ 'c', 'n', 'r' ],
1349 [ 'c', 'p', 's' ],
1350 [ '+', '', 't' ],
1351 )
1352
1353C<sdiff> may be passed an optional third parameter; this is a CODE
1354reference to a key generation function. See L</KEY GENERATION
1355FUNCTIONS>.
1356
1357Additional parameters, if any, will be passed to the key generation
1358routine.
1359
1360=head2 C<compact_diff>
1361
1362C<compact_diff> is much like C<sdiff> except it returns a much more
1363compact description consisting of just one flat list of indices. An
1364example helps explain the format:
1365
1366 my @a = qw( a b c e h j l m n p );
1367 my @b = qw( b c d e f j k l m r s t );
1368 @cdiff = compact_diff( \@a, \@b );
1369 # Returns:
1370 # @a @b @a @b
1371 # start start values values
1372 ( 0, 0, # =
1373 0, 0, # a !
1374 1, 0, # b c = b c
1375 3, 2, # ! d
1376 3, 3, # e = e
1377 4, 4, # f ! h
1378 5, 5, # j = j
1379 6, 6, # ! k
1380 6, 7, # l m = l m
1381 8, 9, # n p ! r s t
1382 10, 12, #
1383 );
1384
1385The 0th, 2nd, 4th, etc. entries are all indices into @seq1 (@a in the
1386above example) indicating where a hunk begins. The 1st, 3rd, 5th, etc.
1387entries are all indices into @seq2 (@b in the above example) indicating
1388where the same hunk begins.
1389
1390So each pair of indices (except the last pair) describes where a hunk
1391begins (in each sequence). Since each hunk must end at the item just
1392before the item that starts the next hunk, the next pair of indices can
1393be used to determine where the hunk ends.
1394
1395So, the first 4 entries (0..3) describe the first hunk. Entries 0 and 1
1396describe where the first hunk begins (and so are always both 0).
1397Entries 2 and 3 describe where the next hunk begins, so subtracting 1
1398from each tells us where the first hunk ends. That is, the first hunk
1399contains items C<$diff[0]> through C<$diff[2] - 1> of the first sequence
1400and contains items C<$diff[1]> through C<$diff[3] - 1> of the second
1401sequence.
1402
1403In other words, the first hunk consists of the following two lists of items:
1404
1405 # 1st pair 2nd pair
1406 # of indices of indices
1407 @list1 = @a[ $cdiff[0] .. $cdiff[2]-1 ];
1408 @list2 = @b[ $cdiff[1] .. $cdiff[3]-1 ];
1409 # Hunk start Hunk end
1410
1411Note that the hunks will always alternate between those that are part of
1412the LCS (those that contain unchanged items) and those that contain
1413changes. This means that all we need to be told is whether the first
1414hunk is a 'same' or 'diff' hunk and we can determine which of the other
1415hunks contain 'same' items or 'diff' items.
1416
1417By convention, we always make the first hunk contain unchanged items.
1418So the 1st, 3rd, 5th, etc. hunks (all odd-numbered hunks if you start
1419counting from 1) all contain unchanged items. And the 2nd, 4th, 6th,
1420etc. hunks (all even-numbered hunks if you start counting from 1) all
1421contain changed items.
1422
1423Since @a and @b don't begin with the same value, the first hunk in our
1424example is empty (otherwise we'd violate the above convention). Note
1425that the first 4 index values in our example are all zero. Plug these
1426values into our previous code block and we get:
1427
1428 @hunk1a = @a[ 0 .. 0-1 ];
1429 @hunk1b = @b[ 0 .. 0-1 ];
1430
1431And C<0..-1> returns the empty list.
1432
1433Move down one pair of indices (2..5) and we get the offset ranges for
1434the second hunk, which contains changed items.
1435
1436Since C<@diff[2..5]> contains (0,0,1,0) in our example, the second hunk
1437consists of these two lists of items:
1438
1439 @hunk2a = @a[ $cdiff[2] .. $cdiff[4]-1 ];
1440 @hunk2b = @b[ $cdiff[3] .. $cdiff[5]-1 ];
1441 # or
1442 @hunk2a = @a[ 0 .. 1-1 ];
1443 @hunk2b = @b[ 0 .. 0-1 ];
1444 # or
1445 @hunk2a = @a[ 0 .. 0 ];
1446 @hunk2b = @b[ 0 .. -1 ];
1447 # or
1448 @hunk2a = ( 'a' );
1449 @hunk2b = ( );
1450
1451That is, we would delete item 0 ('a') from @a.
1452
1453Since C<@diff[4..7]> contains (1,0,3,2) in our example, the third hunk
1454consists of these two lists of items:
1455
1456 @hunk3a = @a[ $cdiff[4] .. $cdiff[6]-1 ];
1457 @hunk3a = @b[ $cdiff[5] .. $cdiff[7]-1 ];
1458 # or
1459 @hunk3a = @a[ 1 .. 3-1 ];
1460 @hunk3a = @b[ 0 .. 2-1 ];
1461 # or
1462 @hunk3a = @a[ 1 .. 2 ];
1463 @hunk3a = @b[ 0 .. 1 ];
1464 # or
1465 @hunk3a = qw( b c );
1466 @hunk3a = qw( b c );
1467
1468Note that this third hunk contains unchanged items as our convention demands.
1469
1470You can continue this process until you reach the last two indices,
1471which will always be the number of items in each sequence. This is
1472required so that subtracting one from each will give you the indices to
1473the last items in each sequence.
1474
1475=head2 C<traverse_sequences>
1476
1477C<traverse_sequences> used to be the most general facility provided by
1478this module (the new OO interface is more powerful and much easier to
1479use).
1480
1481Imagine that there are two arrows. Arrow A points to an element of
1482sequence A, and arrow B points to an element of the sequence B.
1483Initially, the arrows point to the first elements of the respective
1484sequences. C<traverse_sequences> will advance the arrows through the
1485sequences one element at a time, calling an appropriate user-specified
1486callback function before each advance. It willadvance the arrows in
1487such a way that if there are equal elements C<$A[$i]> and C<$B[$j]>
1488which are equal and which are part of the LCS, there will be some moment
1489during the execution of C<traverse_sequences> when arrow A is pointing
1490to C<$A[$i]> and arrow B is pointing to C<$B[$j]>. When this happens,
1491C<traverse_sequences> will call the C<MATCH> callback function and then
1492it will advance both arrows.
1493
1494Otherwise, one of the arrows is pointing to an element of its sequence
1495that is not part of the LCS. C<traverse_sequences> will advance that
1496arrow and will call the C<DISCARD_A> or the C<DISCARD_B> callback,
1497depending on which arrow it advanced. If both arrows point to elements
1498that are not part of the LCS, then C<traverse_sequences> will advance
1499one of them and call the appropriate callback, but it is not specified
1500which it will call.
1501
1502The arguments to C<traverse_sequences> are the two sequences to
1503traverse, and a hash which specifies the callback functions, like this:
1504
1505 traverse_sequences(
1506 \@seq1, \@seq2,
1507 { MATCH => $callback_1,
1508 DISCARD_A => $callback_2,
1509 DISCARD_B => $callback_3,
1510 }
1511 );
1512
1513Callbacks for MATCH, DISCARD_A, and DISCARD_B are invoked with at least
1514the indices of the two arrows as their arguments. They are not expected
1515to return any values. If a callback is omitted from the table, it is
1516not called.
1517
1518Callbacks for A_FINISHED and B_FINISHED are invoked with at least the
1519corresponding index in A or B.
1520
1521If arrow A reaches the end of its sequence, before arrow B does,
1522C<traverse_sequences> will call the C<A_FINISHED> callback when it
1523advances arrow B, if there is such a function; if not it will call
1524C<DISCARD_B> instead. Similarly if arrow B finishes first.
1525C<traverse_sequences> returns when both arrows are at the ends of their
1526respective sequences. It returns true on success and false on failure.
1527At present there is no way to fail.
1528
1529C<traverse_sequences> may be passed an optional fourth parameter; this
1530is a CODE reference to a key generation function. See L</KEY GENERATION
1531FUNCTIONS>.
1532
1533Additional parameters, if any, will be passed to the key generation function.
1534
1535If you want to pass additional parameters to your callbacks, but don't
1536need a custom key generation function, you can get the default by
1537passing undef:
1538
1539 traverse_sequences(
1540 \@seq1, \@seq2,
1541 { MATCH => $callback_1,
1542 DISCARD_A => $callback_2,
1543 DISCARD_B => $callback_3,
1544 },
1545 undef, # default key-gen
1546 $myArgument1,
1547 $myArgument2,
1548 $myArgument3,
1549 );
1550
1551C<traverse_sequences> does not have a useful return value; you are
1552expected to plug in the appropriate behavior with the callback
1553functions.
1554
1555=head2 C<traverse_balanced>
1556
1557C<traverse_balanced> is an alternative to C<traverse_sequences>. It
1558uses a different algorithm to iterate through the entries in the
1559computed LCS. Instead of sticking to one side and showing element changes
1560as insertions and deletions only, it will jump back and forth between
1561the two sequences and report I<changes> occurring as deletions on one
1562side followed immediatly by an insertion on the other side.
1563
1564In addition to the C<DISCARD_A>, C<DISCARD_B>, and C<MATCH> callbacks
1565supported by C<traverse_sequences>, C<traverse_balanced> supports
1566a C<CHANGE> callback indicating that one element got C<replaced> by another:
1567
1568 traverse_balanced(
1569 \@seq1, \@seq2,
1570 { MATCH => $callback_1,
1571 DISCARD_A => $callback_2,
1572 DISCARD_B => $callback_3,
1573 CHANGE => $callback_4,
1574 }
1575 );
1576
1577If no C<CHANGE> callback is specified, C<traverse_balanced>
1578will map C<CHANGE> events to C<DISCARD_A> and C<DISCARD_B> actions,
1579therefore resulting in a similar behaviour as C<traverse_sequences>
1580with different order of events.
1581
1582C<traverse_balanced> might be a bit slower than C<traverse_sequences>,
1583noticable only while processing huge amounts of data.
1584
1585The C<sdiff> function of this module
1586is implemented as call to C<traverse_balanced>.
1587
1588C<traverse_balanced> does not have a useful return value; you are expected to
1589plug in the appropriate behavior with the callback functions.
1590
1591=head1 KEY GENERATION FUNCTIONS
1592
1593Most of the functions accept an optional extra parameter. This is a
1594CODE reference to a key generating (hashing) function that should return
1595a string that uniquely identifies a given element. It should be the
1596case that if two elements are to be considered equal, their keys should
1597be the same (and the other way around). If no key generation function
1598is provided, the key will be the element as a string.
1599
1600By default, comparisons will use "eq" and elements will be turned into keys
1601using the default stringizing operator '""'.
1602
1603Where this is important is when you're comparing something other than
1604strings. If it is the case that you have multiple different objects
1605that should be considered to be equal, you should supply a key
1606generation function. Otherwise, you have to make sure that your arrays
1607contain unique references.
1608
1609For instance, consider this example:
1610
1611 package Person;
1612
1613 sub new
1614 {
1615 my $package = shift;
1616 return bless { name => '', ssn => '', @_ }, $package;
1617 }
1618
1619 sub clone
1620 {
1621 my $old = shift;
1622 my $new = bless { %$old }, ref($old);
1623 }
1624
1625 sub hash
1626 {
1627 return shift()->{'ssn'};
1628 }
1629
1630 my $person1 = Person->new( name => 'Joe', ssn => '123-45-6789' );
1631 my $person2 = Person->new( name => 'Mary', ssn => '123-47-0000' );
1632 my $person3 = Person->new( name => 'Pete', ssn => '999-45-2222' );
1633 my $person4 = Person->new( name => 'Peggy', ssn => '123-45-9999' );
1634 my $person5 = Person->new( name => 'Frank', ssn => '000-45-9999' );
1635
1636If you did this:
1637
1638 my $array1 = [ $person1, $person2, $person4 ];
1639 my $array2 = [ $person1, $person3, $person4, $person5 ];
1640 Algorithm::Diff::diff( $array1, $array2 );
1641
1642everything would work out OK (each of the objects would be converted
1643into a string like "Person=HASH(0x82425b0)" for comparison).
1644
1645But if you did this:
1646
1647 my $array1 = [ $person1, $person2, $person4 ];
1648 my $array2 = [ $person1, $person3, $person4->clone(), $person5 ];
1649 Algorithm::Diff::diff( $array1, $array2 );
1650
1651$person4 and $person4->clone() (which have the same name and SSN)
1652would be seen as different objects. If you wanted them to be considered
1653equivalent, you would have to pass in a key generation function:
1654
1655 my $array1 = [ $person1, $person2, $person4 ];
1656 my $array2 = [ $person1, $person3, $person4->clone(), $person5 ];
1657 Algorithm::Diff::diff( $array1, $array2, \&Person::hash );
1658
1659This would use the 'ssn' field in each Person as a comparison key, and
1660so would consider $person4 and $person4->clone() as equal.
1661
1662You may also pass additional parameters to the key generation function
1663if you wish.
1664
1665=head1 ERROR CHECKING
1666
1667If you pass these routines a non-reference and they expect a reference,
1668they will die with a message.
1669
1670=head1 AUTHOR
1671
1672This version released by Tye McQueen (http://perlmonks.org/?node=tye).
1673
1674=head1 LICENSE
1675
1676Parts Copyright (c) 2000-2004 Ned Konz. All rights reserved.
1677Parts by Tye McQueen.
1678
1679This program is free software; you can redistribute it and/or modify it
1680under the same terms as Perl.
1681
1682=head1 MAILING LIST
1683
1684Mark-Jason still maintains a mailing list. To join a low-volume mailing
1685list for announcements related to diff and Algorithm::Diff, send an
1686empty mail message to mjd-perl-diff-request@plover.com.
1687
1688=head1 CREDITS
1689
1690Versions through 0.59 (and much of this documentation) were written by:
1691
1692Mark-Jason Dominus, mjd-perl-diff@plover.com
1693
1694This version borrows some documentation and routine names from
1695Mark-Jason's, but Diff.pm's code was completely replaced.
1696
1697This code was adapted from the Smalltalk code of Mario Wolczko
1698<mario@wolczko.com>, which is available at
1699ftp://st.cs.uiuc.edu/pub/Smalltalk/MANCHESTER/manchester/4.0/diff.st
1700
1701C<sdiff> and C<traverse_balanced> were written by Mike Schilli
1702<m@perlmeister.com>.
1703
1704The algorithm is that described in
1705I<A Fast Algorithm for Computing Longest Common Subsequences>,
1706CACM, vol.20, no.5, pp.350-353, May 1977, with a few
1707minor improvements to improve the speed.
1708
1709Much work was done by Ned Konz (perl@bike-nomad.com).
1710
1711The OO interface and some other changes are by Tye McQueen.
1712
1713=cut