##-*- Mode: CPerl -*- ##====================================================================== ## Header Administrivia ##====================================================================== our $VERSION = '0.10'; pp_setversion($VERSION); ##------------------------------------------------------ ## pm additions pp_addpm({At=>'Top'},<<'EOPM'); use strict; use version; ## $PDL_ATLEAST_2_014 : avoid in-place reshape() in _edit_pdl() for PDL >= 2.014 ## + prior to PDL-2.014, PDL::reshape() returned a new PDL, but modifies ## the calling object in-place for v2.014 our $PDL_ATLEAST_2_014 = version->parse($PDL::VERSION) >= version->parse("2.014"); =pod =head1 NAME PDL::EditDistance - Wagner-Fischer edit distance and alignment for PDLs. =head1 SYNOPSIS use PDL; use PDL::EditDistance; ##-- input PDLs $a = pdl([map { ord($_) } qw(G U M B O)]); $b = pdl([map { ord($_) } qw(G A M B O L)]); $a1 = pdl([0, map { ord($_) } qw(G U M B O)]); $b1 = pdl([0, map { ord($_) } qw(G A M B O L)]); ##------------------------------------------------------------- ## Levenshtein distance $dist = edit_distance_static($a,$b, 0,1,1,1); ($dist,$align) = edit_align_static($a,$b, 0,1,1,1); ##------------------------------------------------------------- ## Wagner-Fischer distance @costs = ($costMatch=0,$costInsert=1,$costDelete=1,$costSubstitute=2); $dist = edit_distance_static($a,$b, @costs); ($dist,$align) = edit_align_static($a,$b, @costs); ##------------------------------------------------------------- ## General edit distance $costsMatch = random($a->nelem+1, $b->nelem+1); $costsIns = random($a->nelem+1, $b->nelem+1); $costsDel = random($a->nelem+1, $b->nelem+1); $costsSubst = random($a->nelem+1, $b->nelem+1); @costs = ($costsMatch,$costsIns,$costDel,$costsSubst); $dist = edit_distance_full($a,$b,@costs); ($dist,$align) = edit_align_full($a,$b,@costs); ##------------------------------------------------------------- ## Alignment $op_match = align_op_match(); ##-- constant $op_del = align_op_insert1(); ##-- constant $op_ins = align_op_insert2(); ##-- constant $op_subst = align_op_substitute(); ##-- constant ($apath,$bpath,$pathlen) = edit_bestpath($align); ($ai,$bi,$ops,$pathlen) = edit_pathtrace($align); ##------------------------------------------------------------- ## Longest Common Subsequence $lcs = edit_lcs($a,$b); ($ai,$bi,$lcslen) = lcs_backtrace($a,$b,$lcs); =cut EOPM ## /pm additions ##------------------------------------------------------ ##------------------------------------------------------ ## Exports: None pp_export_nothing(); ##------------------------------------------------------ ## Includes / defines #pp_addhdr(<<'EOH'); #EOH ##====================================================================== ## C Utilities ##====================================================================== pp_addhdr(<<'EOH'); #define ALIGN_OP_MATCH 0 #define ALIGN_OP_INSERT1 1 #define ALIGN_OP_INSERT2 2 #define ALIGN_OP_SUBSTITUTE 3 EOH ##====================================================================== ## PDL::PP Wrappers ##====================================================================== ##====================================================================== ## Basic Utilities #pp_addpm(<<'EOPM'); #=pod # #=head1 Basic Utilities # #=cut #EOPM ##====================================================================== ## Convenience Methods ##------------------------------------------------------ ## _edit_pdl($a()) : ensures pdl-ness #pp_add_exported('','_edit_pdl'); pp_addpm(<<'EOPM'); =pod =head2 _edit_pdl =for sig Signature: (a(N); [o]apdl(N+1)) Convenience method. Returns a pdl $apdl() suitable for representing $a(), which can be specified as a UTF-8 or byte-string, as an arrays of numbers, or as a PDL. $apdl(0) is always set to zero. =cut sub _edit_pdl { if (UNIVERSAL::isa($_[0],'PDL')) { return ($PDL_ATLEAST_2_014 ? $_[0]->pdl : $_[0])->flat->reshape($_[0]->nelem+1)->rotate(1); } #return pdl(byte,[0, map { ord($_) } split(//,$_[0])]) if (!ref($_[0]) && !utf8::is_utf8($_[0])); ##-- byte-string (old) elsif (!ref($_[0])) { return pdl(long,[0, unpack('C0C*',$_[0])]) if (utf8::is_utf8($_[0])); ##-- utf8-string return pdl(byte,[0, unpack('U0C*',$_[0])]); ##-- byte-string } return pdl([0,@{$_[0]}]); } EOPM ##------------------------------------------------------ ## edit_cost_matrices() : generate cost-matrices (convenience) pp_add_exported('','edit_costs'); pp_addpm(<<'EOPM'); =pod =head2 edit_costs =for sig Signature: (PDL::Type type; int N; int M; [o]costsMatch(N+1,M+1); [o]costsIns(N+1,M+1); [o]costsDel(N+1,M+1); [o]costsSubst(N+1,M+1)) Convenience method. Ensures existence and proper dimensionality of cost matrices for inputs of length N and M. =cut sub edit_costs { return _edit_costs($_[0],$_[1]+1,$_[2]+1,@_[3..$#_]); } EOPM ##------------------------------------------------------ ## _edit_costs() : generate cost-matrices (low-level, convenience) pp_add_exported('','_edit_costs'); pp_addpm(<<'EOPM'); =pod =head2 _edit_costs =for sig Signature: (PDL::Type type; int N1; int M1; [o]costsMatch(N1,M1); [o]costsIns(N1,M1); [o]costsDel(N1,M1); [o]costsSubst(N1,M1)) Low-level method. Ensures existence and proper dimensionality of cost matrices for inputs of length N1-1 and M1-1. =cut sub _edit_costs { #my ($type,$n1,$m1,$costsMatch,$costsIns,$costsDel,$costsSubst) = @_; return (_edit_matrix(@_[0..2],$_[3]), _edit_matrix(@_[0..2],$_[4]), _edit_matrix(@_[0..2],$_[5]), _edit_matrix(@_[0..2],$_[6])); } ##-- $matrix = _edit_matrix($type,$dim0,$dim1,$mat) sub _edit_matrix { return zeroes(@_[0..2]) if (!defined($_[3])); $_[3]->reshape(@_[1,2]) if ($_[3]->ndims != 2 || $_[3]->dim(0) != $_[1] || $_[3]->dim(1) != $_[2]); return $_[3]->type == $_[0] ? $_[3] : $_[3]->convert($_[0]); } EOPM ##------------------------------------------------------ ## edit_cost_static() : generate static cost-matrices (convenience) pp_add_exported('','edit_costs_static'); pp_addpm(<<'EOPM'); =pod =head2 edit_costs_static =for sig Signature: (PDL::Type type; int N; int M; staticCostMatch(); staticCostIns(); staticCostSubst(); [o]costsMatch(N+1,M+1); [o]costsIns(N+1,M+1); [o]costsDel(N+1,M+1); [o]costsSubst(N+1,M+1)) Convenience method. =cut sub edit_costs_static { #my ($type,$n,$m, $cMatch,$cIns,$cDel,$cSubst, $costsMatch,$costsIns,$costsDel,$costsSubst) = @_; my @costs = edit_costs(@_[0..2],@_[7..$#_]); $costs[$_] .= $_[$_+3] foreach (0..3); return @costs; } EOPM ##====================================================================== ## Distance: Full ##------------------------------------------------------ ## edit_distance_matrix_full() : full distance matrix pp_add_exported('','edit_distance_full'); pp_addpm(<<'EOPM'); =pod =head2 edit_distance_full =for sig Signature: (a(N); b(M); costsMatch(N+1,M+1); costsIns(N+1,M+1); costsDel(N+1,M+1); costsSubst(N+1,M+1); [o]dist(N+1,M+1); [o]align(N+1,M+1)) Convenience method. Compute the edit distance matrix for inputs $a() and $b(), and cost matrices $costsMatch(), $costsIns(), $costsDel(), and $costsSubst(). $a() and $b() may be specified as PDLs, arrays of numbers, or as strings. =cut sub edit_distance_full { return _edit_distance_full(_edit_pdl($_[0]), _edit_pdl($_[1]), @_[2..$#_]); } EOPM ##------------------------------------------------------ ## _edit_distance_full : get distance matrix from full cost matrices pp_add_exported('','_edit_distance_full'); pp_def('_edit_distance_full', Pars => ('a1(N1); b1(M1);' .' costsMatch(N1,M1); costsIns(N1,M1); costsDel(N1,M1); costsSubst(N1,M1);' .' [o]dist(N1,M1);'), Code => (' int i,j; // //-- initialize distance matrix: insertion costs $dist (N1=>0,M1=>0) = $costsMatch(N1=>0,M1=>0); //-- BOS always matches for (i=1; i < $SIZE(N1); i++) { $dist(N1=>i,M1=>0) = $dist(N1=>i-1,M1=>0) + $costsDel(N1=>i,M1=>0); //-- delete (insert1) } for (j=1; j < $SIZE(M1); j++) { $dist(N1=>0,M1=>j) = $dist(N1=>0,M1=>j-1) + $costsIns(N1=>0,M1=>j); //-- insert (insert2) } // //-- compute distance for (i=1; i < $SIZE(N1); i++) { for (j=1; j < $SIZE(M1); j++) { $GENERIC(dist) cost_insert_1 = $dist(N1=>i-1,M1=>j ) + $costsDel(N1=>i,M1=>j); $GENERIC(dist) cost_insert_2 = $dist(N1=>i, M1=>j-1) + $costsIns(N1=>i,M1=>j); $GENERIC(dist) cost_subst = $dist(N1=>i-1,M1=>j-1); if ($a1(N1=>i)==$b1(M1=>j)) { cost_subst += $costsMatch(N1=>i,M1=>j); } else { cost_subst += $costsSubst(N1=>i,M1=>j); } // if (cost_insert_1 < cost_insert_2) { if (cost_insert_1 < cost_subst) { $dist( N1=>i,M1=>j) = cost_insert_1; } else { $dist(N1=>i,M1=>j) = cost_subst; } } else if (cost_insert_2 < cost_subst) { $dist( N1=>i,M1=>j) = cost_insert_2; } else { $dist( N1=>i,M1=>j) = cost_subst; } } } '), Doc => q( Low-level method. Compute the edit distance matrix for input PDLs $a1() and $b1() and cost matrices $costsMatch(), $costsIns(), $costsDel(), and $costsSubst(). The first elements of $a1() and $b1() are ignored. ), ); ##====================================================================== ## Distance + Alignment: Full ##------------------------------------------------------ ## edit_align_full() : full distance & alignment matrix pp_add_exported('','edit_align_full'); pp_addpm(<<'EOPM'); =pod =head2 edit_align_full =for sig Signature: (a(N); b(M); costsMatch(N+1,M+1); costsIns(N+1,M+1); costsDel(N+1,N+1); costsSubst(N+1,M+1); [o]dist(N+1,M+1); [o]align(N+1,M+1)) Convenience method. Compute the edit distance and alignment matrices for inputs $a() and $b(), and cost matrices $costsMatch(), $costsIns(), $costsDel(), and $costsSubst(). $a() and $b() may be specified as PDLs, arrays of numbers, or as strings. =cut sub edit_align_full { return _edit_align_full(_edit_pdl($_[0]), _edit_pdl($_[1]), @_[2..$#_]); } EOPM ##------------------------------------------------------ ## _edit_align_full : get distance & alignment matrices given full cost matrices pp_add_exported('','_edit_align_full'); pp_def('_edit_align_full', Pars => ('a1(N1); b1(M1);' .' costsMatch(N1,M1); costsIns(N1,M1); costsDel(N1,M1); costsSubst(N1,M1);' .' [o]dist(N1,M1); byte [o]align(N1,M1);'), Code => (' int i,j; // //-- initialize distance matrix: insertion costs $dist (N1=>0,M1=>0) = $costsMatch(N1=>0,M1=>0); //-- BOS always matches $align(N1=>0,M1=>0) = ALIGN_OP_MATCH; //-- ... marked as "substitute" for (i=1; i < $SIZE(N1); i++) { $dist (N1=>i,M1=>0) = $dist(N1=>i-1,M1=>0) + $costsDel(N1=>i,M1=>0); $align(N1=>i,M1=>0) = ALIGN_OP_INSERT1; } for (j=1; j < $SIZE(M1); j++) { $dist (N1=>0,M1=>j) = $dist(N1=>0,M1=>j-1) + $costsIns(N1=>0,M1=>j); $align(N1=>0,M1=>j) = ALIGN_OP_INSERT2; } // //-- compute distance for (i=1; i < $SIZE(N1); i++) { for (j=1; j < $SIZE(M1); j++) { $GENERIC(dist) cost_insert_1 = $dist(N1=>i-1,M1=>j ) + $costsDel(N1=>i,M1=>j); $GENERIC(dist) cost_insert_2 = $dist(N1=>i, M1=>j-1) + $costsIns(N1=>i,M1=>j); $GENERIC(dist) cost_subst = $dist(N1=>i-1,M1=>j-1); char subst_op; // if ($a1(N1=>i)==$b1(M1=>j)) { cost_subst += $costsMatch(N1=>i,M1=>j); subst_op = ALIGN_OP_MATCH; } else { cost_subst += $costsSubst(N1=>i,M1=>j); subst_op = ALIGN_OP_SUBSTITUTE; } // if (cost_insert_1 < cost_insert_2) { if (cost_insert_1 < cost_subst) { $dist( N1=>i,M1=>j) = cost_insert_1; $align(N1=>i,M1=>j) = ALIGN_OP_INSERT1; } else { $dist(N1=>i,M1=>j) = cost_subst; $align(N1=>i,M1=>j) = subst_op; } } else if (cost_insert_2 < cost_subst) { $dist( N1=>i,M1=>j) = cost_insert_2; $align(N1=>i,M1=>j) = ALIGN_OP_INSERT2; } else { $dist( N1=>i,M1=>j) = cost_subst; $align(N1=>i,M1=>j) = subst_op; } } } '), Doc => q( Low-level method. Compute the edit distance and alignment matrix for input PDLs $a1() and $b1() and cost matrices $costsMatch(), $costsIns(), $costsDel(), and $costsSubst(). The first elements of $a1() and $b1() are ignored. ), ); ##====================================================================== ## Distance: Static ##------------------------------------------------------ ## edit_distance_static() : distance matrix using static cost schema pp_add_exported('','edit_distance_static'); pp_addpm(<<'EOPM'); =pod =head2 edit_distance_static =for sig Signature: (a(N); b(M); staticCostMatch(); staticCostIns(); staticCostDel(); staticCostSubst(); [o]dist(N+1,M+1)) Convenience method. Compute the edit distance matrix for inputs $a() and $b() given a static cost schema @costs = ($staticCostMatch(), $staticCostIns(), $staticCostDel(), and $staticCostSubst()). $a() and $b() may be specified as PDLs, arrays of numbers, or as strings. Functionally equivalent to edit_distance_full($matches,@costs,$dist), but slightly faster. =cut sub edit_distance_static { return _edit_distance_static(_edit_pdl($_[0]), _edit_pdl($_[1]), @_[2..$#_]); } EOPM ##------------------------------------------------------ ## _edit_distance_static : get distance matrix from static cost schema pp_add_exported('','_edit_distance_static'); pp_def('_edit_distance_static', Pars => ('a1(N1); b1(M1); costMatch(); costIns(); costDel(); costSubst();' .' [o]dist(N1,M1);'), Code => (' int i,j; // //-- initialize distance matrix: insertion costs $dist(N1=>0,M1=>0) = $costMatch(); //-- BOS always matches for (i=1; i < $SIZE(N1); i++) { $dist(N1=>i,M1=>0) = $dist(N1=>i-1,M1=>0) + $costDel(); //-- delete (insert1) } for (j=1; j < $SIZE(M1); j++) { $dist(N1=>0,M1=>j) = $dist(N1=>0,M1=>j-1) + $costIns(); //-- insert (insert2) } // //-- compute distance for (i=1; i < $SIZE(N1); i++) { for (j=1; j < $SIZE(M1); j++) { $GENERIC(dist) cost_insert_1 = $dist(N1=>i-1,M1=>j ) + $costDel(); $GENERIC(dist) cost_insert_2 = $dist(N1=>i, M1=>j-1) + $costIns(); $GENERIC(dist) cost_subst = $dist(N1=>i-1,M1=>j-1) + ($a1(N1=>i)==$b1(M1=>j) ? $costMatch() : $costSubst()); if (cost_insert_1 < cost_insert_2) { if (cost_insert_1 < cost_subst) { $dist(N1=>i,M1=>j) = cost_insert_1; } else { $dist(N1=>i,M1=>j) = cost_subst; } } else if (cost_insert_2 < cost_subst) { $dist(N1=>i,M1=>j) = cost_insert_2; } else { $dist(N1=>i,M1=>j) = cost_subst; } } } '), Doc => q( Low-level method. Compute the edit distance matrix for input PDLs $a1() and $b1() given a static cost schema @costs = ($costMatch(), $costIns(), $costDel(), $costSubst()). Functionally identitical to _edit_distance_matrix_full($matches,@costs,$dist), but slightly faster. The first elements of $a1() and $b1() are ignored. ), ); ##====================================================================== ## Distance + Alignment: Static ##------------------------------------------------------ ## edit_align_static() : distance + alignment matrices using static cost schema pp_add_exported('','edit_align_static'); pp_addpm(<<'EOPM'); =pod =head2 edit_align_static =for sig Signature: (a(N); b(M); staticCostMatch(); staticCostIns(); staticCostDel(); staticCostSubst(); [o]dist(N+1,M+1); [o]align(N+1,M+1)) Convenience method. Compute the edit distance and alignment matrices for inputs $a() and $b() given a static cost schema @costs = ($staticCostMatch(), $staticCostIns(), $staticCostDel(), and $staticCostSubst()). $a() and $b() may be specified as PDLs, arrays of numbers, or as strings. Functionally equivalent to edit_align_full($matches,@costs,$dist), but slightly faster. =cut sub edit_align_static { return _edit_align_static(_edit_pdl($_[0]), _edit_pdl($_[1]), @_[2..$#_]); } EOPM ##------------------------------------------------------ ## _edit_align_static : get distance & alignment matrices from static cost schema pp_add_exported('','_edit_align_static'); pp_def('_edit_align_static', Pars => ('a1(N1); b1(M1); costMatch(); costIns(); costDel(); costSubst();' .' [o]dist(N1,M1); byte [o]align(N1,M1)'), Code => (' int i,j; // //-- initialize distance matrix: insertion costs $dist( N1=>0,M1=>0) = $costMatch(); //-- BOS always matches $align(N1=>0,M1=>0) = ALIGN_OP_MATCH; //-- ... and is marked as "match" for (i=1; i < $SIZE(N1); i++) { $dist (N1=>i,M1=>0) = $dist(N1=>i-1,M1=>0) + $costDel(); //-- delete (insert1) $align(N1=>i,M1=>0) = ALIGN_OP_INSERT1; } for (j=1; j < $SIZE(M1); j++) { $dist (N1=>0,M1=>j) = $dist(N1=>0,M1=>j-1) + $costIns(); //-- insert (insert2) $align(N1=>0,M1=>j) = ALIGN_OP_INSERT2; } // //-- compute distance for (i=1; i < $SIZE(N1); i++) { for (j=1; j < $SIZE(M1); j++) { $GENERIC(dist) cost_insert_1 = $dist(N1=>i-1,M1=>j ) + $costDel(); $GENERIC(dist) cost_insert_2 = $dist(N1=>i, M1=>j-1) + $costIns(); $GENERIC(dist) cost_subst = $dist(N1=>i-1,M1=>j-1); char subst_op; // if ($a1(N1=>i)==$b1(M1=>j)) { cost_subst += $costMatch(); subst_op = ALIGN_OP_MATCH; } else { cost_subst += $costSubst(); subst_op = ALIGN_OP_SUBSTITUTE; } // if (cost_insert_1 < cost_insert_2) { if (cost_insert_1 < cost_subst) { $dist(N1=>i,M1=>j) = cost_insert_1; $align(N1=>i,M1=>j) = ALIGN_OP_INSERT1; } else { $dist(N1=>i,M1=>j) = cost_subst; $align(N1=>i,M1=>j) = subst_op; } } else if (cost_insert_2 < cost_subst) { $dist(N1=>i,M1=>j) = cost_insert_2; $align(N1=>i,M1=>j) = ALIGN_OP_INSERT2; } else { $dist(N1=>i,M1=>j) = cost_subst; $align(N1=>i,M1=>j) = subst_op; } } } '), Doc => q( Low-level method. Compute the edit distance and alignment matrices for input PDLs $a1() and $b1() given a static cost schema @costs = ($costMatch(), $costIns(), $costDel(), $costSubst()). Functionally identitical to _edit_distance_matrix_full($matches,@costs,$dist), but slightly faster. The first elements of $a1() and $b1() are ignored. ), ); ##============================================================== ## Alignment ##------------------------------------------------------ ## Alignment: Constants pp_add_exported('','align_op_insert1'); pp_def('align_op_insert1', Pars=>'[o]a()', Code=>'$a() = ALIGN_OP_INSERT1;', Doc => 'Alignment matrix value constant for insertion operations on $a() string.', ); pp_add_exported('','align_op_insert2'); pp_def('align_op_insert2', Pars=>'[o]a()', Code=>'$a() = ALIGN_OP_INSERT2;', Doc => 'Alignment matrix value constant for insertion operations on $a() string.', ); pp_add_exported('','align_op_match'); pp_def('align_op_match', Pars=>'[o]a()', Code=>'$a() = ALIGN_OP_MATCH;', Doc => 'Alignment matrix value constant for matches.', ); pp_add_exported('','align_op_substitute'); pp_def('align_op_substitute', Pars=>'[o]a()', Code=>'$a() = ALIGN_OP_SUBSTITUTE;', Doc => 'Alignment matrix value constant for substitution operations.', ); pp_add_exported('','align_op_insert'); pp_add_exported('','align_op_delete'); pp_addpm(<<'EOPM'); =pod =head2 align_op_delete Alias for align_op_insert1() =head2 align_op_insert Alias for align_op_insert2() =cut *align_op_delete = \&align_op_insert1; *align_op_insert = \&align_op_insert2; EOPM pp_add_exported('','align_ops'); pp_addpm(<<'EOPM'); =pod =head2 align_ops =for sig Signature: ([o]ops(4)) Alignment matrix value constants 4-element pdl (match,insert1,insert2,substitute).a =cut sub align_ops { return PDL->sequence(PDL::byte(),4); } EOPM ##------------------------------------------------------ ## Alignment: best path ##------------------------------------------------------ ## edit_bestpath() : distance + alignment matrices using static cost schema pp_add_exported('','edit_bestpath'); pp_addpm(<<'EOPM'); =pod =head2 edit_bestpath =for sig Signature: (align(N+1,M+1); [o]apath(N+M+2); [o]bpath(N+M+2); [o]pathlen()) Convenience method. Compute best path through alignment matrix $align(). Stores paths for original input strings $a() and $b() in $apath() and $bpath() respectively. Negative values in $apath() and $bpath() indicate insertion/deletion operations. On completion, $pathlen() holds the actual length of the paths. =cut sub edit_bestpath { my ($align,$apath,$bpath,$len) = @_; $len=pdl(long,$align->dim(0)+$align->dim(1)) if (!defined($len)); if (!defined($apath)) { $apath=zeroes(long,$len); } else { $apath->reshape($len) if ($apath->nelem < $len); } if (!defined($bpath)) { $bpath = zeroes(long,$len); } else { $bpath->reshape($len) if ($bpath->nelem < $len); } _edit_bestpath($align, $apath, $bpath, $len, $align->dim(0)-1, $align->dim(1)-1); return ($apath,$bpath,$len); } EOPM ##------------------------------------------------------ ## _edit_bestpath : get best path pp_add_exported('','_edit_bestpath'); pp_def('_edit_bestpath', Pars => ('align(N1,M1); int [o]apath(L); int [o]bpath(L); int [o]len();'), OtherPars => ('int ifinal; int jfinal'), Code => (' int i,j,p,endp,tmp; char op; //-- get reversed path & real path length $len()=0; for (p=0,i=$COMP(ifinal),j=$COMP(jfinal); p < $SIZE(L) && (i>0 || j>0); p++) { $apath(L=>p) = i-1; $bpath(L=>p) = j-1; // op = $align(N1=>i,M1=>j); if (op==ALIGN_OP_INSERT1) { --i; } else if (op==ALIGN_OP_INSERT2) { --j; } else { /* if (op==ALIGN_OP_MATCH || op==ALIGN_OP_SUBSTITUTE) */ --i; --j; } } $len() = p; //-- now reverse the paths for (p=0; p < ($len()+1)/2; p++) { endp = $len()-p-1; // tmp = $apath(L=>p); $apath(L=>p) = $apath(L=>endp); $apath(L=>endp) = tmp; // tmp = $bpath(L=>p); $bpath(L=>p) = $bpath(L=>endp); $bpath(L=>endp) = tmp; } //-- now, sanitize the paths for (p=$len(); p > 0; p--) { if ($apath(L=>p) == $apath(L=>p-1)) $apath(L=>p) = -1; if ($bpath(L=>p) == $bpath(L=>p-1)) $bpath(L=>p) = -1; } '), Doc => q( Low-level method. Compute best path through alignment matrix $align() from final index ($ifinal,$jfinal). Stores paths for (original) input strings $a() and $b() in $apath() and $bpath() respectively. Negative values in $apath() and $bpath() indicate insertion/deletion operations. On completion, $pathlen() holds the actual length of the paths. ), ); ##------------------------------------------------------ ## Alignment: operation backtrace ##------------------------------------------------------ ## edit_backtrace() : alignment operation backtrace pp_add_exported('','edit_pathtrace'); pp_addpm(<<'EOPM'); =pod =head2 edit_pathtrace =for sig Signature: ( align(N+1,M+1); [o]ai(L); [o]bi(L); [o]ops(L); [o]$pathlen() ) Convenience method. Compute alignment path backtrace through alignment matrix $align() from final index ($ifinal,$jfinal). Stores raw paths for (original) input strings $a() and $b() in $ai() and $bi() respectively. Unlike edit_bestpath(), null-moves for $ai() and $bi() are not stored here as negative values. Returned pdls ($ai,$bi,$ops) are trimmed to the appropriate path length. =cut sub edit_pathtrace { my ($align,$ai,$bi,$ops,$len) = @_; $len=pdl(long,$align->dim(0)+$align->dim(1)) if (!defined($len)); if (!defined($ai)) { $ai=zeroes(long,$len); } else { $ai->reshape($len) if ($ai->nelem < $len); } if (!defined($bi)) { $bi = zeroes(long,$len); } else { $bi->reshape($len) if ($bi->nelem < $len); } if (!defined($ops)) { $ops = zeroes(long,$len); } else { $ops->reshape($len) if ($ops->nelem < $len); } _edit_pathtrace($align, $ai,$bi,$ops,$len, $align->dim(0)-1,$align->dim(1)-1); my $lens = ($len->sclr-1); return ((map { $_->slice("0:$lens") } ($ai,$bi,$ops)), $len); } EOPM ##------------------------------------------------------ ## _edit_pathlog : trace path pp_add_exported('','_edit_pathtrace'); pp_def('_edit_pathtrace', Pars => ('align(N1,M1); int [o]ai(L); int [o]bi(L); int [o]ops(L); int [o]len();'), OtherPars => ('int ifinal; int jfinal'), Code => (' int i,j,p, tmp; //-- get reversed path & real path length $len() = 0; for (p=0, i=$COMP(ifinal),j=$COMP(jfinal); p < $SIZE(L) && (i>0 || j>0); p++) { int op = $align(N1=>i,M1=>j); $ai(L=>p) = i; $bi(L=>p) = j; $ops(L=>p) = op; switch (op) { case ALIGN_OP_INSERT1: --i; break; case ALIGN_OP_INSERT2: --j; break; case ALIGN_OP_MATCH: case ALIGN_OP_SUBSTITUTE: default: --i; --j; break; } } $len() = p; //-- now reverse the paths for (p=0; p < ($len()+1)/2; p++) { int endp = $len()-p-1; // tmp = $ai(L=>p); $ai(L=>p) = $ai(L=>endp); $ai(L=>endp) = tmp; // tmp = $bi(L=>p); $bi(L=>p) = $bi(L=>endp); $bi(L=>endp) = tmp; // tmp = $ops(L=>p); $ops(L=>p) = $ops(L=>endp); $ops(L=>endp) = tmp; } '), Doc => q( Low-level method. Compute alignment path backtrace through alignment matrix $align() from final index ($ifinal,$jfinal). Stores raw paths for (original) input strings $a() and $b() in $ai() and $bi() respectively. Unlike edit_bestpath(), null-moves for $ai() and $bi() are not stored here as negative values. Returned pdls ($ai,$bi,$ops) are trimmed to the appropriate path length. ), ); ##====================================================================== ## Longest Common Subsequence (LCS) ##------------------------------------------------------ ## edit_lcs() : lcs matrix (convenience) pp_add_exported('','edit_lcs'); pp_addpm(<<'EOPM'); =pod =head2 edit_lcs =for sig Signature: (a(N); b(M); int [o]lcs(N+1,M+1);) Convenience method. Compute the longest common subsequence (LCS) matrix for input PDLs $a1() and $b1(). The output matrix $lcs() contains at cell ($i+1,$j+1) the length of the LCS between $a1(0..$i) and $b1(0..$j); thus $lcs($N,$M) contains the length of the LCS between $a() and $b(). =cut sub edit_lcs { return _edit_lcs(_edit_pdl($_[0]), _edit_pdl($_[1]), @_[2..$#_]); } EOPM ##------------------------------------------------------ ## _edit_lcs : get longest common subsequence matrix pp_add_exported('','_edit_lcs'); pp_def('_edit_lcs', Pars => ('a1(N1); b1(M1); int [o]lcs(N1,M1);'), Code => (' int i,j, iprev,jprev; // //-- initialize lcs matrix: lcs=0 at borders for (i=0; i < $SIZE(N1); i++) { $lcs(N1=>i,M1=>0) = 0; } for (j=1; j < $SIZE(M1); j++) { $lcs(N1=>0,M1=>j) = 0; } // //-- compute lcs for (i=1,iprev=0; i < $SIZE(N1); i++,iprev++) { for (j=1,jprev=0; j < $SIZE(M1); j++,jprev++) { // if ($a1(N1=>i)==$b1(M1=>j)) { $lcs(N1=>i,M1=>j) = $lcs(N1=>iprev,M1=>jprev)+1; } else { $GENERIC(lcs) lcs_i_jp = $lcs(N1=>i,M1=>jprev); $GENERIC(lcs) lcs_ip_j = $lcs(N1=>iprev,M1=>j); $lcs(N1=>i,M1=>j) = lcs_i_jp > lcs_ip_j ? lcs_i_jp : lcs_ip_j; } } } '), Doc => q( Low-level method. Compute the longest common subsequence (LCS) matrix for input PDLs $a1() and $b1(). The initial (zeroth) elements of $a1() and $b1() are ignored. The output matrix $lcs() contains at cell ($i,$j) the length of the LCS between $a1(1..$i) and $b1(1..$j); thus $lcs($N1-1,$M1-1) contains the length of the LCS between $a1() and $b1(). ), ); ##------------------------------------------------------ ## lcs_backtrace : get LCS backtrace (convenience) pp_add_exported('','lcs_backtrace'); pp_addpm(<<'EOPM'); =pod =head2 lcs_backtrace =for sig Signature: (a(N); b(M); int lcs(N+1,M+1); int ifinal(); int jfinal(); int [o]ai(L); int [o]bi(L); int [o]len()) Convenience method. Compute longest-common-subsequence backtrace through LCS matrix $lcs() for original input strings ($a(),$b()) from final index ($ifinal,$jfinal). Stores raw paths for (original) input strings $a() and $b() in $ai() and $bi() respectively. =cut sub lcs_backtrace { my ($a,$b,$lcs,$ifinal,$jfinal,$ai,$bi,$len) = @_; $len=pdl(long, pdl(long,$lcs->dims)->min) if (!defined($len)); if (!defined($ai)) { $ai=zeroes(long,$len); } else { $ai->reshape($len) if ($ai->nelem < $len); } if (!defined($bi)) { $bi = zeroes(long,$len); } else { $bi->reshape($len) if ($bi->nelem < $len); } if (!defined($ifinal)) { $ifinal = $lcs->dim(0)-1; } if (!defined($jfinal)) { $jfinal = $lcs->dim(1)-1; } _lcs_backtrace(_edit_pdl($a),_edit_pdl($b), $lcs,$ifinal,$jfinal, $ai,$bi,$len); my $lens = ($len->sclr-1); return ($ai->slice("0:$lens"),$bi->slice("0:$lens"), $len); } EOPM ##------------------------------------------------------ ## _lcs_backtrace : get LCS backtrace pp_add_exported('','_lcs_backtrace'); pp_def('_lcs_backtrace', Pars => ('a1(N1); b1(M1); int lcs(N1,M1); int ifinal(); int jfinal(); [o]ai(L); [o]bi(L); int [o]len()'), Code => (' int i,j,p, iprev,jprev, tmp; //-- get reversed path & real path length $len() = 0; for (p=0, i=$ifinal(), j=$jfinal(); p < $SIZE(L) && i>0 && j>0; ) { $GENERIC(a1) a1i = $a1(N1=>i); $GENERIC(b1) b1j = $b1(M1=>j); iprev=i-1; jprev=j-1; if (a1i==b1j) { $ai(L=>p) = iprev; $bi(L=>p) = jprev; --i; --j; ++p; } else if ($lcs(N1=>i,M1=>jprev) > $lcs(N1=>iprev,M1=>j)) { --j; } else { --i; } } $len() = p; //-- now reverse the paths for (p=0; p < ($len()+1)/2; p++) { int endp = $len()-p-1; // tmp = $ai(L=>p); $ai(L=>p) = $ai(L=>endp); $ai(L=>endp) = tmp; // tmp = $bi(L=>p); $bi(L=>p) = $bi(L=>endp); $bi(L=>endp) = tmp; } '), Doc => q( Low-level method. Compute longest-common-subsequence backtrace through LCS matrix $lcs() for initial-padded strings ($a1(),$b1()) from final index ($ifinal,$jfinal). Stores raw paths for (original) input strings $a() and $b() in $ai() and $bi() respectively. ), ); ##====================================================================== ## Footer Administrivia ##====================================================================== ##------------------------------------------------------ ## pm additions pp_addpm(<<'EOPM'); ##--------------------------------------------------------------------- =pod =head1 ACKNOWLEDGEMENTS Perl by Larry Wall. PDL by Karl Glazebrook, Tuomas J. Lukka, Christian Soeller, and others. =cut ##---------------------------------------------------------------------- =pod =head1 KNOWN BUGS Probably many. =cut ##--------------------------------------------------------------------- =pod =head1 AUTHOR Bryan Jurish Emoocow@cpan.orgE =head2 Copyright Policy Copyright (C) 2006-2015, Bryan Jurish. All rights reserved. This package is free software, and entirely without warranty. You may redistribute it and/or modify it under the same terms as Perl itself, either Perl 5.20.2, or at your option any later version of Perl 5. =head1 SEE ALSO perl(1), PDL(3perl). =cut EOPM # Always make sure that you finish your PP declarations with # pp_done pp_done(); ##----------------------------------------------------------------------