Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: trunk/www.guidonia.net/wp/wp-includes/Text/Diff/Engine/native.php@ 44

Last change on this file since 44 was 44, checked in by luciano, 15 years ago

File size: 15.6 KB

Rev	Line
[44]	1	<?php
	2	/**
	3	* $Horde: framework/Text_Diff/Diff/Engine/native.php,v 1.10 2008/01/04 10:27:53 jan Exp $
	4	*
	5	* Class used internally by Text_Diff to actually compute the diffs. This
	6	* class is implemented using native PHP code.
	7	*
	8	* The algorithm used here is mostly lifted from the perl module
	9	* Algorithm::Diff (version 1.06) by Ned Konz, which is available at:
	10	* http://www.perl.com/CPAN/authors/id/N/NE/NEDKONZ/Algorithm-Diff-1.06.zip
	11	*
	12	* More ideas are taken from: http://www.ics.uci.edu/~eppstein/161/960229.html
	13	*
	14	* Some ideas (and a bit of code) are taken from analyze.c, of GNU
	15	* diffutils-2.7, which can be found at:
	16	* ftp://gnudist.gnu.org/pub/gnu/diffutils/diffutils-2.7.tar.gz
	17	*
	18	* Some ideas (subdivision by NCHUNKS > 2, and some optimizations) are from
	19	* Geoffrey T. Dairiki <dairiki@dairiki.org>. The original PHP version of this
	20	* code was written by him, and is used/adapted with his permission.
	21	*
	22	* Copyright 2004-2008 The Horde Project (http://www.horde.org/)
	23	*
	24	* See the enclosed file COPYING for license information (LGPL). If you did
	25	* not receive this file, see http://opensource.org/licenses/lgpl-license.php.
	26	*
	27	* @author Geoffrey T. Dairiki <dairiki@dairiki.org>
	28	* @package Text_Diff
	29	*/
	30	class Text_Diff_Engine_native {
	31
	32	function diff($from_lines, $to_lines)
	33	{
	34	array_walk($from_lines, array('Text_Diff', 'trimNewlines'));
	35	array_walk($to_lines, array('Text_Diff', 'trimNewlines'));
	36
	37	$n_from = count($from_lines);
	38	$n_to = count($to_lines);
	39
	40	$this->xchanged = $this->ychanged = array();
	41	$this->xv = $this->yv = array();
	42	$this->xind = $this->yind = array();
	43	unset($this->seq);
	44	unset($this->in_seq);
	45	unset($this->lcs);
	46
	47	// Skip leading common lines.
	48	for ($skip = 0; $skip < $n_from && $skip < $n_to; $skip++) {
	49	if ($from_lines[$skip] !== $to_lines[$skip]) {
	50	break;
	51	}
	52	$this->xchanged[$skip] = $this->ychanged[$skip] = false;
	53	}
	54
	55	// Skip trailing common lines.
	56	$xi = $n_from; $yi = $n_to;
	57	for ($endskip = 0; --$xi > $skip && --$yi > $skip; $endskip++) {
	58	if ($from_lines[$xi] !== $to_lines[$yi]) {
	59	break;
	60	}
	61	$this->xchanged[$xi] = $this->ychanged[$yi] = false;
	62	}
	63
	64	// Ignore lines which do not exist in both files.
	65	for ($xi = $skip; $xi < $n_from - $endskip; $xi++) {
	66	$xhash[$from_lines[$xi]] = 1;
	67	}
	68	for ($yi = $skip; $yi < $n_to - $endskip; $yi++) {
	69	$line = $to_lines[$yi];
	70	if (($this->ychanged[$yi] = empty($xhash[$line]))) {
	71	continue;
	72	}
	73	$yhash[$line] = 1;
	74	$this->yv[] = $line;
	75	$this->yind[] = $yi;
	76	}
	77	for ($xi = $skip; $xi < $n_from - $endskip; $xi++) {
	78	$line = $from_lines[$xi];
	79	if (($this->xchanged[$xi] = empty($yhash[$line]))) {
	80	continue;
	81	}
	82	$this->xv[] = $line;
	83	$this->xind[] = $xi;
	84	}
	85
	86	// Find the LCS.
	87	$this->_compareseq(0, count($this->xv), 0, count($this->yv));
	88
	89	// Merge edits when possible.
	90	$this->_shiftBoundaries($from_lines, $this->xchanged, $this->ychanged);
	91	$this->_shiftBoundaries($to_lines, $this->ychanged, $this->xchanged);
	92
	93	// Compute the edit operations.
	94	$edits = array();
	95	$xi = $yi = 0;
	96	while ($xi < $n_from \|\| $yi < $n_to) {
	97	assert($yi < $n_to \|\| $this->xchanged[$xi]);
	98	assert($xi < $n_from \|\| $this->ychanged[$yi]);
	99
	100	// Skip matching "snake".
	101	$copy = array();
	102	while ($xi < $n_from && $yi < $n_to
	103	&& !$this->xchanged[$xi] && !$this->ychanged[$yi]) {
	104	$copy[] = $from_lines[$xi++];
	105	++$yi;
	106	}
	107	if ($copy) {
	108	$edits[] = &new Text_Diff_Op_copy($copy);
	109	}
	110
	111	// Find deletes & adds.
	112	$delete = array();
	113	while ($xi < $n_from && $this->xchanged[$xi]) {
	114	$delete[] = $from_lines[$xi++];
	115	}
	116
	117	$add = array();
	118	while ($yi < $n_to && $this->ychanged[$yi]) {
	119	$add[] = $to_lines[$yi++];
	120	}
	121
	122	if ($delete && $add) {
	123	$edits[] = &new Text_Diff_Op_change($delete, $add);
	124	} elseif ($delete) {
	125	$edits[] = &new Text_Diff_Op_delete($delete);
	126	} elseif ($add) {
	127	$edits[] = &new Text_Diff_Op_add($add);
	128	}
	129	}
	130
	131	return $edits;
	132	}
	133
	134	/**
	135	* Divides the Largest Common Subsequence (LCS) of the sequences (XOFF,
	136	* XLIM) and (YOFF, YLIM) into NCHUNKS approximately equally sized
	137	* segments.
	138	*
	139	* Returns (LCS, PTS). LCS is the length of the LCS. PTS is an array of
	140	* NCHUNKS+1 (X, Y) indexes giving the diving points between sub
	141	* sequences. The first sub-sequence is contained in (X0, X1), (Y0, Y1),
	142	* the second in (X1, X2), (Y1, Y2) and so on. Note that (X0, Y0) ==
	143	* (XOFF, YOFF) and (X[NCHUNKS], Y[NCHUNKS]) == (XLIM, YLIM).
	144	*
	145	* This function assumes that the first lines of the specified portions of
	146	* the two files do not match, and likewise that the last lines do not
	147	* match. The caller must trim matching lines from the beginning and end
	148	* of the portions it is going to specify.
	149	*/
	150	function _diag ($xoff, $xlim, $yoff, $ylim, $nchunks)
	151	{
	152	$flip = false;
	153
	154	if ($xlim - $xoff > $ylim - $yoff) {
	155	/* Things seems faster (I'm not sure I understand why) when the
	156	* shortest sequence is in X. */
	157	$flip = true;
	158	list ($xoff, $xlim, $yoff, $ylim)
	159	= array($yoff, $ylim, $xoff, $xlim);
	160	}
	161
	162	if ($flip) {
	163	for ($i = $ylim - 1; $i >= $yoff; $i--) {
	164	$ymatches[$this->xv[$i]][] = $i;
	165	}
	166	} else {
	167	for ($i = $ylim - 1; $i >= $yoff; $i--) {
	168	$ymatches[$this->yv[$i]][] = $i;
	169	}
	170	}
	171
	172	$this->lcs = 0;
	173	$this->seq[0]= $yoff - 1;
	174	$this->in_seq = array();
	175	$ymids[0] = array();
	176
	177	$numer = $xlim - $xoff + $nchunks - 1;
	178	$x = $xoff;
	179	for ($chunk = 0; $chunk < $nchunks; $chunk++) {
	180	if ($chunk > 0) {
	181	for ($i = 0; $i <= $this->lcs; $i++) {
	182	$ymids[$i][$chunk - 1] = $this->seq[$i];
	183	}
	184	}
	185
	186	$x1 = $xoff + (int)(($numer + ($xlim - $xoff) * $chunk) / $nchunks);
	187	for (; $x < $x1; $x++) {
	188	$line = $flip ? $this->yv[$x] : $this->xv[$x];
	189	if (empty($ymatches[$line])) {
	190	continue;
	191	}
	192	$matches = $ymatches[$line];
	193	reset($matches);
	194	while (list(, $y) = each($matches)) {
	195	if (empty($this->in_seq[$y])) {
	196	$k = $this->_lcsPos($y);
	197	assert($k > 0);
	198	$ymids[$k] = $ymids[$k - 1];
	199	break;
	200	}
	201	}
	202	while (list(, $y) = each($matches)) {
	203	if ($y > $this->seq[$k - 1]) {
	204	assert($y <= $this->seq[$k]);
	205	/* Optimization: this is a common case: next match is
	206	* just replacing previous match. */
	207	$this->in_seq[$this->seq[$k]] = false;
	208	$this->seq[$k] = $y;
	209	$this->in_seq[$y] = 1;
	210	} elseif (empty($this->in_seq[$y])) {
	211	$k = $this->_lcsPos($y);
	212	assert($k > 0);
	213	$ymids[$k] = $ymids[$k - 1];
	214	}
	215	}
	216	}
	217	}
	218
	219	$seps[] = $flip ? array($yoff, $xoff) : array($xoff, $yoff);
	220	$ymid = $ymids[$this->lcs];
	221	for ($n = 0; $n < $nchunks - 1; $n++) {
	222	$x1 = $xoff + (int)(($numer + ($xlim - $xoff) * $n) / $nchunks);
	223	$y1 = $ymid[$n] + 1;
	224	$seps[] = $flip ? array($y1, $x1) : array($x1, $y1);
	225	}
	226	$seps[] = $flip ? array($ylim, $xlim) : array($xlim, $ylim);
	227
	228	return array($this->lcs, $seps);
	229	}
	230
	231	function _lcsPos($ypos)
	232	{
	233	$end = $this->lcs;
	234	if ($end == 0 \|\| $ypos > $this->seq[$end]) {
	235	$this->seq[++$this->lcs] = $ypos;
	236	$this->in_seq[$ypos] = 1;
	237	return $this->lcs;
	238	}
	239
	240	$beg = 1;
	241	while ($beg < $end) {
	242	$mid = (int)(($beg + $end) / 2);
	243	if ($ypos > $this->seq[$mid]) {
	244	$beg = $mid + 1;
	245	} else {
	246	$end = $mid;
	247	}
	248	}
	249
	250	assert($ypos != $this->seq[$end]);
	251
	252	$this->in_seq[$this->seq[$end]] = false;
	253	$this->seq[$end] = $ypos;
	254	$this->in_seq[$ypos] = 1;
	255	return $end;
	256	}
	257
	258	/**
	259	* Finds LCS of two sequences.
	260	*
	261	* The results are recorded in the vectors $this->{x,y}changed[], by
	262	* storing a 1 in the element for each line that is an insertion or
	263	* deletion (ie. is not in the LCS).
	264	*
	265	* The subsequence of file 0 is (XOFF, XLIM) and likewise for file 1.
	266	*
	267	* Note that XLIM, YLIM are exclusive bounds. All line numbers are
	268	* origin-0 and discarded lines are not counted.
	269	*/
	270	function _compareseq ($xoff, $xlim, $yoff, $ylim)
	271	{
	272	/* Slide down the bottom initial diagonal. */
	273	while ($xoff < $xlim && $yoff < $ylim
	274	&& $this->xv[$xoff] == $this->yv[$yoff]) {
	275	++$xoff;
	276	++$yoff;
	277	}
	278
	279	/* Slide up the top initial diagonal. */
	280	while ($xlim > $xoff && $ylim > $yoff
	281	&& $this->xv[$xlim - 1] == $this->yv[$ylim - 1]) {
	282	--$xlim;
	283	--$ylim;
	284	}
	285
	286	if ($xoff == $xlim \|\| $yoff == $ylim) {
	287	$lcs = 0;
	288	} else {
	289	/* This is ad hoc but seems to work well. $nchunks =
	290	* sqrt(min($xlim - $xoff, $ylim - $yoff) / 2.5); $nchunks =
	291	* max(2,min(8,(int)$nchunks)); */
	292	$nchunks = min(7, $xlim - $xoff, $ylim - $yoff) + 1;
	293	list($lcs, $seps)
	294	= $this->_diag($xoff, $xlim, $yoff, $ylim, $nchunks);
	295	}
	296
	297	if ($lcs == 0) {
	298	/* X and Y sequences have no common subsequence: mark all
	299	* changed. */
	300	while ($yoff < $ylim) {
	301	$this->ychanged[$this->yind[$yoff++]] = 1;
	302	}
	303	while ($xoff < $xlim) {
	304	$this->xchanged[$this->xind[$xoff++]] = 1;
	305	}
	306	} else {
	307	/* Use the partitions to split this problem into subproblems. */
	308	reset($seps);
	309	$pt1 = $seps[0];
	310	while ($pt2 = next($seps)) {
	311	$this->_compareseq ($pt1[0], $pt2[0], $pt1[1], $pt2[1]);
	312	$pt1 = $pt2;
	313	}
	314	}
	315	}
	316
	317	/**
	318	* Adjusts inserts/deletes of identical lines to join changes as much as
	319	* possible.
	320	*
	321	* We do something when a run of changed lines include a line at one end
	322	* and has an excluded, identical line at the other. We are free to
	323	* choose which identical line is included. `compareseq' usually chooses
	324	* the one at the beginning, but usually it is cleaner to consider the
	325	* following identical line to be the "change".
	326	*
	327	* This is extracted verbatim from analyze.c (GNU diffutils-2.7).
	328	*/
	329	function _shiftBoundaries($lines, &$changed, $other_changed)
	330	{
	331	$i = 0;
	332	$j = 0;
	333
	334	assert('count($lines) == count($changed)');
	335	$len = count($lines);
	336	$other_len = count($other_changed);
	337
	338	while (1) {
	339	/* Scan forward to find the beginning of another run of
	340	* changes. Also keep track of the corresponding point in the
	341	* other file.
	342	*
	343	* Throughout this code, $i and $j are adjusted together so that
	344	* the first $i elements of $changed and the first $j elements of
	345	* $other_changed both contain the same number of zeros (unchanged
	346	* lines).
	347	*
	348	* Furthermore, $j is always kept so that $j == $other_len or
	349	* $other_changed[$j] == false. */
	350	while ($j < $other_len && $other_changed[$j]) {
	351	$j++;
	352	}
	353
	354	while ($i < $len && ! $changed[$i]) {
	355	assert('$j < $other_len && ! $other_changed[$j]');
	356	$i++; $j++;
	357	while ($j < $other_len && $other_changed[$j]) {
	358	$j++;
	359	}
	360	}
	361
	362	if ($i == $len) {
	363	break;
	364	}
	365
	366	$start = $i;
	367
	368	/* Find the end of this run of changes. */
	369	while (++$i < $len && $changed[$i]) {
	370	continue;
	371	}
	372
	373	do {
	374	/* Record the length of this run of changes, so that we can
	375	* later determine whether the run has grown. */
	376	$runlength = $i - $start;
	377
	378	/* Move the changed region back, so long as the previous
	379	* unchanged line matches the last changed one. This merges
	380	* with previous changed regions. */
	381	while ($start > 0 && $lines[$start - 1] == $lines[$i - 1]) {
	382	$changed[--$start] = 1;
	383	$changed[--$i] = false;
	384	while ($start > 0 && $changed[$start - 1]) {
	385	$start--;
	386	}
	387	assert('$j > 0');
	388	while ($other_changed[--$j]) {
	389	continue;
	390	}
	391	assert('$j >= 0 && !$other_changed[$j]');
	392	}
	393
	394	/* Set CORRESPONDING to the end of the changed run, at the
	395	* last point where it corresponds to a changed run in the
	396	* other file. CORRESPONDING == LEN means no such point has
	397	* been found. */
	398	$corresponding = $j < $other_len ? $i : $len;
	399
	400	/* Move the changed region forward, so long as the first
	401	* changed line matches the following unchanged one. This
	402	* merges with following changed regions. Do this second, so
	403	* that if there are no merges, the changed region is moved
	404	* forward as far as possible. */
	405	while ($i < $len && $lines[$start] == $lines[$i]) {
	406	$changed[$start++] = false;
	407	$changed[$i++] = 1;
	408	while ($i < $len && $changed[$i]) {
	409	$i++;
	410	}
	411
	412	assert('$j < $other_len && ! $other_changed[$j]');
	413	$j++;
	414	if ($j < $other_len && $other_changed[$j]) {
	415	$corresponding = $i;
	416	while ($j < $other_len && $other_changed[$j]) {
	417	$j++;
	418	}
	419	}
	420	}
	421	} while ($runlength != $i - $start);
	422
	423	/* If possible, move the fully-merged run of changes back to a
	424	* corresponding run in the other file. */
	425	while ($corresponding < $i) {
	426	$changed[--$start] = 1;
	427	$changed[--$i] = 0;
	428	assert('$j > 0');
	429	while ($other_changed[--$j]) {
	430	continue;
	431	}
	432	assert('$j >= 0 && !$other_changed[$j]');
	433	}
	434	}
	435	}
	436
	437	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: