Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/www.guidonia.net/wp/wp-includes/Text/Diff/Engine/native.php@ 44

Last change on this file since 44 was 44, checked in by luciano, 15 years ago

File size: 15.6 KB

Line
1	<?php
/**
 * $Horde: framework/Text_Diff/Diff/Engine/native.php,v 1.10 2008/01/04 10:27:53 jan Exp $
 *
 * Class used internally by Text_Diff to actually compute the diffs. This
 * class is implemented using native PHP code.
 *
 * The algorithm used here is mostly lifted from the perl module
 * Algorithm::Diff (version 1.06) by Ned Konz, which is available at:
 * http://www.perl.com/CPAN/authors/id/N/NE/NEDKONZ/Algorithm-Diff-1.06.zip
 *
 * More ideas are taken from: http://www.ics.uci.edu/~eppstein/161/960229.html
 *
 * Some ideas (and a bit of code) are taken from analyze.c, of GNU
 * diffutils-2.7, which can be found at:
 * ftp://gnudist.gnu.org/pub/gnu/diffutils/diffutils-2.7.tar.gz
 *
 * Some ideas (subdivision by NCHUNKS > 2, and some optimizations) are from
 * Geoffrey T. Dairiki <dairiki@dairiki.org>. The original PHP version of this
 * code was written by him, and is used/adapted with his permission.
 *
 * Copyright 2004-2008 The Horde Project (http://www.horde.org/)
 *
 * See the enclosed file COPYING for license information (LGPL). If you did
 * not receive this file, see http://opensource.org/licenses/lgpl-license.php.
 *
 * @author Geoffrey T. Dairiki <dairiki@dairiki.org>
 * @package Text_Diff
 */
/**
 * Native-PHP diff engine: computes a list of edit operations turning one
 * array of lines into another, using an LCS search that is split into
 * chunks and solved recursively.
 */
class Text_Diff_Engine_native {

    /**
     * Per-line "changed" flags for the from/to files, keyed by original line
     * index. A truthy value means the line is not part of the LCS.
     */
    var $xchanged = array();
    var $ychanged = array();

    /**
     * Lines of each file that survive the "exists in both files" filter, in
     * file order. These are the sequences the LCS search works on.
     */
    var $xv = array();
    var $yv = array();

    /**
     * Maps an index in $xv / $yv back to the original line index.
     */
    var $xind = array();
    var $yind = array();

    /**
     * Scratch state shared by _diag() and _lcsPos(): $seq[k] is the Y index
     * that currently ends a common subsequence of length k, $in_seq flags the
     * Y indexes currently stored in $seq, and $lcs is the longest k so far.
     */
    var $seq = array();
    var $in_seq = array();
    var $lcs = 0;

    /**
     * Computes the edit operations that transform $from_lines into $to_lines.
     *
     * @param array $from_lines  Lines of the original text.
     * @param array $to_lines    Lines of the new text.
     *
     * @return array  List of Text_Diff_Op_copy / _delete / _add / _change
     *                objects, in file order.
     */
    function diff($from_lines, $to_lines)
    {
        array_walk($from_lines, array('Text_Diff', 'trimNewlines'));
        array_walk($to_lines, array('Text_Diff', 'trimNewlines'));

        $n_from = count($from_lines);
        $n_to = count($to_lines);

        // Start every run from a clean slate.
        $this->xchanged = $this->ychanged = array();
        $this->xv = $this->yv = array();
        $this->xind = $this->yind = array();
        $this->seq = array();
        $this->in_seq = array();
        $this->lcs = 0;

        // Skip leading common lines.
        for ($skip = 0; $skip < $n_from && $skip < $n_to; $skip++) {
            if ($from_lines[$skip] !== $to_lines[$skip]) {
                break;
            }
            $this->xchanged[$skip] = $this->ychanged[$skip] = false;
        }

        // Skip trailing common lines.
        $xi = $n_from;
        $yi = $n_to;
        for ($endskip = 0; --$xi > $skip && --$yi > $skip; $endskip++) {
            if ($from_lines[$xi] !== $to_lines[$yi]) {
                break;
            }
            $this->xchanged[$xi] = $this->ychanged[$yi] = false;
        }

        // Ignore lines which do not exist in both files.
        $xhash = array();
        $yhash = array();
        for ($xi = $skip; $xi < $n_from - $endskip; $xi++) {
            $xhash[$from_lines[$xi]] = 1;
        }
        for ($yi = $skip; $yi < $n_to - $endskip; $yi++) {
            $line = $to_lines[$yi];
            if (($this->ychanged[$yi] = empty($xhash[$line]))) {
                continue;
            }
            $yhash[$line] = 1;
            $this->yv[] = $line;
            $this->yind[] = $yi;
        }
        for ($xi = $skip; $xi < $n_from - $endskip; $xi++) {
            $line = $from_lines[$xi];
            if (($this->xchanged[$xi] = empty($yhash[$line]))) {
                continue;
            }
            $this->xv[] = $line;
            $this->xind[] = $xi;
        }

        // Find the LCS.
        $this->_compareseq(0, count($this->xv), 0, count($this->yv));

        // Merge edits when possible.
        $this->_shiftBoundaries($from_lines, $this->xchanged, $this->ychanged);
        $this->_shiftBoundaries($to_lines, $this->ychanged, $this->xchanged);

        // Compute the edit operations.
        $edits = array();
        $xi = $yi = 0;
        while ($xi < $n_from || $yi < $n_to) {
            assert($yi < $n_to || $this->xchanged[$xi]);
            assert($xi < $n_from || $this->ychanged[$yi]);

            // Skip matching "snake".
            $copy = array();
            while ($xi < $n_from && $yi < $n_to
                   && !$this->xchanged[$xi] && !$this->ychanged[$yi]) {
                $copy[] = $from_lines[$xi++];
                ++$yi;
            }
            if ($copy) {
                // Plain "new": assigning the result of new by reference
                // ("= &new") is a parse error on PHP 7 and later.
                $edits[] = new Text_Diff_Op_copy($copy);
            }

            // Find deletes & adds.
            $delete = array();
            while ($xi < $n_from && $this->xchanged[$xi]) {
                $delete[] = $from_lines[$xi++];
            }

            $add = array();
            while ($yi < $n_to && $this->ychanged[$yi]) {
                $add[] = $to_lines[$yi++];
            }

            if ($delete && $add) {
                $edits[] = new Text_Diff_Op_change($delete, $add);
            } elseif ($delete) {
                $edits[] = new Text_Diff_Op_delete($delete);
            } elseif ($add) {
                $edits[] = new Text_Diff_Op_add($add);
            }
        }

        return $edits;
    }

    /**
     * Divides the Largest Common Subsequence (LCS) of the sequences (XOFF,
     * XLIM) and (YOFF, YLIM) into NCHUNKS approximately equally sized
     * segments.
     *
     * Returns (LCS, PTS).  LCS is the length of the LCS. PTS is an array of
     * NCHUNKS+1 (X, Y) indexes giving the dividing points between sub
     * sequences.  The first sub-sequence is contained in (X0, X1), (Y0, Y1),
     * the second in (X1, X2), (Y1, Y2) and so on.  Note that (X0, Y0) ==
     * (XOFF, YOFF) and (X[NCHUNKS], Y[NCHUNKS]) == (XLIM, YLIM).
     *
     * This function assumes that the first lines of the specified portions of
     * the two files do not match, and likewise that the last lines do not
     * match.  The caller must trim matching lines from the beginning and end
     * of the portions it is going to specify.
     *
     * @param integer $xoff     Inclusive start index into $this->xv.
     * @param integer $xlim     Exclusive end index into $this->xv.
     * @param integer $yoff     Inclusive start index into $this->yv.
     * @param integer $ylim     Exclusive end index into $this->yv.
     * @param integer $nchunks  Number of segments to split the LCS into.
     *
     * @return array  array(LCS length, list of array(x, y) split points).
     */
    function _diag ($xoff, $xlim, $yoff, $ylim, $nchunks)
    {
        $flip = false;

        if ($xlim - $xoff > $ylim - $yoff) {
            /* Things seem faster (I'm not sure I understand why) when the
             * shortest sequence is in X. */
            $flip = true;
            list ($xoff, $xlim, $yoff, $ylim)
                = array($yoff, $ylim, $xoff, $xlim);
        }

        // For every distinct line of the (possibly flipped) Y range, collect
        // its positions in descending order.
        $ymatches = array();
        if ($flip) {
            for ($i = $ylim - 1; $i >= $yoff; $i--) {
                $ymatches[$this->xv[$i]][] = $i;
            }
        } else {
            for ($i = $ylim - 1; $i >= $yoff; $i--) {
                $ymatches[$this->yv[$i]][] = $i;
            }
        }

        $this->lcs = 0;
        $this->seq[0] = $yoff - 1;
        $this->in_seq = array();
        $ymids = array();
        $ymids[0] = array();

        $numer = $xlim - $xoff + $nchunks - 1;
        $x = $xoff;
        for ($chunk = 0; $chunk < $nchunks; $chunk++) {
            if ($chunk > 0) {
                // Snapshot where each subsequence length ended when the
                // previous chunk finished.
                for ($i = 0; $i <= $this->lcs; $i++) {
                    $ymids[$i][$chunk - 1] = $this->seq[$i];
                }
            }

            $x1 = $xoff + (int)(($numer + ($xlim - $xoff) * $chunk) / $nchunks);
            for (; $x < $x1; $x++) {
                $line = $flip ? $this->yv[$x] : $this->xv[$x];
                if (empty($ymatches[$line])) {
                    continue;
                }

                /* $matches is a plain 0-based list (built with [] above), so
                 * an integer cursor reproduces the old each() traversal
                 * exactly; each() itself no longer exists in PHP 8. The
                 * cursor is advanced before the body runs, so the second
                 * loop resumes after the element that triggered the break. */
                $matches = $ymatches[$line];
                $n_matches = count($matches);
                $m = 0;

                while ($m < $n_matches) {
                    $y = $matches[$m++];
                    if (empty($this->in_seq[$y])) {
                        $k = $this->_lcsPos($y);
                        assert($k > 0);
                        $ymids[$k] = $ymids[$k - 1];
                        break;
                    }
                }

                while ($m < $n_matches) {
                    $y = $matches[$m++];
                    if ($y > $this->seq[$k - 1]) {
                        assert($y <= $this->seq[$k]);
                        /* Optimization: this is a common case: next match is
                         * just replacing previous match. */
                        $this->in_seq[$this->seq[$k]] = false;
                        $this->seq[$k] = $y;
                        $this->in_seq[$y] = 1;
                    } elseif (empty($this->in_seq[$y])) {
                        $k = $this->_lcsPos($y);
                        assert($k > 0);
                        $ymids[$k] = $ymids[$k - 1];
                    }
                }
            }
        }

        $seps = array();
        $seps[] = $flip ? array($yoff, $xoff) : array($xoff, $yoff);
        $ymid = $ymids[$this->lcs];
        for ($n = 0; $n < $nchunks - 1; $n++) {
            $x1 = $xoff + (int)(($numer + ($xlim - $xoff) * $n) / $nchunks);
            $y1 = $ymid[$n] + 1;
            $seps[] = $flip ? array($y1, $x1) : array($x1, $y1);
        }
        $seps[] = $flip ? array($ylim, $xlim) : array($xlim, $ylim);

        return array($this->lcs, $seps);
    }

    /**
     * Inserts $ypos into $this->seq, keeping it sorted ascending, and
     * returns the position it now occupies.
     *
     * If $ypos is larger than every stored value the sequence grows by one;
     * otherwise $ypos replaces the first stored value that is not smaller
     * than it (found by binary search), and $this->in_seq is updated to
     * reflect the swap.
     *
     * @param integer $ypos  Y index to record.
     *
     * @return integer  Index in $this->seq now holding $ypos (>= 1).
     */
    function _lcsPos($ypos)
    {
        $end = $this->lcs;
        if ($end == 0 || $ypos > $this->seq[$end]) {
            $this->seq[++$this->lcs] = $ypos;
            $this->in_seq[$ypos] = 1;
            return $this->lcs;
        }

        $beg = 1;
        while ($beg < $end) {
            $mid = (int)(($beg + $end) / 2);
            if ($ypos > $this->seq[$mid]) {
                $beg = $mid + 1;
            } else {
                $end = $mid;
            }
        }

        assert($ypos != $this->seq[$end]);

        $this->in_seq[$this->seq[$end]] = false;
        $this->seq[$end] = $ypos;
        $this->in_seq[$ypos] = 1;
        return $end;
    }

    /**
     * Finds LCS of two sequences.
     *
     * The results are recorded in the vectors $this->{x,y}changed[], by
     * storing a 1 in the element for each line that is an insertion or
     * deletion (ie. is not in the LCS).
     *
     * The subsequence of file 0 is (XOFF, XLIM) and likewise for file 1.
     *
     * Note that XLIM, YLIM are exclusive bounds.  All line numbers are
     * origin-0 and discarded lines are not counted.
     *
     * @param integer $xoff  Inclusive start index into $this->xv.
     * @param integer $xlim  Exclusive end index into $this->xv.
     * @param integer $yoff  Inclusive start index into $this->yv.
     * @param integer $ylim  Exclusive end index into $this->yv.
     */
    function _compareseq ($xoff, $xlim, $yoff, $ylim)
    {
        /* Lines are compared by string identity, not with loose "==".
         * Loose comparison treats numeric strings such as "1" and "1.0" as
         * equal, which disagrees with the array-key hashing used in diff()
         * and _diag() and could hide real differences. */

        /* Slide down the bottom initial diagonal. */
        while ($xoff < $xlim && $yoff < $ylim
               && (string)$this->xv[$xoff] === (string)$this->yv[$yoff]) {
            ++$xoff;
            ++$yoff;
        }

        /* Slide up the top initial diagonal. */
        while ($xlim > $xoff && $ylim > $yoff
               && (string)$this->xv[$xlim - 1] === (string)$this->yv[$ylim - 1]) {
            --$xlim;
            --$ylim;
        }

        $seps = array();
        if ($xoff == $xlim || $yoff == $ylim) {
            $lcs = 0;
        } else {
            /* This is ad hoc but seems to work well.  $nchunks =
             * sqrt(min($xlim - $xoff, $ylim - $yoff) / 2.5); $nchunks =
             * max(2,min(8,(int)$nchunks)); */
            $nchunks = min(7, $xlim - $xoff, $ylim - $yoff) + 1;
            list($lcs, $seps)
                = $this->_diag($xoff, $xlim, $yoff, $ylim, $nchunks);
        }

        if ($lcs == 0) {
            /* X and Y sequences have no common subsequence: mark all
             * changed. */
            while ($yoff < $ylim) {
                $this->ychanged[$this->yind[$yoff++]] = 1;
            }
            while ($xoff < $xlim) {
                $this->xchanged[$this->xind[$xoff++]] = 1;
            }
        } else {
            /* Use the partitions to split this problem into subproblems:
             * each consecutive pair of split points bounds one of them. */
            $n_seps = count($seps);
            for ($s = 1; $s < $n_seps; $s++) {
                $pt1 = $seps[$s - 1];
                $pt2 = $seps[$s];
                $this->_compareseq($pt1[0], $pt2[0], $pt1[1], $pt2[1]);
            }
        }
    }

    /**
     * Adjusts inserts/deletes of identical lines to join changes as much as
     * possible.
     *
     * We do something when a run of changed lines include a line at one end
     * and has an excluded, identical line at the other.  We are free to
     * choose which identical line is included.  `compareseq' usually chooses
     * the one at the beginning, but usually it is cleaner to consider the
     * following identical line to be the "change".
     *
     * This is extracted verbatim from analyze.c (GNU diffutils-2.7).
     *
     * @param array $lines          Lines of the file being adjusted.
     * @param array $changed        Changed flags for $lines; modified in
     *                              place.
     * @param array $other_changed  Changed flags of the other file (read
     *                              only).
     */
    function _shiftBoundaries($lines, &$changed, $other_changed)
    {
        $i = 0;
        $j = 0;

        /* Real expressions instead of string assertions: PHP 8 no longer
         * evaluates assert('...') strings, so the checks would silently
         * stop running. */
        assert(count($lines) == count($changed));
        $len = count($lines);
        $other_len = count($other_changed);

        while (1) {
            /* Scan forward to find the beginning of another run of
             * changes. Also keep track of the corresponding point in the
             * other file.
             *
             * Throughout this code, $i and $j are adjusted together so that
             * the first $i elements of $changed and the first $j elements of
             * $other_changed both contain the same number of zeros (unchanged
             * lines).
             *
             * Furthermore, $j is always kept so that $j == $other_len or
             * $other_changed[$j] == false. */
            while ($j < $other_len && $other_changed[$j]) {
                $j++;
            }

            while ($i < $len && ! $changed[$i]) {
                assert($j < $other_len && ! $other_changed[$j]);
                $i++; $j++;
                while ($j < $other_len && $other_changed[$j]) {
                    $j++;
                }
            }

            if ($i == $len) {
                break;
            }

            $start = $i;

            /* Find the end of this run of changes. */
            while (++$i < $len && $changed[$i]) {
                continue;
            }

            do {
                /* Record the length of this run of changes, so that we can
                 * later determine whether the run has grown. */
                $runlength = $i - $start;

                /* Move the changed region back, so long as the previous
                 * unchanged line matches the last changed one.  This merges
                 * with previous changed regions. Lines must be identical as
                 * strings: a loose "==" would let "1" stand in for "1.0". */
                while ($start > 0
                       && (string)$lines[$start - 1] === (string)$lines[$i - 1]) {
                    $changed[--$start] = 1;
                    $changed[--$i] = false;
                    while ($start > 0 && $changed[$start - 1]) {
                        $start--;
                    }
                    assert($j > 0);
                    while ($other_changed[--$j]) {
                        continue;
                    }
                    assert($j >= 0 && !$other_changed[$j]);
                }

                /* Set CORRESPONDING to the end of the changed run, at the
                 * last point where it corresponds to a changed run in the
                 * other file. CORRESPONDING == LEN means no such point has
                 * been found. */
                $corresponding = $j < $other_len ? $i : $len;

                /* Move the changed region forward, so long as the first
                 * changed line matches the following unchanged one.  This
                 * merges with following changed regions.  Do this second, so
                 * that if there are no merges, the changed region is moved
                 * forward as far as possible. */
                while ($i < $len
                       && (string)$lines[$start] === (string)$lines[$i]) {
                    $changed[$start++] = false;
                    $changed[$i++] = 1;
                    while ($i < $len && $changed[$i]) {
                        $i++;
                    }

                    assert($j < $other_len && ! $other_changed[$j]);
                    $j++;
                    if ($j < $other_len && $other_changed[$j]) {
                        $corresponding = $i;
                        while ($j < $other_len && $other_changed[$j]) {
                            $j++;
                        }
                    }
                }
            } while ($runlength != $i - $start);

            /* If possible, move the fully-merged run of changes back to a
             * corresponding run in the other file. */
            while ($corresponding < $i) {
                $changed[--$start] = 1;
                $changed[--$i] = 0;
                assert($j > 0);
                while ($other_changed[--$j]) {
                    continue;
                }
                assert($j >= 0 && !$other_changed[$j]);
            }
        }
    }

}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: