1 | <?php
|
---|
2 | /**
|
---|
3 | * $Horde: framework/Text_Diff/Diff/Engine/native.php,v 1.10 2008/01/04 10:27:53 jan Exp $
|
---|
4 | *
|
---|
5 | * Class used internally by Text_Diff to actually compute the diffs. This
|
---|
6 | * class is implemented using native PHP code.
|
---|
7 | *
|
---|
8 | * The algorithm used here is mostly lifted from the perl module
|
---|
9 | * Algorithm::Diff (version 1.06) by Ned Konz, which is available at:
|
---|
10 | * http://www.perl.com/CPAN/authors/id/N/NE/NEDKONZ/Algorithm-Diff-1.06.zip
|
---|
11 | *
|
---|
12 | * More ideas are taken from: http://www.ics.uci.edu/~eppstein/161/960229.html
|
---|
13 | *
|
---|
14 | * Some ideas (and a bit of code) are taken from analyze.c, of GNU
|
---|
15 | * diffutils-2.7, which can be found at:
|
---|
16 | * ftp://gnudist.gnu.org/pub/gnu/diffutils/diffutils-2.7.tar.gz
|
---|
17 | *
|
---|
18 | * Some ideas (subdivision by NCHUNKS > 2, and some optimizations) are from
|
---|
19 | * Geoffrey T. Dairiki <dairiki@dairiki.org>. The original PHP version of this
|
---|
20 | * code was written by him, and is used/adapted with his permission.
|
---|
21 | *
|
---|
22 | * Copyright 2004-2008 The Horde Project (http://www.horde.org/)
|
---|
23 | *
|
---|
24 | * See the enclosed file COPYING for license information (LGPL). If you did
|
---|
25 | * not receive this file, see http://opensource.org/licenses/lgpl-license.php.
|
---|
26 | *
|
---|
27 | * @author Geoffrey T. Dairiki <dairiki@dairiki.org>
|
---|
28 | * @package Text_Diff
|
---|
29 | */
|
---|
30 | class Text_Diff_Engine_native {
|
---|
31 |
|
---|
32 | function diff($from_lines, $to_lines)
|
---|
33 | {
|
---|
34 | array_walk($from_lines, array('Text_Diff', 'trimNewlines'));
|
---|
35 | array_walk($to_lines, array('Text_Diff', 'trimNewlines'));
|
---|
36 |
|
---|
37 | $n_from = count($from_lines);
|
---|
38 | $n_to = count($to_lines);
|
---|
39 |
|
---|
40 | $this->xchanged = $this->ychanged = array();
|
---|
41 | $this->xv = $this->yv = array();
|
---|
42 | $this->xind = $this->yind = array();
|
---|
43 | unset($this->seq);
|
---|
44 | unset($this->in_seq);
|
---|
45 | unset($this->lcs);
|
---|
46 |
|
---|
47 | // Skip leading common lines.
|
---|
48 | for ($skip = 0; $skip < $n_from && $skip < $n_to; $skip++) {
|
---|
49 | if ($from_lines[$skip] !== $to_lines[$skip]) {
|
---|
50 | break;
|
---|
51 | }
|
---|
52 | $this->xchanged[$skip] = $this->ychanged[$skip] = false;
|
---|
53 | }
|
---|
54 |
|
---|
55 | // Skip trailing common lines.
|
---|
56 | $xi = $n_from; $yi = $n_to;
|
---|
57 | for ($endskip = 0; --$xi > $skip && --$yi > $skip; $endskip++) {
|
---|
58 | if ($from_lines[$xi] !== $to_lines[$yi]) {
|
---|
59 | break;
|
---|
60 | }
|
---|
61 | $this->xchanged[$xi] = $this->ychanged[$yi] = false;
|
---|
62 | }
|
---|
63 |
|
---|
64 | // Ignore lines which do not exist in both files.
|
---|
65 | for ($xi = $skip; $xi < $n_from - $endskip; $xi++) {
|
---|
66 | $xhash[$from_lines[$xi]] = 1;
|
---|
67 | }
|
---|
68 | for ($yi = $skip; $yi < $n_to - $endskip; $yi++) {
|
---|
69 | $line = $to_lines[$yi];
|
---|
70 | if (($this->ychanged[$yi] = empty($xhash[$line]))) {
|
---|
71 | continue;
|
---|
72 | }
|
---|
73 | $yhash[$line] = 1;
|
---|
74 | $this->yv[] = $line;
|
---|
75 | $this->yind[] = $yi;
|
---|
76 | }
|
---|
77 | for ($xi = $skip; $xi < $n_from - $endskip; $xi++) {
|
---|
78 | $line = $from_lines[$xi];
|
---|
79 | if (($this->xchanged[$xi] = empty($yhash[$line]))) {
|
---|
80 | continue;
|
---|
81 | }
|
---|
82 | $this->xv[] = $line;
|
---|
83 | $this->xind[] = $xi;
|
---|
84 | }
|
---|
85 |
|
---|
86 | // Find the LCS.
|
---|
87 | $this->_compareseq(0, count($this->xv), 0, count($this->yv));
|
---|
88 |
|
---|
89 | // Merge edits when possible.
|
---|
90 | $this->_shiftBoundaries($from_lines, $this->xchanged, $this->ychanged);
|
---|
91 | $this->_shiftBoundaries($to_lines, $this->ychanged, $this->xchanged);
|
---|
92 |
|
---|
93 | // Compute the edit operations.
|
---|
94 | $edits = array();
|
---|
95 | $xi = $yi = 0;
|
---|
96 | while ($xi < $n_from || $yi < $n_to) {
|
---|
97 | assert($yi < $n_to || $this->xchanged[$xi]);
|
---|
98 | assert($xi < $n_from || $this->ychanged[$yi]);
|
---|
99 |
|
---|
100 | // Skip matching "snake".
|
---|
101 | $copy = array();
|
---|
102 | while ($xi < $n_from && $yi < $n_to
|
---|
103 | && !$this->xchanged[$xi] && !$this->ychanged[$yi]) {
|
---|
104 | $copy[] = $from_lines[$xi++];
|
---|
105 | ++$yi;
|
---|
106 | }
|
---|
107 | if ($copy) {
|
---|
108 | $edits[] = &new Text_Diff_Op_copy($copy);
|
---|
109 | }
|
---|
110 |
|
---|
111 | // Find deletes & adds.
|
---|
112 | $delete = array();
|
---|
113 | while ($xi < $n_from && $this->xchanged[$xi]) {
|
---|
114 | $delete[] = $from_lines[$xi++];
|
---|
115 | }
|
---|
116 |
|
---|
117 | $add = array();
|
---|
118 | while ($yi < $n_to && $this->ychanged[$yi]) {
|
---|
119 | $add[] = $to_lines[$yi++];
|
---|
120 | }
|
---|
121 |
|
---|
122 | if ($delete && $add) {
|
---|
123 | $edits[] = &new Text_Diff_Op_change($delete, $add);
|
---|
124 | } elseif ($delete) {
|
---|
125 | $edits[] = &new Text_Diff_Op_delete($delete);
|
---|
126 | } elseif ($add) {
|
---|
127 | $edits[] = &new Text_Diff_Op_add($add);
|
---|
128 | }
|
---|
129 | }
|
---|
130 |
|
---|
131 | return $edits;
|
---|
132 | }
|
---|
133 |
|
---|
134 | /**
|
---|
135 | * Divides the Largest Common Subsequence (LCS) of the sequences (XOFF,
|
---|
136 | * XLIM) and (YOFF, YLIM) into NCHUNKS approximately equally sized
|
---|
137 | * segments.
|
---|
138 | *
|
---|
139 | * Returns (LCS, PTS). LCS is the length of the LCS. PTS is an array of
|
---|
140 | * NCHUNKS+1 (X, Y) indexes giving the diving points between sub
|
---|
141 | * sequences. The first sub-sequence is contained in (X0, X1), (Y0, Y1),
|
---|
142 | * the second in (X1, X2), (Y1, Y2) and so on. Note that (X0, Y0) ==
|
---|
143 | * (XOFF, YOFF) and (X[NCHUNKS], Y[NCHUNKS]) == (XLIM, YLIM).
|
---|
144 | *
|
---|
145 | * This function assumes that the first lines of the specified portions of
|
---|
146 | * the two files do not match, and likewise that the last lines do not
|
---|
147 | * match. The caller must trim matching lines from the beginning and end
|
---|
148 | * of the portions it is going to specify.
|
---|
149 | */
|
---|
150 | function _diag ($xoff, $xlim, $yoff, $ylim, $nchunks)
|
---|
151 | {
|
---|
152 | $flip = false;
|
---|
153 |
|
---|
154 | if ($xlim - $xoff > $ylim - $yoff) {
|
---|
155 | /* Things seems faster (I'm not sure I understand why) when the
|
---|
156 | * shortest sequence is in X. */
|
---|
157 | $flip = true;
|
---|
158 | list ($xoff, $xlim, $yoff, $ylim)
|
---|
159 | = array($yoff, $ylim, $xoff, $xlim);
|
---|
160 | }
|
---|
161 |
|
---|
162 | if ($flip) {
|
---|
163 | for ($i = $ylim - 1; $i >= $yoff; $i--) {
|
---|
164 | $ymatches[$this->xv[$i]][] = $i;
|
---|
165 | }
|
---|
166 | } else {
|
---|
167 | for ($i = $ylim - 1; $i >= $yoff; $i--) {
|
---|
168 | $ymatches[$this->yv[$i]][] = $i;
|
---|
169 | }
|
---|
170 | }
|
---|
171 |
|
---|
172 | $this->lcs = 0;
|
---|
173 | $this->seq[0]= $yoff - 1;
|
---|
174 | $this->in_seq = array();
|
---|
175 | $ymids[0] = array();
|
---|
176 |
|
---|
177 | $numer = $xlim - $xoff + $nchunks - 1;
|
---|
178 | $x = $xoff;
|
---|
179 | for ($chunk = 0; $chunk < $nchunks; $chunk++) {
|
---|
180 | if ($chunk > 0) {
|
---|
181 | for ($i = 0; $i <= $this->lcs; $i++) {
|
---|
182 | $ymids[$i][$chunk - 1] = $this->seq[$i];
|
---|
183 | }
|
---|
184 | }
|
---|
185 |
|
---|
186 | $x1 = $xoff + (int)(($numer + ($xlim - $xoff) * $chunk) / $nchunks);
|
---|
187 | for (; $x < $x1; $x++) {
|
---|
188 | $line = $flip ? $this->yv[$x] : $this->xv[$x];
|
---|
189 | if (empty($ymatches[$line])) {
|
---|
190 | continue;
|
---|
191 | }
|
---|
192 | $matches = $ymatches[$line];
|
---|
193 | reset($matches);
|
---|
194 | while (list(, $y) = each($matches)) {
|
---|
195 | if (empty($this->in_seq[$y])) {
|
---|
196 | $k = $this->_lcsPos($y);
|
---|
197 | assert($k > 0);
|
---|
198 | $ymids[$k] = $ymids[$k - 1];
|
---|
199 | break;
|
---|
200 | }
|
---|
201 | }
|
---|
202 | while (list(, $y) = each($matches)) {
|
---|
203 | if ($y > $this->seq[$k - 1]) {
|
---|
204 | assert($y <= $this->seq[$k]);
|
---|
205 | /* Optimization: this is a common case: next match is
|
---|
206 | * just replacing previous match. */
|
---|
207 | $this->in_seq[$this->seq[$k]] = false;
|
---|
208 | $this->seq[$k] = $y;
|
---|
209 | $this->in_seq[$y] = 1;
|
---|
210 | } elseif (empty($this->in_seq[$y])) {
|
---|
211 | $k = $this->_lcsPos($y);
|
---|
212 | assert($k > 0);
|
---|
213 | $ymids[$k] = $ymids[$k - 1];
|
---|
214 | }
|
---|
215 | }
|
---|
216 | }
|
---|
217 | }
|
---|
218 |
|
---|
219 | $seps[] = $flip ? array($yoff, $xoff) : array($xoff, $yoff);
|
---|
220 | $ymid = $ymids[$this->lcs];
|
---|
221 | for ($n = 0; $n < $nchunks - 1; $n++) {
|
---|
222 | $x1 = $xoff + (int)(($numer + ($xlim - $xoff) * $n) / $nchunks);
|
---|
223 | $y1 = $ymid[$n] + 1;
|
---|
224 | $seps[] = $flip ? array($y1, $x1) : array($x1, $y1);
|
---|
225 | }
|
---|
226 | $seps[] = $flip ? array($ylim, $xlim) : array($xlim, $ylim);
|
---|
227 |
|
---|
228 | return array($this->lcs, $seps);
|
---|
229 | }
|
---|
230 |
|
---|
231 | function _lcsPos($ypos)
|
---|
232 | {
|
---|
233 | $end = $this->lcs;
|
---|
234 | if ($end == 0 || $ypos > $this->seq[$end]) {
|
---|
235 | $this->seq[++$this->lcs] = $ypos;
|
---|
236 | $this->in_seq[$ypos] = 1;
|
---|
237 | return $this->lcs;
|
---|
238 | }
|
---|
239 |
|
---|
240 | $beg = 1;
|
---|
241 | while ($beg < $end) {
|
---|
242 | $mid = (int)(($beg + $end) / 2);
|
---|
243 | if ($ypos > $this->seq[$mid]) {
|
---|
244 | $beg = $mid + 1;
|
---|
245 | } else {
|
---|
246 | $end = $mid;
|
---|
247 | }
|
---|
248 | }
|
---|
249 |
|
---|
250 | assert($ypos != $this->seq[$end]);
|
---|
251 |
|
---|
252 | $this->in_seq[$this->seq[$end]] = false;
|
---|
253 | $this->seq[$end] = $ypos;
|
---|
254 | $this->in_seq[$ypos] = 1;
|
---|
255 | return $end;
|
---|
256 | }
|
---|
257 |
|
---|
258 | /**
|
---|
259 | * Finds LCS of two sequences.
|
---|
260 | *
|
---|
261 | * The results are recorded in the vectors $this->{x,y}changed[], by
|
---|
262 | * storing a 1 in the element for each line that is an insertion or
|
---|
263 | * deletion (ie. is not in the LCS).
|
---|
264 | *
|
---|
265 | * The subsequence of file 0 is (XOFF, XLIM) and likewise for file 1.
|
---|
266 | *
|
---|
267 | * Note that XLIM, YLIM are exclusive bounds. All line numbers are
|
---|
268 | * origin-0 and discarded lines are not counted.
|
---|
269 | */
|
---|
270 | function _compareseq ($xoff, $xlim, $yoff, $ylim)
|
---|
271 | {
|
---|
272 | /* Slide down the bottom initial diagonal. */
|
---|
273 | while ($xoff < $xlim && $yoff < $ylim
|
---|
274 | && $this->xv[$xoff] == $this->yv[$yoff]) {
|
---|
275 | ++$xoff;
|
---|
276 | ++$yoff;
|
---|
277 | }
|
---|
278 |
|
---|
279 | /* Slide up the top initial diagonal. */
|
---|
280 | while ($xlim > $xoff && $ylim > $yoff
|
---|
281 | && $this->xv[$xlim - 1] == $this->yv[$ylim - 1]) {
|
---|
282 | --$xlim;
|
---|
283 | --$ylim;
|
---|
284 | }
|
---|
285 |
|
---|
286 | if ($xoff == $xlim || $yoff == $ylim) {
|
---|
287 | $lcs = 0;
|
---|
288 | } else {
|
---|
289 | /* This is ad hoc but seems to work well. $nchunks =
|
---|
290 | * sqrt(min($xlim - $xoff, $ylim - $yoff) / 2.5); $nchunks =
|
---|
291 | * max(2,min(8,(int)$nchunks)); */
|
---|
292 | $nchunks = min(7, $xlim - $xoff, $ylim - $yoff) + 1;
|
---|
293 | list($lcs, $seps)
|
---|
294 | = $this->_diag($xoff, $xlim, $yoff, $ylim, $nchunks);
|
---|
295 | }
|
---|
296 |
|
---|
297 | if ($lcs == 0) {
|
---|
298 | /* X and Y sequences have no common subsequence: mark all
|
---|
299 | * changed. */
|
---|
300 | while ($yoff < $ylim) {
|
---|
301 | $this->ychanged[$this->yind[$yoff++]] = 1;
|
---|
302 | }
|
---|
303 | while ($xoff < $xlim) {
|
---|
304 | $this->xchanged[$this->xind[$xoff++]] = 1;
|
---|
305 | }
|
---|
306 | } else {
|
---|
307 | /* Use the partitions to split this problem into subproblems. */
|
---|
308 | reset($seps);
|
---|
309 | $pt1 = $seps[0];
|
---|
310 | while ($pt2 = next($seps)) {
|
---|
311 | $this->_compareseq ($pt1[0], $pt2[0], $pt1[1], $pt2[1]);
|
---|
312 | $pt1 = $pt2;
|
---|
313 | }
|
---|
314 | }
|
---|
315 | }
|
---|
316 |
|
---|
317 | /**
|
---|
318 | * Adjusts inserts/deletes of identical lines to join changes as much as
|
---|
319 | * possible.
|
---|
320 | *
|
---|
321 | * We do something when a run of changed lines include a line at one end
|
---|
322 | * and has an excluded, identical line at the other. We are free to
|
---|
323 | * choose which identical line is included. `compareseq' usually chooses
|
---|
324 | * the one at the beginning, but usually it is cleaner to consider the
|
---|
325 | * following identical line to be the "change".
|
---|
326 | *
|
---|
327 | * This is extracted verbatim from analyze.c (GNU diffutils-2.7).
|
---|
328 | */
|
---|
329 | function _shiftBoundaries($lines, &$changed, $other_changed)
|
---|
330 | {
|
---|
331 | $i = 0;
|
---|
332 | $j = 0;
|
---|
333 |
|
---|
334 | assert('count($lines) == count($changed)');
|
---|
335 | $len = count($lines);
|
---|
336 | $other_len = count($other_changed);
|
---|
337 |
|
---|
338 | while (1) {
|
---|
339 | /* Scan forward to find the beginning of another run of
|
---|
340 | * changes. Also keep track of the corresponding point in the
|
---|
341 | * other file.
|
---|
342 | *
|
---|
343 | * Throughout this code, $i and $j are adjusted together so that
|
---|
344 | * the first $i elements of $changed and the first $j elements of
|
---|
345 | * $other_changed both contain the same number of zeros (unchanged
|
---|
346 | * lines).
|
---|
347 | *
|
---|
348 | * Furthermore, $j is always kept so that $j == $other_len or
|
---|
349 | * $other_changed[$j] == false. */
|
---|
350 | while ($j < $other_len && $other_changed[$j]) {
|
---|
351 | $j++;
|
---|
352 | }
|
---|
353 |
|
---|
354 | while ($i < $len && ! $changed[$i]) {
|
---|
355 | assert('$j < $other_len && ! $other_changed[$j]');
|
---|
356 | $i++; $j++;
|
---|
357 | while ($j < $other_len && $other_changed[$j]) {
|
---|
358 | $j++;
|
---|
359 | }
|
---|
360 | }
|
---|
361 |
|
---|
362 | if ($i == $len) {
|
---|
363 | break;
|
---|
364 | }
|
---|
365 |
|
---|
366 | $start = $i;
|
---|
367 |
|
---|
368 | /* Find the end of this run of changes. */
|
---|
369 | while (++$i < $len && $changed[$i]) {
|
---|
370 | continue;
|
---|
371 | }
|
---|
372 |
|
---|
373 | do {
|
---|
374 | /* Record the length of this run of changes, so that we can
|
---|
375 | * later determine whether the run has grown. */
|
---|
376 | $runlength = $i - $start;
|
---|
377 |
|
---|
378 | /* Move the changed region back, so long as the previous
|
---|
379 | * unchanged line matches the last changed one. This merges
|
---|
380 | * with previous changed regions. */
|
---|
381 | while ($start > 0 && $lines[$start - 1] == $lines[$i - 1]) {
|
---|
382 | $changed[--$start] = 1;
|
---|
383 | $changed[--$i] = false;
|
---|
384 | while ($start > 0 && $changed[$start - 1]) {
|
---|
385 | $start--;
|
---|
386 | }
|
---|
387 | assert('$j > 0');
|
---|
388 | while ($other_changed[--$j]) {
|
---|
389 | continue;
|
---|
390 | }
|
---|
391 | assert('$j >= 0 && !$other_changed[$j]');
|
---|
392 | }
|
---|
393 |
|
---|
394 | /* Set CORRESPONDING to the end of the changed run, at the
|
---|
395 | * last point where it corresponds to a changed run in the
|
---|
396 | * other file. CORRESPONDING == LEN means no such point has
|
---|
397 | * been found. */
|
---|
398 | $corresponding = $j < $other_len ? $i : $len;
|
---|
399 |
|
---|
400 | /* Move the changed region forward, so long as the first
|
---|
401 | * changed line matches the following unchanged one. This
|
---|
402 | * merges with following changed regions. Do this second, so
|
---|
403 | * that if there are no merges, the changed region is moved
|
---|
404 | * forward as far as possible. */
|
---|
405 | while ($i < $len && $lines[$start] == $lines[$i]) {
|
---|
406 | $changed[$start++] = false;
|
---|
407 | $changed[$i++] = 1;
|
---|
408 | while ($i < $len && $changed[$i]) {
|
---|
409 | $i++;
|
---|
410 | }
|
---|
411 |
|
---|
412 | assert('$j < $other_len && ! $other_changed[$j]');
|
---|
413 | $j++;
|
---|
414 | if ($j < $other_len && $other_changed[$j]) {
|
---|
415 | $corresponding = $i;
|
---|
416 | while ($j < $other_len && $other_changed[$j]) {
|
---|
417 | $j++;
|
---|
418 | }
|
---|
419 | }
|
---|
420 | }
|
---|
421 | } while ($runlength != $i - $start);
|
---|
422 |
|
---|
423 | /* If possible, move the fully-merged run of changes back to a
|
---|
424 | * corresponding run in the other file. */
|
---|
425 | while ($corresponding < $i) {
|
---|
426 | $changed[--$start] = 1;
|
---|
427 | $changed[--$i] = 0;
|
---|
428 | assert('$j > 0');
|
---|
429 | while ($other_changed[--$j]) {
|
---|
430 | continue;
|
---|
431 | }
|
---|
432 | assert('$j >= 0 && !$other_changed[$j]');
|
---|
433 | }
|
---|
434 | }
|
---|
435 | }
|
---|
436 |
|
---|
437 | }
|
---|