001// License: GPL. For details, see LICENSE file.
002package org.openstreetmap.josm.tools;
003
004//// Taken from http://www.bmsi.com/java/#diff
005
006// http://www.bmsi.com/java/DiffPrint.java could also be useful
007
008/*
009 * $Log: Diff.java,v $
010 * Revision 1.15  2013/04/01 16:27:31  stuart
011 * Fix DiffPrint unified format with test cases.
012 * Begin porting some diff-2.7 features.
013 *
014 * Revision 1.14  2010/03/03 21:21:25  stuart
015 * Test new direct equivalence API
016 *
017 * Revision 1.13  2009/12/07 17:43:17  stuart
018 * Compute equivMax for int[] ctor
019 *
020 * Revision 1.12  2009/12/07 17:34:46  stuart
021 * Ctor with int[].
022 *
023 * Revision 1.11  2009/11/15 01:11:54  stuart
024 * Diff doesn't need to be generic
025 *
026 * Revision 1.10  2009/11/15 00:54:03  stuart
027 * Update to Java 5 containers
028 *
029 * Revision 1.7  2009/01/19 03:05:26  stuart
030 * Fix StackOverflow bug with heuristic on reported by Jimmy Han.
031 *
032 * Revision 1.6  2003/03/06 22:51:32  stuart
033 * Convert to CVS
034 *
035 * Revision 1.5  2002/07/19  19:14:40  stuart
036 * fix reverseScript, make change ctor public, update docs
037 *
038 * Revision 1.4  2002/04/09  17:53:39  stuart
039 * More flexible interface for diff() function.
040 *
041 * Revision 1.3  2000/03/03 21:58:03  stuart
042 * move discard_confusing_lines and shift_boundaries to class file_data
043 *
044 * Revision 1.2  2000/03/02  16:37:38  stuart
045 * Add GPL and copyright
046 *
047 */
048
049import java.util.HashMap;
050import java.util.Map;
051
052/** A class to compare vectors of objects.  The result of comparison
053    is a list of <code>change</code> objects which form an
054    edit script.  The objects compared are traditionally lines
055    of text from two files.  Comparison options such as "ignore
056    whitespace" are implemented by modifying the <code>equals</code>
057    and <code>hashcode</code> methods for the objects compared.
058<p>
059   The basic algorithm is described in: <br>
060   "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
061   Algorithmica Vol. 1 No. 2, 1986, p 251.
062<p>
063   This class outputs different results from GNU diff 1.15 on some
064   inputs.  Our results are actually better (smaller change list, smaller
065   total size of changes), but it would be nice to know why.  Perhaps
066   there is a memory overwrite bug in GNU diff 1.15.
067
068  @author Stuart D. Gathman, translated from GNU diff 1.15
069    Copyright (C) 2000  Business Management Systems, Inc.
070<p>
071    This program is free software; you can redistribute it and/or modify
072    it under the terms of the GNU General Public License as published by
073    the Free Software Foundation; either version 1, or (at your option)
074    any later version.
075<p>
076    This program is distributed in the hope that it will be useful,
077    but WITHOUT ANY WARRANTY; without even the implied warranty of
078    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
079    GNU General Public License for more details.
080<p>
081    You should have received a copy of the <a href=COPYING.txt>
082    GNU General Public License</a>
083    along with this program; if not, write to the Free Software
084    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
085 */
086public class Diff {
087
088    /** Prepare to find differences between two arrays.  Each element of
089      the arrays is translated to an "equivalence number" based on
090      the result of <code>equals</code>.  The original Object arrays
091      are no longer needed for computing the differences.  They will
092      be needed again later to print the results of the comparison as
093      an edit script, if desired.
094     * @param a first array
095     * @param b second array
096     */
097    public Diff(Object[] a, Object[] b) {
098        Map<Object, Integer> h = new HashMap<>(a.length + b.length);
099        filevec = new FileData[] {new FileData(a, h), new FileData(b, h)};
100    }
101
102    /** 1 more than the maximum equivalence value used for this or its
103     sibling file. */
104    private int equivMax = 1;
105
106    private int[] xvec, yvec; /* Vectors being compared. */
107    private int[] fdiag;      /* Vector, indexed by diagonal, containing
108                   the X coordinate of the point furthest
109                   along the given diagonal in the forward
110                   search of the edit matrix. */
111    private int[] bdiag;      /* Vector, indexed by diagonal, containing
112                   the X coordinate of the point furthest
113                   along the given diagonal in the backward
114                   search of the edit matrix. */
115    private int fdiagoff, bdiagoff;
116    private final FileData[] filevec;
117    private int cost;
118
119    /**
120     * Find the midpoint of the shortest edit script for a specified
121     * portion of the two files.
122     *
123     * We scan from the beginnings of the files, and simultaneously from the ends,
124     * doing a breadth-first search through the space of edit-sequence.
125     * When the two searches meet, we have found the midpoint of the shortest
126     * edit sequence.
127     *
128     * The value returned is the number of the diagonal on which the midpoint lies.
129     * The diagonal number equals the number of inserted lines minus the number
130     * of deleted lines (counting only lines before the midpoint).
131     * The edit cost is stored into COST; this is the total number of
132     * lines inserted or deleted (counting only lines before the midpoint).
133     *
134     * This function assumes that the first lines of the specified portions
135     * of the two files do not match, and likewise that the last lines do not
136     * match.  The caller must trim matching lines from the beginning and end
137     * of the portions it is going to specify.
138     *
139     * Note that if we return the "wrong" diagonal value, or if
140     * the value of bdiag at that diagonal is "wrong",
141     * the worst this can do is cause suboptimal diff output.
142     * It cannot cause incorrect diff output.
143     * @param xoff xoff
144     * @param xlim xlim
145     * @param yoff yoff
146     * @param ylim ylim
147     * @return midpoint of the shortest edit script
148     */
149    private int diag(int xoff, int xlim, int yoff, int ylim) {
150        final int[] fd = fdiag; // Give the compiler a chance.
151        final int[] bd = bdiag; // Additional help for the compiler.
152        final int[] xv = xvec;      // Still more help for the compiler.
153        final int[] yv = yvec;      // And more and more . . .
154        final int dmin = xoff - ylim;   // Minimum valid diagonal.
155        final int dmax = xlim - yoff;   // Maximum valid diagonal.
156        final int fmid = xoff - yoff;   // Center diagonal of top-down search.
157        final int bmid = xlim - ylim;   // Center diagonal of bottom-up search.
158        int fmin = fmid, fmax = fmid;   // Limits of top-down search.
159        int bmin = bmid, bmax = bmid;   // Limits of bottom-up search.
160        // True if southeast corner is on an odd diagonal with respect to the northwest.
161        final boolean odd = (fmid - bmid & 1) != 0;
162
163        fd[fdiagoff + fmid] = xoff;
164        bd[bdiagoff + bmid] = xlim;
165
166        for (int c = 1;; ++c) {
167            int d;          /* Active diagonal. */
168
169            /* Extend the top-down search by an edit step in each diagonal. */
170            if (fmin > dmin) {
171                fd[fdiagoff + --fmin - 1] = -1;
172            } else {
173                ++fmin;
174            }
175            if (fmax < dmax) {
176                fd[fdiagoff + ++fmax + 1] = -1;
177            } else {
178                --fmax;
179            }
180            for (d = fmax; d >= fmin; d -= 2) {
181                int x, y, tlo = fd[fdiagoff + d - 1], thi = fd[fdiagoff + d + 1];
182
183                if (tlo >= thi) {
184                    x = tlo + 1;
185                } else {
186                    x = thi;
187                }
188                y = x - d;
189                while (x < xlim && y < ylim && xv[x] == yv[y]) {
190                    ++x; ++y;
191                }
192                fd[fdiagoff + d] = x;
193                if (odd && bmin <= d && d <= bmax && bd[bdiagoff + d] <= fd[fdiagoff + d]) {
194                    cost = 2 * c - 1;
195                    return d;
196                }
197            }
198
199            /* Similar extend the bottom-up search. */
200            if (bmin > dmin) {
201                bd[bdiagoff + --bmin - 1] = Integer.MAX_VALUE;
202            } else {
203                ++bmin;
204            }
205            if (bmax < dmax) {
206                bd[bdiagoff + ++bmax + 1] = Integer.MAX_VALUE;
207            } else {
208                --bmax;
209            }
210            for (d = bmax; d >= bmin; d -= 2) {
211                int x, y, tlo = bd[bdiagoff + d - 1], thi = bd[bdiagoff + d + 1];
212
213                if (tlo < thi) {
214                    x = tlo;
215                } else {
216                    x = thi - 1;
217                }
218                y = x - d;
219                while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) {
220                    --x; --y;
221                }
222                bd[bdiagoff + d] = x;
223                if (!odd && fmin <= d && d <= fmax && bd[bdiagoff + d] <= fd[fdiagoff + d]) {
224                    cost = 2 * c;
225                    return d;
226                }
227            }
228        }
229    }
230
231    /**
232     * Compare in detail contiguous subsequences of the two files
233     * which are known, as a whole, to match each other.
234     *
235     * The results are recorded in the vectors filevec[N].changed_flag, by
236     * storing a 1 in the element for each line that is an insertion or deletion.
237     *
238     * The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
239     *
240     * Note that XLIM, YLIM are exclusive bounds.
241     * All line numbers are origin-0 and discarded lines are not counted.
242     * @param xoff xoff
243     * @param xlim xlim
244     * @param yoff yoff
245     * @param ylim ylim
246     */
247    private void compareseq(int xoff, int xlim, int yoff, int ylim) {
248        /* Slide down the bottom initial diagonal. */
249        while (xoff < xlim && yoff < ylim && xvec[xoff] == yvec[yoff]) {
250            ++xoff; ++yoff;
251        }
252        /* Slide up the top initial diagonal. */
253        while (xlim > xoff && ylim > yoff && xvec[xlim - 1] == yvec[ylim - 1]) {
254            --xlim; --ylim;
255        }
256
257        /* Handle simple cases. */
258        if (xoff == xlim) {
259            while (yoff < ylim) {
260                filevec[1].changedFlag[1+filevec[1].realindexes[yoff++]] = true;
261            }
262        } else if (yoff == ylim) {
263            while (xoff < xlim) {
264                filevec[0].changedFlag[1+filevec[0].realindexes[xoff++]] = true;
265            }
266        } else {
267            /* Find a point of correspondence in the middle of the files.  */
268
269            int d = diag(xoff, xlim, yoff, ylim);
270            int c = cost;
271            int b = bdiag[bdiagoff + d];
272
273            if (c == 1)
274                /* This should be impossible, because it implies that
275                   one of the two subsequences is empty,
276                   and that case was handled above without calling `diag'.
277                   Let's verify that this is true.  */
278                throw new IllegalArgumentException("Empty subsequence");
279            else {
280                /* Use that point to split this problem into two subproblems.  */
281                compareseq(xoff, b, yoff, b - d);
282                /* This used to use f instead of b,
283                   but that is incorrect!
284                   It is not necessarily the case that diagonal d
285                   has a snake from b to f.  */
286                compareseq(b, xlim, b - d, ylim);
287            }
288        }
289    }
290
291    /** Discard lines from one file that have no matches in the other file.
292     */
293    private void discard_confusing_lines() {
294        filevec[0].discard_confusing_lines(filevec[1]);
295        filevec[1].discard_confusing_lines(filevec[0]);
296    }
297
298    /**
299     * Adjust inserts/deletes of blank lines to join changes as much as possible.
300     */
301    private void shift_boundaries() {
302        filevec[0].shift_boundaries(filevec[1]);
303        filevec[1].shift_boundaries(filevec[0]);
304    }
305
306    /**
307     * Script builder.
308     */
309    public interface ScriptBuilder {
310        /**
311         * Scan the tables of which lines are inserted and deleted, producing an edit script.
312         * @param changed0 true for lines in first file which do not match 2nd
313         * @param len0 number of lines in first file
314         * @param changed1 true for lines in 2nd file which do not match 1st
315         * @param len1 number of lines in 2nd file
316         * @return a linked list of changes - or null
317         */
318        Change build_script(
319                boolean[] changed0, int len0,
320                boolean[] changed1, int len1
321        );
322    }
323
324    /** Scan the tables of which lines are inserted and deleted,
325     producing an edit script in reverse order.  */
326
327    static class ReverseScript implements ScriptBuilder {
328        @Override
329        public  Change build_script(
330                final boolean[] changed0, int len0,
331                final boolean[] changed1, int len1) {
332            Change script = null;
333            int i0 = 0, i1 = 0;
334            while (i0 < len0 || i1 < len1) {
335                if (changed0[1+i0] || changed1[1+i1]) {
336                    int line0 = i0, line1 = i1;
337
338                    /* Find # lines changed here in each file.  */
339                    while (changed0[1+i0]) {
340                        ++i0;
341                    }
342                    while (changed1[1+i1]) {
343                        ++i1;
344                    }
345
346                    /* Record this change.  */
347                    script = new Change(line0, line1, i0 - line0, i1 - line1, script);
348                }
349
350                /* We have reached lines in the two files that match each other.  */
351                i0++; i1++;
352            }
353
354            return script;
355        }
356    }
357
358    static class ForwardScript implements ScriptBuilder {
359        /** Scan the tables of which lines are inserted and deleted,
360            producing an edit script in forward order.  */
361        @Override
362        public Change build_script(
363                final boolean[] changed0, int len0,
364                final boolean[] changed1, int len1) {
365            Change script = null;
366            int i0 = len0, i1 = len1;
367
368            while (i0 >= 0 || i1 >= 0) {
369                if (changed0[i0] || changed1[i1]) {
370                    int line0 = i0, line1 = i1;
371
372                    /* Find # lines changed here in each file.  */
373                    while (changed0[i0]) {
374                        --i0;
375                    }
376                    while (changed1[i1]) {
377                        --i1;
378                    }
379
380                    /* Record this change.  */
381                    script = new Change(i0, i1, line0 - i0, line1 - i1, script);
382                }
383
384                /* We have reached lines in the two files that match each other.  */
385                i0--; i1--;
386            }
387
388            return script;
389        }
390    }
391
392    /** Standard Forward ScriptBuilder. */
393    public static final ScriptBuilder forwardScript = new ForwardScript();
394    /** Standard Reverse ScriptBuilder. */
395    public static final ScriptBuilder reverseScript = new ReverseScript();
396
397    /**
398     * Report the differences of two files. DEPTH is the current directory depth.
399     * @param reverse if {@code true} use {@link #reverseScript} else use {@link #forwardScript}
400     * @return the differences of two files
401     */
402    public final Change diff_2(final boolean reverse) {
403        return diff(reverse ? reverseScript : forwardScript);
404    }
405
406    /**
407     * Get the results of comparison as an edit script.  The script
408     * is described by a list of changes.  The standard ScriptBuilder
409     * implementations provide for forward and reverse edit scripts.
410     * Alternate implementations could, for instance, list common elements
411     * instead of differences.
412     * @param bld an object to build the script from change flags
413     * @return the head of a list of changes
414     */
415    public Change diff(final ScriptBuilder bld) {
416
417        // Some lines are obviously insertions or deletions because they don't match anything.
418        // Detect them now, and avoid even thinking about them in the main comparison algorithm.
419        discard_confusing_lines();
420
421        // Now do the main comparison algorithm, considering just the undiscarded lines.
422        xvec = filevec[0].undiscarded;
423        yvec = filevec[1].undiscarded;
424
425        int diags = filevec[0].nondiscardedLines + filevec[1].nondiscardedLines + 3;
426        fdiag = new int[diags];
427        fdiagoff = filevec[1].nondiscardedLines + 1;
428        bdiag = new int[diags];
429        bdiagoff = filevec[1].nondiscardedLines + 1;
430
431        compareseq(0, filevec[0].nondiscardedLines,
432                   0, filevec[1].nondiscardedLines);
433        fdiag = null;
434        bdiag = null;
435
436        // Modify the results slightly to make them prettier in cases where that can validly be done.
437        shift_boundaries();
438
439        // Get the results of comparison in the form of a chain of `struct change's -- an edit script.
440        return bld.build_script(
441                filevec[0].changedFlag,
442                filevec[0].bufferedLines,
443                filevec[1].changedFlag,
444                filevec[1].bufferedLines
445        );
446    }
447
448    /** The result of comparison is an "edit script": a chain of change objects.
449     Each change represents one place where some lines are deleted
450     and some are inserted.
451
452     LINE0 and LINE1 are the first affected lines in the two files (origin 0).
453     DELETED is the number of lines deleted here from file 0.
454     INSERTED is the number of lines inserted here in file 1.
455
456     If DELETED is 0 then LINE0 is the number of the line before
457     which the insertion was done; vice versa for INSERTED and LINE1.  */
458
459    public static class Change {
460        /** Previous or next edit command. */
461        public Change link;
462        /** # lines of file 1 changed here.  */
463        public final int inserted;
464        /** # lines of file 0 changed here.  */
465        public final int deleted;
466        /** Line number of 1st deleted line.  */
467        public final int line0;
468        /** Line number of 1st inserted line.  */
469        public final int line1;
470
471        /**
472         * Cons an additional entry onto the front of an edit script OLD.
473         * LINE0 and LINE1 are the first affected lines in the two files (origin 0).
474         * DELETED is the number of lines deleted here from file 0.
475         * INSERTED is the number of lines inserted here in file 1.
476         *
477         * If DELETED is 0 then LINE0 is the number of the line before
478         * which the insertion was done; vice versa for INSERTED and LINE1.
479         * @param line0 first affected lines in the two files (origin 0)
480         * @param line1 first affected lines in the two files (origin 0)
481         * @param deleted the number of lines deleted here from file 0
482         * @param inserted the number of lines inserted here in file 1
483         * @param old edit script
484         */
485        public Change(int line0, int line1, int deleted, int inserted, Change old) {
486            this.line0 = line0;
487            this.line1 = line1;
488            this.inserted = inserted;
489            this.deleted = deleted;
490            this.link = old;
491        }
492
493        /**
494         * Returns the number of insertions and deletions of this change as well as
495         * (recursively) the changes linked via {@link #link}.
496         * @return recursive number of insertions and deletions
497         */
498        public int getTotalNumberOfChanges() {
499            return inserted + deleted + (link != null ? link.getTotalNumberOfChanges() : 0);
500        }
501
502        @Override
503        public String toString() {
504            String s = String.format("%d -%d +%d %d", line0, deleted, inserted, line1);
505            return (link != null) ? s + '\n' + link : s;
506        }
507    }
508
509    /**
510     * Data on one input file being compared.
511     */
512    class FileData {
513
514        /** Allocate changed array for the results of comparison.  */
515        void clear() {
516            // Allocate a flag for each line of each file, saying whether that line is an insertion or deletion.
517            // Allocate an extra element, always zero, at each end of each vector.
518            changedFlag = new boolean[bufferedLines + 2];
519        }
520
521        /**
522         * Return equiv_count[I] as the number of lines in this file that fall in equivalence class I.
523         * @return the array of equivalence class counts.
524         */
525        int[] equivCount() {
526            int[] equivCount = new int[equivMax];
527            for (int i = 0; i < bufferedLines; ++i) {
528                ++equivCount[equivs[i]];
529            }
530            return equivCount;
531        }
532
533        /**
534         * Discard lines that have no matches in another file.
535         *
536         * A line which is discarded will not be considered by the actual comparison algorithm;
537         * it will be as if that line were not in the file.
538         * The file's `realindexes' table maps virtual line numbers
539         * (which don't count the discarded lines) into real line numbers;
540         * this is how the actual comparison algorithm produces results
541         * that are comprehensible when the discarded lines are counted.
542         * <p>
543         * When we discard a line, we also mark it as a deletion or insertion so that it will be printed in the output.
544         * @param f the other file
545         */
546        void discard_confusing_lines(FileData f) {
547            clear();
548            // Set up table of which lines are going to be discarded.
549            final byte[] discarded = discardable(f.equivCount());
550
551            // Don't really discard the provisional lines except when they occur in a run of discardables,
552            // with nonprovisionals at the beginning and end.
553            filterDiscards(discarded);
554
555            // Actually discard the lines.
556            discard(discarded);
557        }
558
559        /**
560         * Mark to be discarded each line that matches no line of another file.
561         * If a line matches many lines, mark it as provisionally discardable.
562         * @param counts The count of each equivalence number for the other file.
563         * @return 0=nondiscardable, 1=discardable or 2=provisionally discardable for each line
564         * @see #equivCount()
565         */
566        private byte[] discardable(final int[] counts) {
567            final int end = bufferedLines;
568            final byte[] discards = new byte[end];
569            final int[] equivs = this.equivs;
570            int many = 5;
571            int tem = end / 64;
572
573            /* Multiply MANY by approximate square root of number of lines.
574               That is the threshold for provisionally discardable lines.  */
575            while ((tem = tem >> 2) > 0) {
576                many *= 2;
577            }
578
579            for (int i = 0; i < end; i++) {
580                int nmatch;
581                if (equivs[i] == 0) {
582                    continue;
583                }
584                nmatch = counts[equivs[i]];
585                if (nmatch == 0) {
586                    discards[i] = 1;
587                } else if (nmatch > many) {
588                    discards[i] = 2;
589                }
590            }
591            return discards;
592        }
593
594        /**
595         * Don't really discard the provisional lines except when they occur
596         * in a run of discardables, with nonprovisionals at the beginning and end.
597         * @param discards discards
598         */
599        private void filterDiscards(final byte[] discards) {
600            final int end = bufferedLines;
601
602            for (int i = 0; i < end; i++) {
603                /* Cancel provisional discards not in middle of run of discards.  */
604                if (discards[i] == 2) {
605                    discards[i] = 0;
606                } else if (discards[i] != 0) {
607                    /* We have found a nonprovisional discard.  */
608                    int j;
609                    int length;
610                    int provisional = 0;
611
612                    /* Find end of this run of discardable lines.
613                       Count how many are provisionally discardable.  */
614                    for (j = i; j < end; j++) {
615                        if (discards[j] == 0) {
616                            break;
617                        }
618                        if (discards[j] == 2) {
619                            ++provisional;
620                        }
621                    }
622
623                    /* Cancel provisional discards at end, and shrink the run.  */
624                    while (j > i && discards[j - 1] == 2) {
625                        discards[--j] = 0; --provisional;
626                    }
627
628                    /* Now we have the length of a run of discardable lines
629                       whose first and last are not provisional.  */
630                    length = j - i;
631
632                    /* If 1/4 of the lines in the run are provisional,
633                       cancel discarding of all provisional lines in the run.  */
634                    if (provisional * 4 > length) {
635                        while (j > i)
636                            if (discards[--j] == 2) {
637                                discards[j] = 0;
638                            }
639                    } else {
640                        int consec;
641                        int minimum = 1;
642                        int tem = length / 4;
643
644                        /* MINIMUM is approximate square root of LENGTH/4.
645                           A subrun of two or more provisionals can stand
646                           when LENGTH is at least 16.
647                           A subrun of 4 or more can stand when LENGTH >= 64.  */
648                        while ((tem = tem >> 2) > 0) {
649                            minimum *= 2;
650                        }
651                        minimum++;
652
653                        /* Cancel any subrun of MINIMUM or more provisionals
654                           within the larger run.  */
655                        for (j = 0, consec = 0; j < length; j++) {
656                            if (discards[i + j] != 2) {
657                                consec = 0;
658                            } else if (minimum == ++consec) {
659                                /* Back up to start of subrun, to cancel it all.  */
660                                j -= consec;
661                            } else if (minimum < consec) {
662                                discards[i + j] = 0;
663                            }
664                        }
665
666                        /* Scan from beginning of run
667                           until we find 3 or more nonprovisionals in a row
668                           or until the first nonprovisional at least 8 lines in.
669                           Until that point, cancel any provisionals.  */
670                        for (j = 0, consec = 0; j < length; j++) {
671                            if (j >= 8 && discards[i + j] == 1) {
672                                break;
673                            }
674                            if (discards[i + j] == 2) {
675                                consec = 0; discards[i + j] = 0;
676                            } else if (discards[i + j] == 0) {
677                                consec = 0;
678                            } else {
679                                consec++;
680                            }
681                            if (consec == 3) {
682                                break;
683                            }
684                        }
685
686                        /* I advances to the last line of the run.  */
687                        i += length - 1;
688
689                        /* Same thing, from end.  */
690                        for (j = 0, consec = 0; j < length; j++) {
691                            if (j >= 8 && discards[i - j] == 1) {
692                                break;
693                            }
694                            if (discards[i - j] == 2) {
695                                consec = 0; discards[i - j] = 0;
696                            } else if (discards[i - j] == 0) {
697                                consec = 0;
698                            } else {
699                                consec++;
700                            }
701                            if (consec == 3) {
702                                break;
703                            }
704                        }
705                    }
706                }
707            }
708        }
709
710        /** Actually discard the lines.
711            @param discards flags lines to be discarded
712         */
713        private void discard(final byte[] discards) {
714            final int end = bufferedLines;
715            int j = 0;
716            for (int i = 0; i < end; ++i) {
717                if (discards[i] == 0) {
718                    undiscarded[j] = equivs[i];
719                    realindexes[j++] = i;
720                } else {
721                    changedFlag[1+i] = true;
722                }
723            }
724            nondiscardedLines = j;
725        }
726
727        FileData(int length) {
728            bufferedLines = length;
729            equivs = new int[length];
730            undiscarded = new int[bufferedLines];
731            realindexes = new int[bufferedLines];
732        }
733
734        FileData(Object[] data, Map<Object, Integer> h) {
735            this(data.length);
736            // FIXME: diff 2.7 removes common prefix and common suffix
737            for (int i = 0; i < data.length; ++i) {
738                Integer ir = h.get(data[i]);
739                if (ir == null) {
740                    equivs[i] = equivMax++;
741                    h.put(data[i], equivs[i]);
742                } else {
743                    equivs[i] = ir.intValue();
744                }
745            }
746        }
747
748        /** Adjust inserts/deletes of blank lines to join changes
749       as much as possible.
750
751       We do something when a run of changed lines include a blank
752       line at one end and have an excluded blank line at the other.
753       We are free to choose which blank line is included.
754       `compareseq' always chooses the one at the beginning,
755       but usually it is cleaner to consider the following blank line
756       to be the "change".  The only exception is if the preceding blank line
757       would join this change to other changes.
758      @param f the file being compared against
759         */
760        void shift_boundaries(FileData f) {
761            final boolean[] changed = changedFlag;
762            final boolean[] otherChanged = f.changedFlag;
763            int i = 0;
764            int j = 0;
765            int iEnd = bufferedLines;
766            int preceding = -1;
767            int otherPreceding = -1;
768
769            for (;;) {
770                int start, end, otherStart;
771
772                /* Scan forwards to find beginning of another run of changes.
773                   Also keep track of the corresponding point in the other file.  */
774
775                while (i < iEnd && !changed[1+i]) {
776                    while (otherChanged[1+j++]) {
777                        /* Non-corresponding lines in the other file
778                           will count as the preceding batch of changes.  */
779                        otherPreceding = j;
780                    }
781                    i++;
782                }
783
784                if (i == iEnd) {
785                    break;
786                }
787
788                start = i;
789                otherStart = j;
790
791                for (;;) {
792                    /* Now find the end of this run of changes.  */
793
794                    while (i < iEnd && changed[1+i]) {
795                        i++;
796                    }
797                    end = i;
798
799                    /* If the first changed line matches the following unchanged one,
800                       and this run does not follow right after a previous run,
801                       and there are no lines deleted from the other file here,
802                       then classify the first changed line as unchanged
803                       and the following line as changed in its place.  */
804
805                    /* You might ask, how could this run follow right after another?
806                       Only because the previous run was shifted here.  */
807
808                    if (end != iEnd && equivs[start] == equivs[end] && !otherChanged[1+j]
809                         && !((preceding >= 0 && start == preceding) || (otherPreceding >= 0 && otherStart == otherPreceding))) {
810                        changed[1+end++] = true;
811                        changed[1+start++] = false;
812                        ++i;
813                        /* Since one line-that-matches is now before this run
814                           instead of after, we must advance in the other file
815                           to keep in synch.  */
816                        ++j;
817                    } else {
818                        break;
819                    }
820                }
821
822                preceding = i;
823                otherPreceding = j;
824            }
825        }
826
827        /** Number of elements (lines) in this file. */
828        private final int bufferedLines;
829
830        /** Vector, indexed by line number, containing an equivalence code for
831           each line.  It is this vector that is actually compared with that
832           of another file to generate differences. */
833        private final int[]     equivs;
834
835        /** Vector, like the previous one except that
836           the elements for discarded lines have been squeezed out.  */
837        private final int[]    undiscarded;
838
839        /** Vector mapping virtual line numbers (not counting discarded lines)
840           to real ones (counting those lines).  Both are origin-0.  */
841        private final int[]    realindexes;
842
843        /** Total number of nondiscarded lines. */
844        private int         nondiscardedLines;
845
846        /** Array, indexed by real origin-1 line number,
847           containing true for a line that is an insertion or a deletion.
848           The results of comparison are stored here.  */
849        private boolean[]       changedFlag;
850    }
851}