001    package org.openstreetmap.josm.tools;
002    
003    //// Taken from http://www.bmsi.com/java/#diff
004    
005    // http://www.bmsi.com/java/DiffPrint.java could also be useful
006    
007    /*
008     * $Log: Diff.java,v $
009     * Revision 1.7  2009/01/19 03:05:26  stuart
010     * Fix StackOverflow bug with heuristic on reported by Jimmy Han.
011     *
012     * Revision 1.6  2003/03/06 22:51:32  stuart
013     * Convert to CVS
014     *
015     * Revision 1.5  2002/07/19  19:14:40  stuart
016     * fix reverseScript, make change ctor public, update docs
017     *
018     * Revision 1.4  2002/04/09  17:53:39  stuart
019     * More flexible interface for diff() function.
020     *
021     * Revision 1.3  2000/03/03 21:58:03  stuart
022     * move discard_confusing_lines and shift_boundaries to class file_data
023     *
024     * Revision 1.2  2000/03/02  16:37:38  stuart
025     * Add GPL and copyright
026     *
027     */
028    
029    import java.util.Hashtable;
030    
031    /** A class to compare vectors of objects.  The result of comparison
032        is a list of <code>change</code> objects which form an
033        edit script.  The objects compared are traditionally lines
034        of text from two files.  Comparison options such as "ignore
035        whitespace" are implemented by modifying the <code>equals</code>
036        and <code>hashcode</code> methods for the objects compared.
037    <p>
038       The basic algorithm is described in: </br>
039       "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
040       Algorithmica Vol. 1 No. 2, 1986, p 251.
041    <p>
042       This class outputs different results from GNU diff 1.15 on some
043       inputs.  Our results are actually better (smaller change list, smaller
044       total size of changes), but it would be nice to know why.  Perhaps
045       there is a memory overwrite bug in GNU diff 1.15.
046    
047      @author Stuart D. Gathman, translated from GNU diff 1.15
048        Copyright (C) 2000  Business Management Systems, Inc.
049    <p>
050        This program is free software; you can redistribute it and/or modify
051        it under the terms of the GNU General Public License as published by
052        the Free Software Foundation; either version 1, or (at your option)
053        any later version.
054    <p>
055        This program is distributed in the hope that it will be useful,
056        but WITHOUT ANY WARRANTY; without even the implied warranty of
057        MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
058        GNU General Public License for more details.
059    <p>
060        You should have received a copy of the <a href=COPYING.txt>
061        GNU General Public License</a>
062        along with this program; if not, write to the Free Software
063        Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
064    
065     */
066    
067    public class Diff {
068    
069        /** Prepare to find differences between two arrays.  Each element of
070          the arrays is translated to an "equivalence number" based on
071          the result of <code>equals</code>.  The original Object arrays
072          are no longer needed for computing the differences.  They will
073          be needed again later to print the results of the comparison as
074          an edit script, if desired.
075         */
076        public Diff(Object[] a,Object[] b) {
077            Hashtable<Object, Integer> h = new Hashtable<Object, Integer>(a.length + b.length);
078            filevec[0] = new file_data(a,h);
079            filevec[1] = new file_data(b,h);
080        }
081    
082        /** 1 more than the maximum equivalence value used for this or its
083         sibling file. */
084        private int equiv_max = 1;
085    
086        /** When set to true, the comparison uses a heuristic to speed it up.
087        With this heuristic, for files with a constant small density
088        of changes, the algorithm is linear in the file size.  */
089        public boolean heuristic = false;
090    
091        /** When set to true, the algorithm returns a guarranteed minimal
092          set of changes.  This makes things slower, sometimes much slower. */
093        public boolean no_discards = false;
094    
095        private int[] xvec, yvec; /* Vectors being compared. */
096        private int[] fdiag;      /* Vector, indexed by diagonal, containing
097                       the X coordinate of the point furthest
098                       along the given diagonal in the forward
099                       search of the edit matrix. */
100        private int[] bdiag;      /* Vector, indexed by diagonal, containing
101                       the X coordinate of the point furthest
102                       along the given diagonal in the backward
103                       search of the edit matrix. */
104        private int fdiagoff, bdiagoff;
105        private final file_data[] filevec = new file_data[2];
106        private int cost;
107        /** Snakes bigger than this are considered "big". */
108        private static final int SNAKE_LIMIT = 20;
109    
110        /** Find the midpoint of the shortest edit script for a specified
111         portion of the two files.
112    
113         We scan from the beginnings of the files, and simultaneously from the ends,
114         doing a breadth-first search through the space of edit-sequence.
115         When the two searches meet, we have found the midpoint of the shortest
116         edit sequence.
117    
118         The value returned is the number of the diagonal on which the midpoint lies.
119         The diagonal number equals the number of inserted lines minus the number
120         of deleted lines (counting only lines before the midpoint).
121         The edit cost is stored into COST; this is the total number of
122         lines inserted or deleted (counting only lines before the midpoint).
123    
124         This function assumes that the first lines of the specified portions
125         of the two files do not match, and likewise that the last lines do not
126         match.  The caller must trim matching lines from the beginning and end
127         of the portions it is going to specify.
128    
129         Note that if we return the "wrong" diagonal value, or if
130         the value of bdiag at that diagonal is "wrong",
131         the worst this can do is cause suboptimal diff output.
132         It cannot cause incorrect diff output.  */
133    
134        private int diag (int xoff, int xlim, int yoff, int ylim) {
135            final int[] fd = fdiag; // Give the compiler a chance.
136            final int[] bd = bdiag; // Additional help for the compiler.
137            final int[] xv = xvec;      // Still more help for the compiler.
138            final int[] yv = yvec;      // And more and more . . .
139            final int dmin = xoff - ylim;   // Minimum valid diagonal.
140            final int dmax = xlim - yoff;   // Maximum valid diagonal.
141            final int fmid = xoff - yoff;   // Center diagonal of top-down search.
142            final int bmid = xlim - ylim;   // Center diagonal of bottom-up search.
143            int fmin = fmid, fmax = fmid;   // Limits of top-down search.
144            int bmin = bmid, bmax = bmid;   // Limits of bottom-up search.
145            /* True if southeast corner is on an odd
146                         diagonal with respect to the northwest. */
147            final boolean odd = (fmid - bmid & 1) != 0;
148    
149            fd[fdiagoff + fmid] = xoff;
150            bd[bdiagoff + bmid] = xlim;
151    
152            for (int c = 1;; ++c)
153            {
154                int d;          /* Active diagonal. */
155                boolean big_snake = false;
156    
157                /* Extend the top-down search by an edit step in each diagonal. */
158                if (fmin > dmin) {
159                    fd[fdiagoff + --fmin - 1] = -1;
160                } else {
161                    ++fmin;
162                }
163                if (fmax < dmax) {
164                    fd[fdiagoff + ++fmax + 1] = -1;
165                } else {
166                    --fmax;
167                }
168                for (d = fmax; d >= fmin; d -= 2)
169                {
170                    int x, y, oldx, tlo = fd[fdiagoff + d - 1], thi = fd[fdiagoff + d + 1];
171    
172                    if (tlo >= thi) {
173                        x = tlo + 1;
174                    } else {
175                        x = thi;
176                    }
177                    oldx = x;
178                    y = x - d;
179                    while (x < xlim && y < ylim && xv[x] == yv[y]) {
180                        ++x; ++y;
181                    }
182                    if (x - oldx > SNAKE_LIMIT) {
183                        big_snake = true;
184                    }
185                    fd[fdiagoff + d] = x;
186                    if (odd && bmin <= d && d <= bmax && bd[bdiagoff + d] <= fd[fdiagoff + d])
187                    {
188                        cost = 2 * c - 1;
189                        return d;
190                    }
191                }
192    
193                /* Similar extend the bottom-up search. */
194                if (bmin > dmin) {
195                    bd[bdiagoff + --bmin - 1] = Integer.MAX_VALUE;
196                } else {
197                    ++bmin;
198                }
199                if (bmax < dmax) {
200                    bd[bdiagoff + ++bmax + 1] = Integer.MAX_VALUE;
201                } else {
202                    --bmax;
203                }
204                for (d = bmax; d >= bmin; d -= 2)
205                {
206                    int x, y, oldx, tlo = bd[bdiagoff + d - 1], thi = bd[bdiagoff + d + 1];
207    
208                    if (tlo < thi) {
209                        x = tlo;
210                    } else {
211                        x = thi - 1;
212                    }
213                    oldx = x;
214                    y = x - d;
215                    while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) {
216                        --x; --y;
217                    }
218                    if (oldx - x > SNAKE_LIMIT) {
219                        big_snake = true;
220                    }
221                    bd[bdiagoff + d] = x;
222                    if (!odd && fmin <= d && d <= fmax && bd[bdiagoff + d] <= fd[fdiagoff + d])
223                    {
224                        cost = 2 * c;
225                        return d;
226                    }
227                }
228    
229                /* Heuristic: check occasionally for a diagonal that has made
230           lots of progress compared with the edit distance.
231           If we have any such, find the one that has made the most
232           progress and return it as if it had succeeded.
233    
234           With this heuristic, for files with a constant small density
235           of changes, the algorithm is linear in the file size.  */
236    
237                if (c > 200 && big_snake && heuristic)
238                {
239                    int best = 0;
240                    int bestpos = -1;
241    
242                    for (d = fmax; d >= fmin; d -= 2)
243                    {
244                        int dd = d - fmid;
245                        int x = fd[fdiagoff + d];
246                        int y = x - d;
247                        int v = (x - xoff) * 2 - dd;
248                        if (v > 12 * (c + (dd < 0 ? -dd : dd)))
249                        {
250                            if (v > best
251                                    && xoff + SNAKE_LIMIT <= x && x < xlim
252                                    && yoff + SNAKE_LIMIT <= y && y < ylim)
253                            {
254                                /* We have a good enough best diagonal;
255                   now insist that it end with a significant snake.  */
256                                int k;
257    
258                                for (k = 1; xvec[x - k] == yvec[y - k]; k++)
259                                    if (k == SNAKE_LIMIT)
260                                    {
261                                        best = v;
262                                        bestpos = d;
263                                        break;
264                                    }
265                            }
266                        }
267                    }
268                    if (best > 0)
269                    {
270                        cost = 2 * c - 1;
271                        return bestpos;
272                    }
273    
274                    best = 0;
275                    for (d = bmax; d >= bmin; d -= 2)
276                    {
277                        int dd = d - bmid;
278                        int x = bd[bdiagoff + d];
279                        int y = x - d;
280                        int v = (xlim - x) * 2 + dd;
281                        if (v > 12 * (c + (dd < 0 ? -dd : dd)))
282                        {
283                            if (v > best
284                                    && xoff < x && x <= xlim - SNAKE_LIMIT
285                                    && yoff < y && y <= ylim - SNAKE_LIMIT)
286                            {
287                                /* We have a good enough best diagonal;
288                   now insist that it end with a significant snake.  */
289                                int k;
290    
291                                for (k = 0; xvec[x + k] == yvec[y + k]; k++)
292                                    if (k == SNAKE_LIMIT)
293                                    {
294                                        best = v;
295                                        bestpos = d;
296                                        break;
297                                    }
298                            }
299                        }
300                    }
301                    if (best > 0)
302                    {
303                        cost = 2 * c - 1;
304                        return bestpos;
305                    }
306                }
307            }
308        }
309    
310        /** Compare in detail contiguous subsequences of the two files
311         which are known, as a whole, to match each other.
312    
313         The results are recorded in the vectors filevec[N].changed_flag, by
314         storing a 1 in the element for each line that is an insertion or deletion.
315    
316         The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
317    
318         Note that XLIM, YLIM are exclusive bounds.
319         All line numbers are origin-0 and discarded lines are not counted.  */
320    
321        private void compareseq (int xoff, int xlim, int yoff, int ylim) {
322            /* Slide down the bottom initial diagonal. */
323            while (xoff < xlim && yoff < ylim && xvec[xoff] == yvec[yoff]) {
324                ++xoff; ++yoff;
325            }
326            /* Slide up the top initial diagonal. */
327            while (xlim > xoff && ylim > yoff && xvec[xlim - 1] == yvec[ylim - 1]) {
328                --xlim; --ylim;
329            }
330    
331            /* Handle simple cases. */
332            if (xoff == xlim) {
333                while (yoff < ylim) {
334                    filevec[1].changed_flag[1+filevec[1].realindexes[yoff++]] = true;
335                }
336            } else if (yoff == ylim) {
337                while (xoff < xlim) {
338                    filevec[0].changed_flag[1+filevec[0].realindexes[xoff++]] = true;
339                }
340            } else
341            {
342                /* Find a point of correspondence in the middle of the files.  */
343    
344                int d = diag (xoff, xlim, yoff, ylim);
345                int c = cost;
346                //int f = fdiag[fdiagoff + d];
347                int b = bdiag[bdiagoff + d];
348    
349                if (c == 1)
350                    /* This should be impossible, because it implies that
351               one of the two subsequences is empty,
352               and that case was handled above without calling `diag'.
353               Let's verify that this is true.  */
354                    throw new IllegalArgumentException("Empty subsequence");
355                else
356                {
357                    /* Use that point to split this problem into two subproblems.  */
358                    compareseq (xoff, b, yoff, b - d);
359                    /* This used to use f instead of b,
360               but that is incorrect!
361               It is not necessarily the case that diagonal d
362               has a snake from b to f.  */
363                    compareseq (b, xlim, b - d, ylim);
364                }
365            }
366        }
367    
368        /** Discard lines from one file that have no matches in the other file.
369         */
370    
371        private void discard_confusing_lines() {
372            filevec[0].discard_confusing_lines(filevec[1]);
373            filevec[1].discard_confusing_lines(filevec[0]);
374        }
375    
376        private boolean inhibit = false;
377    
378        /** Adjust inserts/deletes of blank lines to join changes
379         as much as possible.
380         */
381    
382        private void shift_boundaries() {
383            if (inhibit)
384                return;
385            filevec[0].shift_boundaries(filevec[1]);
386            filevec[1].shift_boundaries(filevec[0]);
387        }
388    
389        public interface ScriptBuilder {
390            /** Scan the tables of which lines are inserted and deleted,
391         producing an edit script.
392       @param changed0 true for lines in first file which do not match 2nd
393       @param len0 number of lines in first file
394       @param changed1 true for lines in 2nd file which do not match 1st
395       @param len1 number of lines in 2nd file
396       @return a linked list of changes - or null
397             */
398            public change build_script(
399                    boolean[] changed0,int len0,
400                    boolean[] changed1,int len1
401            );
402        }
403    
404        /** Scan the tables of which lines are inserted and deleted,
405         producing an edit script in reverse order.  */
406    
407        static class ReverseScript implements ScriptBuilder {
408            public  change build_script(
409                    final boolean[] changed0,int len0,
410                    final boolean[] changed1,int len1)
411            {
412                change script = null;
413                int i0 = 0, i1 = 0;
414                while (i0 < len0 || i1 < len1) {
415                    if (changed0[1+i0] || changed1[1+i1]) {
416                        int line0 = i0, line1 = i1;
417    
418                        /* Find # lines changed here in each file.  */
419                        while (changed0[1+i0]) {
420                            ++i0;
421                        }
422                        while (changed1[1+i1]) {
423                            ++i1;
424                        }
425    
426                        /* Record this change.  */
427                        script = new change(line0, line1, i0 - line0, i1 - line1, script);
428                    }
429    
430                    /* We have reached lines in the two files that match each other.  */
431                    i0++; i1++;
432                }
433    
434                return script;
435            }
436        }
437    
438        static class ForwardScript implements ScriptBuilder {
439            /** Scan the tables of which lines are inserted and deleted,
440           producing an edit script in forward order.  */
441            public change build_script(
442                    final boolean[] changed0,int len0,
443                    final boolean[] changed1,int len1)
444            {
445                change script = null;
446                int i0 = len0, i1 = len1;
447    
448                while (i0 >= 0 || i1 >= 0)
449                {
450                    if (changed0[i0] || changed1[i1])
451                    {
452                        int line0 = i0, line1 = i1;
453    
454                        /* Find # lines changed here in each file.  */
455                        while (changed0[i0]) {
456                            --i0;
457                        }
458                        while (changed1[i1]) {
459                            --i1;
460                        }
461    
462                        /* Record this change.  */
463                        script = new change(i0, i1, line0 - i0, line1 - i1, script);
464                    }
465    
466                    /* We have reached lines in the two files that match each other.  */
467                    i0--; i1--;
468                }
469    
470                return script;
471            }
472        }
473    
474        /** Standard ScriptBuilders. */
475        public final static ScriptBuilder
476        forwardScript = new ForwardScript(),
477        reverseScript = new ReverseScript();
478    
479        /* Report the differences of two files.  DEPTH is the current directory
480         depth. */
481        public final change diff_2(final boolean reverse) {
482            return diff(reverse ? reverseScript : forwardScript);
483        }
484    
485        /** Get the results of comparison as an edit script.  The script
486         is described by a list of changes.  The standard ScriptBuilder
487         implementations provide for forward and reverse edit scripts.
488         Alternate implementations could, for instance, list common elements
489         instead of differences.
490         @param bld an object to build the script from change flags
491         @return the head of a list of changes
492         */
493        public change diff(final ScriptBuilder bld) {
494    
495            /* Some lines are obviously insertions or deletions
496           because they don't match anything.  Detect them now,
497           and avoid even thinking about them in the main comparison algorithm.  */
498    
499            discard_confusing_lines ();
500    
501            /* Now do the main comparison algorithm, considering just the
502           undiscarded lines.  */
503    
504            xvec = filevec[0].undiscarded;
505            yvec = filevec[1].undiscarded;
506    
507            int diags =
508                filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines + 3;
509            fdiag = new int[diags];
510            fdiagoff = filevec[1].nondiscarded_lines + 1;
511            bdiag = new int[diags];
512            bdiagoff = filevec[1].nondiscarded_lines + 1;
513    
514            compareseq (0, filevec[0].nondiscarded_lines,
515                    0, filevec[1].nondiscarded_lines);
516            fdiag = null;
517            bdiag = null;
518    
519            /* Modify the results slightly to make them prettier
520           in cases where that can validly be done.  */
521    
522            shift_boundaries ();
523    
524            /* Get the results of comparison in the form of a chain
525           of `struct change's -- an edit script.  */
526            return bld.build_script(
527                    filevec[0].changed_flag,
528                    filevec[0].buffered_lines,
529                    filevec[1].changed_flag,
530                    filevec[1].buffered_lines
531            );
532    
533        }
534    
535        /** The result of comparison is an "edit script": a chain of change objects.
536         Each change represents one place where some lines are deleted
537         and some are inserted.
538    
539         LINE0 and LINE1 are the first affected lines in the two files (origin 0).
540         DELETED is the number of lines deleted here from file 0.
541         INSERTED is the number of lines inserted here in file 1.
542    
543         If DELETED is 0 then LINE0 is the number of the line before
544         which the insertion was done; vice versa for INSERTED and LINE1.  */
545    
546        public static class change {
547            /** Previous or next edit command. */
548            public change link;
549            /** # lines of file 1 changed here.  */
550            public final int inserted;
551            /** # lines of file 0 changed here.  */
552            public final int deleted;
553            /** Line number of 1st deleted line.  */
554            public final int line0;
555            /** Line number of 1st inserted line.  */
556            public final int line1;
557    
558            /** Cons an additional entry onto the front of an edit script OLD.
559           LINE0 and LINE1 are the first affected lines in the two files (origin 0).
560           DELETED is the number of lines deleted here from file 0.
561           INSERTED is the number of lines inserted here in file 1.
562    
563           If DELETED is 0 then LINE0 is the number of the line before
564           which the insertion was done; vice versa for INSERTED and LINE1.  */
565            public change(int line0, int line1, int deleted, int inserted, change old) {
566                this.line0 = line0;
567                this.line1 = line1;
568                this.inserted = inserted;
569                this.deleted = deleted;
570                this.link = old;
571                //System.err.println(line0+","+line1+","+inserted+","+deleted);
572            }
573        }
574    
575        /** Data on one input file being compared.
576         */
577    
578        class file_data {
579    
580            /** Allocate changed array for the results of comparison.  */
581            void clear() {
582                /* Allocate a flag for each line of each file, saying whether that line
583         is an insertion or deletion.
584         Allocate an extra element, always zero, at each end of each vector.
585                 */
586                changed_flag = new boolean[buffered_lines + 2];
587            }
588    
589            /** Return equiv_count[I] as the number of lines in this file
590           that fall in equivalence class I.
591             @return the array of equivalence class counts.
592             */
593            int[] equivCount() {
594                int[] equiv_count = new int[equiv_max];
595                for (int i = 0; i < buffered_lines; ++i) {
596                    ++equiv_count[equivs[i]];
597                }
598                return equiv_count;
599            }
600    
601            /** Discard lines that have no matches in another file.
602    
603           A line which is discarded will not be considered by the actual
604           comparison algorithm; it will be as if that line were not in the file.
605           The file's `realindexes' table maps virtual line numbers
606           (which don't count the discarded lines) into real line numbers;
607           this is how the actual comparison algorithm produces results
608           that are comprehensible when the discarded lines are counted.
609    <p>
610           When we discard a line, we also mark it as a deletion or insertion
611           so that it will be printed in the output.
612          @param f the other file
613             */
614            void discard_confusing_lines(file_data f) {
615                clear();
616                /* Set up table of which lines are going to be discarded. */
617                final byte[] discarded = discardable(f.equivCount());
618    
619                /* Don't really discard the provisional lines except when they occur
620           in a run of discardables, with nonprovisionals at the beginning
621           and end.  */
622                filterDiscards(discarded);
623    
624                /* Actually discard the lines. */
625                discard(discarded);
626            }
627    
628            /** Mark to be discarded each line that matches no line of another file.
629           If a line matches many lines, mark it as provisionally discardable.
630           @see equivCount()
631           @param counts The count of each equivalence number for the other file.
632           @return 0=nondiscardable, 1=discardable or 2=provisionally discardable
633            for each line
634             */
635    
636            private byte[] discardable(final int[] counts) {
637                final int end = buffered_lines;
638                final byte[] discards = new byte[end];
639                final int[] equivs = this.equivs;
640                int many = 5;
641                int tem = end / 64;
642    
643                /* Multiply MANY by approximate square root of number of lines.
644         That is the threshold for provisionally discardable lines.  */
645                while ((tem = tem >> 2) > 0) {
646                    many *= 2;
647                }
648    
649                for (int i = 0; i < end; i++)
650                {
651                    int nmatch;
652                    if (equivs[i] == 0) {
653                        continue;
654                    }
655                    nmatch = counts[equivs[i]];
656                    if (nmatch == 0) {
657                        discards[i] = 1;
658                    } else if (nmatch > many) {
659                        discards[i] = 2;
660                    }
661                }
662                return discards;
663            }
664    
665            /** Don't really discard the provisional lines except when they occur
666           in a run of discardables, with nonprovisionals at the beginning
667           and end.  */
668    
669            private void filterDiscards(final byte[] discards) {
670                final int end = buffered_lines;
671    
672                for (int i = 0; i < end; i++)
673                {
674                    /* Cancel provisional discards not in middle of run of discards.  */
675                    if (discards[i] == 2) {
676                        discards[i] = 0;
677                    } else if (discards[i] != 0)
678                    {
679                        /* We have found a nonprovisional discard.  */
680                        int j;
681                        int length;
682                        int provisional = 0;
683    
684                        /* Find end of this run of discardable lines.
685               Count how many are provisionally discardable.  */
686                        for (j = i; j < end; j++)
687                        {
688                            if (discards[j] == 0) {
689                                break;
690                            }
691                            if (discards[j] == 2) {
692                                ++provisional;
693                            }
694                        }
695    
696                        /* Cancel provisional discards at end, and shrink the run.  */
697                        while (j > i && discards[j - 1] == 2) {
698                            discards[--j] = 0; --provisional;
699                        }
700    
701                        /* Now we have the length of a run of discardable lines
702               whose first and last are not provisional.  */
703                        length = j - i;
704    
705                        /* If 1/4 of the lines in the run are provisional,
706               cancel discarding of all provisional lines in the run.  */
707                        if (provisional * 4 > length)
708                        {
709                            while (j > i)
710                                if (discards[--j] == 2) {
711                                    discards[j] = 0;
712                                }
713                        }
714                        else
715                        {
716                            int consec;
717                            int minimum = 1;
718                            int tem = length / 4;
719    
720                            /* MINIMUM is approximate square root of LENGTH/4.
721                   A subrun of two or more provisionals can stand
722                   when LENGTH is at least 16.
723                   A subrun of 4 or more can stand when LENGTH >= 64.  */
724                            while ((tem = tem >> 2) > 0) {
725                                minimum *= 2;
726                            }
727                            minimum++;
728    
729                            /* Cancel any subrun of MINIMUM or more provisionals
730                   within the larger run.  */
731                            for (j = 0, consec = 0; j < length; j++)
732                                if (discards[i + j] != 2) {
733                                    consec = 0;
734                                } else if (minimum == ++consec) {
735                                    /* Back up to start of subrun, to cancel it all.  */
736                                    j -= consec;
737                                } else if (minimum < consec) {
738                                    discards[i + j] = 0;
739                                }
740    
741                            /* Scan from beginning of run
742                   until we find 3 or more nonprovisionals in a row
743                   or until the first nonprovisional at least 8 lines in.
744                   Until that point, cancel any provisionals.  */
745                            for (j = 0, consec = 0; j < length; j++)
746                            {
747                                if (j >= 8 && discards[i + j] == 1) {
748                                    break;
749                                }
750                                if (discards[i + j] == 2) {
751                                    consec = 0; discards[i + j] = 0;
752                                }
753                                else if (discards[i + j] == 0) {
754                                    consec = 0;
755                                } else {
756                                    consec++;
757                                }
758                                if (consec == 3) {
759                                    break;
760                                }
761                            }
762    
763                            /* I advances to the last line of the run.  */
764                            i += length - 1;
765    
766                            /* Same thing, from end.  */
767                            for (j = 0, consec = 0; j < length; j++)
768                            {
769                                if (j >= 8 && discards[i - j] == 1) {
770                                    break;
771                                }
772                                if (discards[i - j] == 2) {
773                                    consec = 0; discards[i - j] = 0;
774                                }
775                                else if (discards[i - j] == 0) {
776                                    consec = 0;
777                                } else {
778                                    consec++;
779                                }
780                                if (consec == 3) {
781                                    break;
782                                }
783                            }
784                        }
785                    }
786                }
787            }
788    
789            /** Actually discard the lines.
790          @param discards flags lines to be discarded
791             */
792            private void discard(final byte[] discards) {
793                final int end = buffered_lines;
794                int j = 0;
795                for (int i = 0; i < end; ++i)
796                    if (no_discards || discards[i] == 0)
797                    {
798                        undiscarded[j] = equivs[i];
799                        realindexes[j++] = i;
800                    } else {
801                        changed_flag[1+i] = true;
802                    }
803                nondiscarded_lines = j;
804            }
805    
806            file_data(Object[] data,Hashtable<Object, Integer> h) {
807                buffered_lines = data.length;
808    
809                equivs = new int[buffered_lines];
810                undiscarded = new int[buffered_lines];
811                realindexes = new int[buffered_lines];
812    
813                for (int i = 0; i < data.length; ++i) {
814                    Integer ir = (Integer)h.get(data[i]);
815                    if (ir == null) {
816                        h.put(data[i],new Integer(equivs[i] = equiv_max++));
817                    } else {
818                        equivs[i] = ir.intValue();
819                    }
820                }
821            }
822    
823            /** Adjust inserts/deletes of blank lines to join changes
824           as much as possible.
825    
826           We do something when a run of changed lines include a blank
827           line at one end and have an excluded blank line at the other.
828           We are free to choose which blank line is included.
829           `compareseq' always chooses the one at the beginning,
830           but usually it is cleaner to consider the following blank line
831           to be the "change".  The only exception is if the preceding blank line
832           would join this change to other changes.
833          @param f the file being compared against
834             */
835    
836            void shift_boundaries(file_data f) {
837                final boolean[] changed = changed_flag;
838                final boolean[] other_changed = f.changed_flag;
839                int i = 0;
840                int j = 0;
841                int i_end = buffered_lines;
842                int preceding = -1;
843                int other_preceding = -1;
844    
845                for (;;)
846                {
847                    int start, end, other_start;
848    
849                    /* Scan forwards to find beginning of another run of changes.
850             Also keep track of the corresponding point in the other file.  */
851    
852                    while (i < i_end && !changed[1+i])
853                    {
854                        while (other_changed[1+j++]) {
855                            /* Non-corresponding lines in the other file
856                   will count as the preceding batch of changes.  */
857                            other_preceding = j;
858                        }
859                        i++;
860                    }
861    
862                    if (i == i_end) {
863                        break;
864                    }
865    
866                    start = i;
867                    other_start = j;
868    
869                    for (;;)
870                    {
871                        /* Now find the end of this run of changes.  */
872    
873                        while (i < i_end && changed[1+i]) {
874                            i++;
875                        }
876                        end = i;
877    
878                        /* If the first changed line matches the following unchanged one,
879             and this run does not follow right after a previous run,
880             and there are no lines deleted from the other file here,
881             then classify the first changed line as unchanged
882             and the following line as changed in its place.  */
883    
884                        /* You might ask, how could this run follow right after another?
885             Only because the previous run was shifted here.  */
886    
887                        if (end != i_end
888                                && equivs[start] == equivs[end]
889                                                           && !other_changed[1+j]
890                                                                             && end != i_end
891                                                                             && !((preceding >= 0 && start == preceding)
892                                                                                     || (other_preceding >= 0
893                                                                                             && other_start == other_preceding)))
894                        {
895                            changed[1+end++] = true;
896                            changed[1+start++] = false;
897                            ++i;
898                            /* Since one line-that-matches is now before this run
899                 instead of after, we must advance in the other file
900                 to keep in synch.  */
901                            ++j;
902                        } else {
903                            break;
904                        }
905                    }
906    
907                    preceding = i;
908                    other_preceding = j;
909                }
910            }
911    
912            /** Number of elements (lines) in this file. */
913            final int buffered_lines;
914    
915            /** Vector, indexed by line number, containing an equivalence code for
916           each line.  It is this vector that is actually compared with that
917           of another file to generate differences. */
918            private final int[]     equivs;
919    
920            /** Vector, like the previous one except that
921           the elements for discarded lines have been squeezed out.  */
922            final int[]    undiscarded;
923    
924            /** Vector mapping virtual line numbers (not counting discarded lines)
925           to real ones (counting those lines).  Both are origin-0.  */
926            final int[]    realindexes;
927    
928            /** Total number of nondiscarded lines. */
929            int         nondiscarded_lines;
930    
931            /** Array, indexed by real origin-1 line number,
932           containing true for a line that is an insertion or a deletion.
933           The results of comparison are stored here.  */
934            boolean[]       changed_flag;
935    
936        }
937    }