udiff

#!/usr/bin/env python
# vim: set ts=8 sw=4 sts=4 et ai:
#
# udiff.py: find minor differences in large files
# (microdiff, or unified-diff *grin*)
#
# Use this instead of regular diff(1) which is prone to choke with a
# "memory exhausted" error on large files. This can be useful when
# comparing database dumps before and after an operation: lots of
# invariant INSERT statements and only a handful more or less caused
# by the operation.
#
# Walter Doekes, 2011, OSSO B.V.
# http://wjd.nu/notes/2011#diff-memory-exhausted-udiff
# License: Public Domain

BUFFER_LINES = 10000


class LineByLineReader(object):
    '''A file reader that reads a file line by line which can look up if
    a similar line exists a bit down the road. The maximum number of
    lines to look forward to is defined by max_lookahead.'''

    def __init__(self, file, max_lookahead=BUFFER_LINES):
        super(LineByLineReader, self).__init__()
        self.file = file
        self.max_lookahead = max_lookahead
        self.lineno = 0
        self.buffer = [] # list of (lineno, line) tuples

    def _fill(self):
        '''Add another line to the internal buffer. Raises
        StopIteration on EOF.'''
        self.lineno += 1
        self.buffer.append((self.lineno, self.file.next()))

    def _pop(self):
        '''Pops a line from the internal buffer.'''
        ret = self.buffer[0]
        self.buffer.pop(0)
        return ret

    def has_next(self):
        '''Returns whether there are more lines or not.'''
        if not self.buffer:
            try:
                self._fill()
            except StopIteration:
                return False
        return True

    def push_back(self, linedata):
        '''Push a line back into the buffer.'''
        self.buffer.insert(0, linedata)

    def next(self):
        '''Serve the next line or None if we're at EOF.'''
        if not self.buffer:
            try: self._fill()
            except StopIteration: return None
        return self._pop()

    def find(self, line):
        '''Returns amount of lines to skip if line is found or None.'''
        for i, buf in enumerate(self.buffer):
            if buf[1] == line:
                return i
        while len(self.buffer) < self.max_lookahead:
            try: self._fill()
            except StopIteration: return None
            if self.buffer[-1][1] == line:
                return len(self.buffer) - 1

    def __repr__(self):
        return 'LineByLineReader(%s)' % (self.file.name,)


def filediff(file1, file2):
    max_context = 3

    reader1 = LineByLineReader(file1)
    reader2 = LineByLineReader(file2)

    pre_context = [] # store context that we can print on diff
    post_context_needed = 0 # how many lines of context we still need to print
    mode = 'skip' # 'diff' 'context'
    has_heading = False # any heading at all

    context = []
    buffer = []

    lineno1 = lineno2 = -1
    while reader1.has_next() and reader2.has_next():
        lineno1, line1 = reader1.next()
        lineno2, line2 = reader2.next()
        # Ah, lines are the same.
        if line1 == line2:
            # Are we at the end of a diff?
            if mode == 'diff':
                mode = 'context'
                post_context_needed = max_context
            # If we need post-diff-context, yield that
            if mode == 'context':
                if post_context_needed:
                    yield ' %s' % (line1,)
                    post_context_needed -= 1
                else:
                    mode = 'skip'
            # Assert we have context for the next diff
            if mode == 'skip':
                context.append(line1)
                while len(context) > max_context:
                    context.pop(0)
        # Lines are different.
        else: 
            # Dump sub-heading and context if not done yet
            if mode == 'skip':
                if not has_heading or len(context) == max_context:
                    yield '@@ -%s,? +%s,? @@\n' % (lineno1 - len(context), lineno2 - len(context))
                    has_heading = True
                while context:
                    yield ' %s' % (context.pop(0),)
            mode = 'diff'
            # Attempt to find a match a bit later on
            lines2 = reader2.find(line1)
            lines1 = reader1.find(line2)
            if lines1 is not None or lines2 is not None:
                # Okay.. we've found something here
                if lines1 is not None and ((lines2 is None) or (lines1 < lines2)):
                    line, reader, lines, sign = line1, reader1, lines1, '-'
                    reader2.push_back((lineno2, line2))
                elif lines2 is not None and ((lines1 is None) or (lines2 < lines1)):
                    line, reader, lines, sign = line2, reader2, lines2, '+'
                    reader1.push_back((lineno1, line1))
                else:
                    # Both differences are "smallest", 1 goes first
                    line, reader, lines, sign = line1, reader1, lines1, '-'
                    reader2.push_back((lineno2, line2))
                yield '%s%s' % (sign, line)
                while lines:
                    yield '%s%s' % (sign, reader.next()[1])
                    lines -= 1
            else:
                yield '-%s' % (line1,)
                yield '+%s' % (line2,)

    # Dump sub-heading and context if not done yet
    if mode == 'skip' and (reader1.has_next() or reader2.has_next()):
        # Only print heading if !has_heading or the context is fresh
        if not has_heading or len(context) == max_context:
            # Handle the special case when one of the files is empty
            if lineno1 == -1 and reader1.has_next():
                lineno1 = 0
            if lineno2 == -1 and reader2.has_next():
                lineno2 = 0
            yield '@@ -%s,? +%s,? @@\n' % (lineno1 - len(context) + 1, lineno2 - len(context) + 1)
            has_heading = True
        while context:
            yield ' %s' % (context.pop(0),)
    while reader1.has_next():
        yield '-%s' % (reader1.next()[1],)
    while reader2.has_next():
        yield '+%s' % (reader2.next()[1],)


def fixheadings(iterator):
    buf = []
    blank = left = right = 0
    for line in iterator:
        if line.startswith('@@'):
            if buf:
                yield buf[0].replace('?', '%u' % (blank + left), 1).replace('?', '%u' % (blank + right), 1)
                for i in buf[1:]:
                    yield i
                buf = []
                blank = left = right = 0
        elif line.startswith('-'):
            left += 1
        elif line.startswith('+'):
            right += 1
        else:
            blank += 1
        buf.append(line)

        # Safety feature: don't overrun memory to fix the numbers.
        # Flush the whole thing and exit.
        if len(buf) > BUFFER_LINES:
            for i in buf:
                yield i
            for i in iterator:
                yield i
            return
            
    if buf:
        yield buf[0].replace('?', '%u' % (blank + left), 1).replace('?', '%u' % (blank + right), 1)
        for i in buf[1:]:
            yield i


def diff(args, stdout, stderr):
    if len(args) != 2:
        print >>stderr, 'diff: need two arguments'
        return 1

    file1 = open(args[0])
    file2 = open(args[1])

    generator = fixheadings(filediff(file1, file2))

    # Print header before first output
    for output in generator:
        import datetime, os
        mtime, mdate, nsec = [None, None], [None, None], [None, None]
        for i in range(2):
            mtime[i] = os.stat(args[i]).st_mtime
            mdate[i] = datetime.datetime.fromtimestamp(mtime[i])
            nsec[i] = '%09d' % (int((mtime[i] - int(mtime[i])) * 1000000000),)
        fmt = '%Y-%m-%d %H:%M:%S'
        tz = '+0000' # for another rainy day
        stdout.write('--- %s\t%s.%s %s\n' % (args[0], mdate[0].strftime(fmt), nsec[0], tz))
        stdout.write('+++ %s\t%s.%s %s\n' % (args[1], mdate[1].strftime(fmt), nsec[1], tz))
        stdout.write(output)
        break
    # Print other output
    for output in generator:
        stdout.write(output)
        pass


if __name__ == '__main__':
    import sys
    ret = diff(sys.argv[1:], sys.stdout, sys.stderr)
    if ret != 0:
        sys.exit(ret)