1package Text::WordDiff;
2
3use strict;
4use vars qw(@ISA $VERSION);
5use Algorithm::Diff ();
6use IO::File;
7use Carp;
8
9$VERSION = '0.08';
10
11# _Mastering Regular Expressions_, p. 132.
12my $BEGIN_WORD = $] >= 5.006
13    ? qr/(?:(?<!\p{IsWord})(?=\p{IsWord})|(?<!\p{IsPunct})(?=\p{IsPunct})|(?<!\p{IsCntrl})(?=\p{IsCntrl}))/msx
14    : qr/(?:(?<!\w)(?=\w)|(?<![\]\[!"%&'()*,\.\/:;?\{}\-@])(?=[\]\[!"%&'()*,\.\/:;?\{}\-@])|(?<![\n\r\t])(?=[\n\r\t]))/msx;
15
16my %styles = (
17    ANSIColor    => undef,
18    HTML         => undef,
19    HTMLTwoLines => undef,
20);
21
22sub import {
23    my $caller = caller;
24    no strict 'refs';
25    *{"$caller\::word_diff"} = \&word_diff;
26}
27
28sub word_diff ($$;$) {
29    my @seqs = ( shift, shift );
30    my $opts = $_[0] ? { %{ +shift } } : {};
31    $opts->{FILENAME_PREFIX_A} ||= '---';
32    $opts->{FILENAME_PREFIX_B} ||= '+++';
33    my $AorB = 'A';
34
35    for my $seq (@seqs) {
36        my $type = ref $seq;
37
38        while ( $type eq 'CODE' ) {
39            $seq = $seq->( $opts );
40            $type = ref $seq;
41        }
42
43        # Get a handle on options.
44        my $filename = \$opts->{"FILENAME_$AorB"};
45        my $mtime    = \$opts->{"MTIME_$AorB"};
46
47        if ( $type eq 'ARRAY' ) {
48            # The work has already been done for us.
49        }
50
51        elsif ( $type eq 'SCALAR' ) {
52            # Parse the words from the string.
53            $seq     = [ split $BEGIN_WORD, $$seq ];
54        }
55
56        elsif ( !$type ) {
57            # Assume that it's a raw file name.
58            $$filename = $seq           unless defined $$filename;
59            $$mtime    = (stat $seq)[9] unless defined $$mtime;
60
61            # Parse the words from the file.
62            my $seq_fh = IO::File->new($seq, '<');
63            $seq       = do { local $/; [ split $BEGIN_WORD, <$seq_fh> ] };
64            $seq_fh->close;
65        }
66
67        elsif ( $type eq "GLOB" || UNIVERSAL::isa( $seq, "IO::Handle" ) ) {
68            # Parse the words from the file.
69            $seq       = do { local $/; [ split $BEGIN_WORD, <$seq> ] };
70        }
71
72        else {
73            # Damn.
74            confess "Can't handle input of type $type";
75        }
76        $AorB++;
77    }
78
79    # Set up the output handler.
80    my $output;
81    my $out_handler = delete $opts->{OUTPUT};
82    my $type = ref $out_handler ;
83
84    if ( ! defined $out_handler ) {
85        # Default to concatenating a string.
86        $output = '';
87        $out_handler = sub { $output .= shift };
88    }
89    elsif ( $type eq 'CODE' ) {
90        # We'll just use the handler.
91    }
92    elsif ( $type eq 'SCALAR' ) {
93        # Append to the scalar reference.
94        my $out_ref = $out_handler;
95        $out_handler = sub { $$out_ref .= shift };
96    }
97    elsif ( $type eq 'ARRAY' ) {
98        # Push each item onto the array.
99        my $out_ref = $out_handler;
100        $out_handler = sub { push @$out_ref, shift };
101    }
102    elsif ( $type eq 'GLOB' || UNIVERSAL::isa( $out_handler, 'IO::Handle' )) {
103        # print to the file handle.
104        my $output_handle = $out_handler;
105        $out_handler = sub { print $output_handle shift };
106    }
107    else {
108        # D'oh!
109        croak "Unrecognized output type: $type";
110    }
111
112    # Instantiate the diff object, along with any options.
113    my $diff = Algorithm::Diff->new(@seqs, delete $opts->{DIFF_OPTS});
114
115    # Load the style class and instantiate an instance.
116    my $style  = delete $opts->{STYLE} || 'ANSIColor';
117    $style     = __PACKAGE__ . "::$style" if exists $styles{$style};
118    eval "require $style" or die $@ unless $style->can('new');
119    $style     = $style->new($opts) if !ref $style;
120
121    # Run the diff.
122    my $hunks = 0;
123    $out_handler->($style->file_header());
124    while ($diff->Next) {
125        $hunks++;
126        $out_handler->( $style->hunk_header() );
127
128        # Output unchanged items.
129        if (my @same = $diff->Same) {
130            $out_handler->( $style->same_items(@same) );
131        }
132
133        # Output deleted and inserted items.
134        else {
135            if (my @del = $diff->Items(1)) {
136                $out_handler->( $style->delete_items(@del) );
137            }
138            if (my @ins = $diff->Items(2)) {
139                $out_handler->( $style->insert_items(@ins) );
140            }
141        }
142        $out_handler->( $style->hunk_footer() );
143    }
144    $out_handler->( $style->file_footer() );
145
146    return defined $output ? $output : $hunks;
147}
148
149package Text::WordDiff::Base;
150
151sub new {
152    my ($class, $opts) = @_;
153    return bless { %{$opts} } => $class;
154}
155
156
157sub file_header  {
158    my $self = shift;
159    my $fn1 = $self->filename_a;
160    my $fn2 = $self->filename_b;
161    return '' unless defined $fn1 && defined $fn2;
162
163    my $p1 = $self->filename_prefix_a;
164    my $t1 = $self->mtime_a;
165    my $p2 = $self->filename_prefix_b;
166    my $t2 = $self->mtime_b;
167
168    return "$p1 $fn1" . (defined $t1 ? "\t" . localtime $t1 : '') . "\n"
169         . "$p2 $fn2" . (defined $t2 ? "\t" . localtime $t2 : '') . "\n"
170         ;
171}
172
173sub hunk_header         { return '' }
174sub same_items          { return '' }
175sub insert_items        { return '' }
176sub delete_items        { return '' }
177sub hunk_footer         { return '' }
178sub file_footer         { return '' }
179sub filename_a          { return shift->{FILENAME_A} }
180sub filename_b          { return shift->{FILENAME_B} }
181sub mtime_a             { return shift->{MTIME_A}    }
182sub mtime_b             { return shift->{MTIME_B}    }
183sub filename_prefix_a   { return shift->{FILENAME_PREFIX_A} }
184sub filename_prefix_b   { return shift->{FILENAME_PREFIX_B} }
185
1861;
187__END__
188
189##############################################################################
190
191=head1 Name
192
193Text::WordDiff - Track changes between documents
194
195=head1 Synopsis
196
197    use Text::WordDiff;
198
199    my $diff = word_diff 'file1.txt', 'file2.txt', { STYLE => 'HTML' };
200    my $diff = word_diff \$string1,   \$string2,   { STYLE => 'ANSIColor' };
201    my $diff = word_diff \*FH1,       \*FH2;       \%options;
202    my $diff = word_diff \&reader1,   \&reader2;
203    my $diff = word_diff \@records1,  \@records2;
204
205    # May also mix input types:
206    my $diff = word_diff \@records1,  'file_B.txt';
207
208=head1 Description
209
210This module is a variation on the lovely L<Text::Diff|Text::Diff> module.
211Rather than generating traditional line-oriented diffs, however, it generates
212word-oriented diffs. This can be useful for tracking changes in narrative
213documents or documents with very long lines. To diff source code, one is still
214best off using L<Text::Diff|Text::Diff>. But if you want to see how a short
215story changed from one version to the next, this module will do the job very
216nicely.
217
218=head2 What is a Word?
219
220I'm glad you asked! Well, sort of. It's a really hard question to answer. I
221consulted a number of sources, but really just did my best to punt on the
222question by reformulating it as, "How do I split text up into individual
223words?" The short answer is to split on word boundaries. However, every word
224has two boundaries, one at the beginning and one at the end. So splitting on
225C</\b/> didn't work so well. What I really wanted to do was to split on the
226I<beginning> of every word. Fortunately, _Mastering Regular Expressions_ has a
227recipe for that: C<< /(?<!\w)(?=\w)/ >>. I've borrowed this regular expression
228for use in Perls before 5.6.x, but go for the Unicode variant in 5.6.0 and
229newer: C<< /(?<!\p{IsWord})(?=\p{IsWord})/ >>. Adding some additional controls
230for punctuation and control characters, this sentence, for example, would be
231split up into the following tokens:
232
233  my @words = (
234      "Adding ",
235      "some ",
236      "additional ",
237      "controls",
238      "\n",
239      "for ",
240      "punctuation ",
241      "and ",
242      "control ",
243      "characters",
244      ", ",
245      "this ",
246      "sentence",
247      ", ",
248      "for ",
249      "example",
250      ", ",
251      "would ",
252      "be",
253      "\n",
254      "split ",
255      "up ",
256      "into ",
257      "the ",
258      "following ",
259      "tokens",
260      ":",
261  );
262
263So it's not just comparing words, but word-like tokens and control/punctuation
264tokens. This makes sense to me, at least, as the diff is between these tokens,
265and thus leads to a nice word-and-space-and-punctuation type diff. It's not
266unlike what a word processor might do (although a lot of them are
267character-based, but that seemed a bit extreme--feel free to dupe this module
268into Text::CharDiff!).
269
270Now, I acknowledge that there are localization issues with this approach. In
271particular, it will fail with Chinese, Japanese, and Korean text, as these
272languages don't put non-word characters between words. Ideally, Test::WordDiff
273would then split on every charaters (since a single character often equals a
274word), but such is not the case when the C<utf8> flag is set on a string.
275For example, This simple script:
276
277=encoding utf8
278
279  use strict;
280  use utf8;
281  use Data::Dumper;
282  my $string = '뼈뼉뼘뼙뼛뼜뼝뽀뽁뽄뽈뽐뽑뽕뾔뾰뿅뿌뿍뿐뿔뿜뿟뿡쀼쁑쁘쁜쁠쁨쁩삐';
283  my @tokens = split /(?<!\p{IsWord})(?=\p{IsWord})/msx, $string;
284  print Dumper \@tokens;
285
286Outputs:
287
288  $VAR1 = [
289            "\x{bf08}\x{bf09}\x{bf18}\x{bf19}\x{bf1b}\x{bf1c}\x{bf1d}\x{bf40}\x{bf41}\x{bf44}\x{bf48}\x{bf50}\x{bf51}\x{bf55}\x{bf94}\x{bfb0}\x{bfc5}\x{bfcc}\x{bfcd}\x{bfd0}\x{bfd4}\x{bfdc}\x{bfdf}\x{bfe1}\x{c03c}\x{c051}\x{c058}\x{c05c}\x{c060}\x{c068}\x{c069}\x{c090}"
290          ];
291
292Not so useful. It seems to be less of a problem if the C<use utf8;> line is
293commented out, in which case we get:
294
295  $VAR1 = [
296            '뼈',
297            '뼉',
298            '뼘',
299            '뼙',
300            '뼛',
301            '뼜',
302            '뼝',
303            '뽀',
304            '뽁',
305            '뽄',
306            '뽈',
307            '뽐',
308            '뽑',
309            '뽕',
310            '뾔',
311            '뾰',
312            '뿅',
313            '뿌',
314            '뿍',
315            '뿐',
316            '뿔',
317            '뿜',
318            '뿟',
319            '뿡',
320            '?',
321            '?쁑',
322            '쁘',
323            '쁜',
324            '쁠',
325            '쁨',
326            '쁩',
327            '삐'
328          ];
329
330Someone whose more familiar with non-space-using languages will have to
331explain to me how I might be able to duplicate this pattern within the scope
332of C<use utf8;>, seing as it may very well be important to have it on in order
333to ensure proper character semantics.
334
335However, if my word tokenization approach is just too naive, and you decide
336that you need to take a different approach (maybe use
337L<Lingua::ZH::Toke|Lingua::ZH::Toke> or similar module), you can still use
338this module; you'll just have to tokenize your strings into words yourself,
339and pass them to word_diff() as array references:
340
341  word_diff \@my_words1, \@my_words2;
342
343=head1 Options
344
345word_diff() takes two arguments from which to draw input and an optional hash
346reference of options to control its output. The first two arguments contain
347the data to be diffed, and each may be in the form of any of the following
348(that is, they can be in two different formats):
349
350=over
351
352=item * String
353
354A bare scalar will be assumed to be a file name. The file will be opened and
355split up into words. word_diff() will also C<stat> the file to get the last
356modified time for use in the header, unless the relevant option (C<MTIME_A> or
357C<MTIME_B>) has been specified explicitly.
358
359=item * Scalar Reference
360
361A scalar reference will be assumed to refer to a string. That string will be
362split up into words.
363
364=item * Array Reference
365
366An array reference will be assumed to be a list of words.
367
368=item * File Handle
369
370A glob or IO::Handle-derived object will be read from and split up into
371its constituent words.
372
373=back
374
375The optional hash reference may contain the following options. Additional
376options may be specified by the formattting class; see the specific class for
377details.
378
379=over
380
381=item * STYLE
382
383"ANSIColor", "HTML" or an object or class name for a class providing
384C<file_header()>, C<hunk_header()>, C<same_items()>, C<delete_items()>,
385C<insert_items()>, C<hunk_footer()> and C<file_footer()> methods. Defaults to
386"ANSIColor" for nice display of diffs in an ANSI Color-supporting terminal.
387
388If the package indicated by the C<STYLE> has no C<new()> method,
389C<word_diff()> will load it automatically (lazy loading). It will then
390instantiate an object of that class, passing in the options hash reference
391with which the formatting class can initialize the object.
392
393Styles may be specified as class names (C<< STYLE => "My::Foo" >>), in which
394case they will be instantiated by calling the C<new()> construcctor and
395passing in the options hash reference, or as objects (C<< STYLE =>
396My::Foo->new >>).
397
398The simplest way to implement your own formatting style is to create a new
399class that inherits from Text::WordDiff::Base, wherein the C<new()> method is
400already provided, and the C<file_header()> returns a Unified diff-style
401header. All of the other formatting methods simply return empty strings, and
402are therefore ripe for overriding.
403
404=item * FILENAME_A, MTIME_A, FILENAME_B, MTIME_B
405
406The name of the file and the modification time "files" in epoch seconds.
407Unless a defined value is specified for these options, they will be filled in
408for each file when word_diff() is passed a filename. If a filename is not
409passed in and C<FILENAME_A> and C<FILENAME_B> are not defined, the header will
410not be printed by the base formatting base class.
411
412=item * OUTPUT
413
414The method by which diff output should be, well, I<output>. Examples and their
415equivalent subroutines:
416
417    OUTPUT => \*FOOHANDLE,   # like: sub { print FOOHANDLE shift() }
418    OUTPUT => \$output,      # like: sub { $output .= shift }
419    OUTPUT => \@output,      # like: sub { push @output, shift }
420    OUTPUT => sub { $output .= shift },
421
422If C<OUTPUT> is not defined, word_diff() will simply return the diff as a
423string. If C<OUTPUT> is a code reference, it will be called once with the file
424header, once for each hunk body, and once for each piece of content. If
425C<OUTPUT> is an L<IO::Handle|IO::Handle>-derived object, output will be
426sent to that handle.
427
428=item * FILENAME_PREFIX_A, FILENAME_PREFIX_B
429
430The string to print before the filename in the header. Defaults are C<"---">,
431C<"+++">.
432
433=item * DIFF_OPTS
434
435A hash reference to be passed as the options to C<< Algorithm::Diff->new >>.
436See L<Algorithm::Diff|Algorithm::Diff> for details on available options.
437
438=back
439
440=head1 Formatting Classes
441
442Text::WordDiff comes with two formatting classes:
443
444=over
445
446=item L<Text::WordDiff::ANSIColor|Text::WordDiff::ANSIColor>
447
448This is the default formatting class. It emits a header and then the diff
449content, with deleted text in bodfaced red and inserted text in boldfaced
450green.
451
452=item L<Text::WordDiff::HTML|Text::WordDiff::HTML>
453
454Specify C<< STYLE => 'HTML' >> to take advantage of this formatting class. It
455outputs the diff content as XHTML, with deleted text in C<< <del> >> elements
456and inserted text in C<< <ins> >> elements.
457
458=back
459
460To implement your own formatting class, simply inherit from
461Text::WordDiff::Base and override its methods as necssary. By default,
462only the C<file_header()> formatting method returns a value. All others
463simply return empty strings, and are therefore ripe for overriding:
464
465  package My::WordDiff::Format;
466  use base 'Text::WordDiff::Base';
467
468  sub file_footer { return "End of diff\n"; }
469
470The methods supplied by the base class are:
471
472=over
473
474=item C<new()>
475
476Constructs and returns a new formatting object. It takes a single hash
477reference as its argument, and uses it to construct the object. The nice thing
478about this is that if you want to support other options in your formatting
479class, you can just use them in the formatting object constructed by the
480Text::WordDiff::Base class and document that they can be passed as
481part of the options hash refernce to word_diff().
482
483=item C<file_header()>
484
485Called once for a single call to C<word_diff()>, this method outputs the
486header for the whole diff. This is the only formatting method in the base
487class that returns anything other than an empty string. It collects the
488filenames from C<filname_a()> and C<filename_b()> and, if they're defined,
489uses the relevant prefixes and modification times to return a unified
490diff-style header.
491
492=item C<hunk_header()>
493
494This method is called for each diff hunk. It should output any necessary
495header for the hunk.
496
497=item C<same_items()>
498
499This method is called for items that have not changed between the two
500sequnces being compared. The unchanged items will be passed as a
501list to the method.
502
503=item C<delete_items>
504
505This method is called for items in the first sequence that are not present in
506the second sequcne. The deleted items will be passed as a list to the method.
507
508=item C<insert_items>
509
510This method is called for items in the second sequence that are not present in
511the first sequcne. The inserted items will be passed as a list to the method.
512
513=item C<hunk_footer>
514
515This method is called at the end of a hunk. It should output any necessary
516content to close out the hunk.
517
518=item C<file_footer()>
519
520This method is called once when the whole diff has been procssed. It should
521output any necessary content to close out the diff file.
522
523=item C<filename_a>
524
525This accessor returns the value specified for the C<FILENAME_A> option
526to word_diff().
527
528=item C<filename_b>
529
530This accessor returns the value specified for the C<FILENAME_B> option
531to word_diff().
532
533=item C<mtime_a>
534
535This accessor returns the value specified for the C<MTIME_A> option to
536word_diff().
537
538=item C<mtime_b>
539
540This accessor returns the value specified for the C<MTIME_B> option to
541word_diff().
542
543=item C<filename_prefix_a>
544
545This accessor returns the value specified for the C<FILENAME_PREFIX_A> option
546to word_diff().
547
548=item C<filename_prefix_b>
549
550This accessor returns the value specified for the C<FILENAME_PREFIX_B> option
551to word_diff().
552
553=back
554
555=head1 See Also
556
557=over
558
559=item L<Text::Diff|Text::Diff>
560
561Inspired the interface and implementation of this module. Thanks Barry!
562
563=item L<Text::ParagraphDiff|Text::ParagraphDiff>
564
565A module that attempts to diff paragraphs and the words in them.
566
567=item L<Algorithm::Diff|Algorithm::Diff>
568
569The module that makes this all possible.
570
571=back
572
573=head1 Support
574
575This module is stored in an open L<GitHub
576repository|http://github.com/theory/text-worddiff/>. Feel free to fork and
577contribute!
578
579Please file bug reports via L<GitHub
580Issues|http://github.com/theory/text-worddiff/issues/> or by sending mail to
581L<bug-Text-WordDiff@rt.cpan.org|mailto:bug-Text-WordDiff@rt.cpan.org>.
582
583=head1 Author
584
585David E. Wheeler <david@justatheory.com>
586
587=head1 Copyright and License
588
589Copyright (c) 2005-2011 David E. Wheeler. Some Rights Reserved.
590
591This module is free software; you can redistribute it and/or modify it under
592the same terms as Perl itself.
593
594=cut
595