Balanced.pm revision 1.4
1package Text::Balanced;
2
3# EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS.
4# FOR FULL DOCUMENTATION SEE Balanced.pod
5
6use 5.005;
7use strict;
8use Exporter ();
9use SelfLoader;
10
11use vars qw { $VERSION @ISA %EXPORT_TAGS };
12BEGIN {
13	$VERSION     = '2.02';
14	@ISA         = 'Exporter';
15	%EXPORT_TAGS = (
16		ALL => [ qw{
17			&extract_delimited
18			&extract_bracketed
19			&extract_quotelike
20			&extract_codeblock
21			&extract_variable
22			&extract_tagged
23			&extract_multiple
24			&gen_delimited_pat
25			&gen_extract_tagged
26			&delimited_pat
27		} ],
28	);
29}
30
31Exporter::export_ok_tags('ALL');
32
33# PROTOTYPES
34
35sub _match_bracketed($$$$$$);
36sub _match_variable($$);
37sub _match_codeblock($$$$$$$);
38sub _match_quotelike($$$$);
39
40# HANDLE RETURN VALUES IN VARIOUS CONTEXTS
41
42sub _failmsg {
43	my ($message, $pos) = @_;
44	$@ = bless {
45		error => $message,
46		pos   => $pos,
47	}, 'Text::Balanced::ErrorMsg';
48}
49
50sub _fail {
51	my ($wantarray, $textref, $message, $pos) = @_;
52	_failmsg $message, $pos if $message;
53	return (undef, $$textref, undef) if $wantarray;
54	return undef;
55}
56
57sub _succeed {
58	$@ = undef;
59	my ($wantarray,$textref) = splice @_, 0, 2;
60	my ($extrapos, $extralen) = @_ > 18
61		? splice(@_, -2, 2)
62		: (0, 0);
63	my ($startlen, $oppos) = @_[5,6];
64	my $remainderpos = $_[2];
65	if ( $wantarray ) {
66		my @res;
67		while (my ($from, $len) = splice @_, 0, 2) {
68			push @res, substr($$textref, $from, $len);
69		}
70		if ( $extralen ) { # CORRECT FILLET
71			my $extra = substr($res[0], $extrapos-$oppos, $extralen, "\n");
72			$res[1] = "$extra$res[1]";
73			eval { substr($$textref,$remainderpos,0) = $extra;
74			       substr($$textref,$extrapos,$extralen,"\n")} ;
75				#REARRANGE HERE DOC AND FILLET IF POSSIBLE
76			pos($$textref) = $remainderpos-$extralen+1; # RESET \G
77		} else {
78			pos($$textref) = $remainderpos;		    # RESET \G
79		}
80		return @res;
81	} else {
82		my $match = substr($$textref,$_[0],$_[1]);
83		substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen;
84		my $extra = $extralen
85			? substr($$textref, $extrapos, $extralen)."\n" : "";
86		eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ;	#CHOP OUT PREFIX & MATCH, IF POSSIBLE
87		pos($$textref) = $_[4];				# RESET \G
88		return $match;
89	}
90}
91
92# BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING
93
94sub gen_delimited_pat($;$)  # ($delimiters;$escapes)
95{
96	my ($dels, $escs) = @_;
97	return "" unless $dels =~ /\S/;
98	$escs = '\\' unless $escs;
99	$escs .= substr($escs,-1) x (length($dels)-length($escs));
100	my @pat = ();
101	my $i;
102	for ($i=0; $i<length $dels; $i++)
103	{
104		my $del = quotemeta substr($dels,$i,1);
105		my $esc = quotemeta substr($escs,$i,1);
106		if ($del eq $esc)
107		{
108			push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del";
109		}
110		else
111		{
112			push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del";
113		}
114	}
115	my $pat = join '|', @pat;
116	return "(?:$pat)";
117}
118
119*delimited_pat = \&gen_delimited_pat;
120
121# THE EXTRACTION FUNCTIONS
122
123sub extract_delimited (;$$$$)
124{
125	my $textref = defined $_[0] ? \$_[0] : \$_;
126	my $wantarray = wantarray;
127	my $del  = defined $_[1] ? $_[1] : qq{\'\"\`};
128	my $pre  = defined $_[2] ? $_[2] : '\s*';
129	my $esc  = defined $_[3] ? $_[3] : qq{\\};
130	my $pat = gen_delimited_pat($del, $esc);
131	my $startpos = pos $$textref || 0;
132	return _fail($wantarray, $textref, "Not a delimited pattern", 0)
133		unless $$textref =~ m/\G($pre)($pat)/gc;
134	my $prelen = length($1);
135	my $matchpos = $startpos+$prelen;
136	my $endpos = pos $$textref;
137	return _succeed $wantarray, $textref,
138			$matchpos, $endpos-$matchpos,		# MATCH
139			$endpos,   length($$textref)-$endpos,	# REMAINDER
140			$startpos, $prelen;			# PREFIX
141}
142
143sub extract_bracketed (;$$$)
144{
145	my $textref = defined $_[0] ? \$_[0] : \$_;
146	my $ldel = defined $_[1] ? $_[1] : '{([<';
147	my $pre  = defined $_[2] ? $_[2] : '\s*';
148	my $wantarray = wantarray;
149	my $qdel = "";
150	my $quotelike;
151	$ldel =~ s/'//g and $qdel .= q{'};
152	$ldel =~ s/"//g and $qdel .= q{"};
153	$ldel =~ s/`//g and $qdel .= q{`};
154	$ldel =~ s/q//g and $quotelike = 1;
155	$ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds;
156	my $rdel = $ldel;
157	unless ($rdel =~ tr/[({</])}>/)
158        {
159		return _fail $wantarray, $textref,
160			     "Did not find a suitable bracket in delimiter: \"$_[1]\"",
161			     0;
162	}
163	my $posbug = pos;
164	$ldel = join('|', map { quotemeta $_ } split('', $ldel));
165	$rdel = join('|', map { quotemeta $_ } split('', $rdel));
166	pos = $posbug;
167
168	my $startpos = pos $$textref || 0;
169	my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel);
170
171	return _fail ($wantarray, $textref) unless @match;
172
173	return _succeed ( $wantarray, $textref,
174			  $match[2], $match[5]+2,	# MATCH
175			  @match[8,9],			# REMAINDER
176			  @match[0,1],			# PREFIX
177			);
178}
179
180sub _match_bracketed($$$$$$)	# $textref, $pre, $ldel, $qdel, $quotelike, $rdel
181{
182	my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_;
183	my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0);
184	unless ($$textref =~ m/\G$pre/gc)
185	{
186		_failmsg "Did not find prefix: /$pre/", $startpos;
187		return;
188	}
189
190	$ldelpos = pos $$textref;
191
192	unless ($$textref =~ m/\G($ldel)/gc)
193	{
194		_failmsg "Did not find opening bracket after prefix: \"$pre\"",
195		         pos $$textref;
196		pos $$textref = $startpos;
197		return;
198	}
199
200	my @nesting = ( $1 );
201	my $textlen = length $$textref;
202	while (pos $$textref < $textlen)
203	{
204		next if $$textref =~ m/\G\\./gcs;
205
206		if ($$textref =~ m/\G($ldel)/gc)
207		{
208			push @nesting, $1;
209		}
210		elsif ($$textref =~ m/\G($rdel)/gc)
211		{
212			my ($found, $brackettype) = ($1, $1);
213			if ($#nesting < 0)
214			{
215				_failmsg "Unmatched closing bracket: \"$found\"",
216					 pos $$textref;
217				pos $$textref = $startpos;
218			        return;
219			}
220			my $expected = pop(@nesting);
221			$expected =~ tr/({[</)}]>/;
222			if ($expected ne $brackettype)
223			{
224				_failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"},
225					 pos $$textref;
226				pos $$textref = $startpos;
227			        return;
228			}
229			last if $#nesting < 0;
230		}
231		elsif ($qdel && $$textref =~ m/\G([$qdel])/gc)
232		{
233			$$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next;
234			_failmsg "Unmatched embedded quote ($1)",
235				 pos $$textref;
236			pos $$textref = $startpos;
237			return;
238		}
239		elsif ($quotelike && _match_quotelike($textref,"",1,0))
240		{
241			next;
242		}
243
244		else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs }
245	}
246	if ($#nesting>=0)
247	{
248		_failmsg "Unmatched opening bracket(s): "
249				. join("..",@nesting)."..",
250		         pos $$textref;
251		pos $$textref = $startpos;
252		return;
253	}
254
255	$endpos = pos $$textref;
256
257	return (
258		$startpos,  $ldelpos-$startpos,		# PREFIX
259		$ldelpos,   1,				# OPENING BRACKET
260		$ldelpos+1, $endpos-$ldelpos-2,		# CONTENTS
261		$endpos-1,  1,				# CLOSING BRACKET
262		$endpos,    length($$textref)-$endpos,	# REMAINDER
263	       );
264}
265
266sub _revbracket($)
267{
268	my $brack = reverse $_[0];
269	$brack =~ tr/[({</])}>/;
270	return $brack;
271}
272
273my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*};
274
275sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options)
276{
277	my $textref = defined $_[0] ? \$_[0] : \$_;
278	my $ldel    = $_[1];
279	my $rdel    = $_[2];
280	my $pre     = defined $_[3] ? $_[3] : '\s*';
281	my %options = defined $_[4] ? %{$_[4]} : ();
282	my $omode   = defined $options{fail} ? $options{fail} : '';
283	my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
284		    : defined($options{reject})	       ? $options{reject}
285		    :					 ''
286		    ;
287	my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
288		    : defined($options{ignore})	       ? $options{ignore}
289		    :					 ''
290		    ;
291
292	if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
293	$@ = undef;
294
295	my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
296
297	return _fail(wantarray, $textref) unless @match;
298	return _succeed wantarray, $textref,
299			$match[2], $match[3]+$match[5]+$match[7],	# MATCH
300			@match[8..9,0..1,2..7];				# REM, PRE, BITS
301}
302
303sub _match_tagged	# ($$$$$$$)
304{
305	my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_;
306	my $rdelspec;
307
308	my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 );
309
310	unless ($$textref =~ m/\G($pre)/gc)
311	{
312		_failmsg "Did not find prefix: /$pre/", pos $$textref;
313		goto failed;
314	}
315
316	$opentagpos = pos($$textref);
317
318	unless ($$textref =~ m/\G$ldel/gc)
319	{
320		_failmsg "Did not find opening tag: /$ldel/", pos $$textref;
321		goto failed;
322	}
323
324	$textpos = pos($$textref);
325
326	if (!defined $rdel)
327	{
328		$rdelspec = substr($$textref, $-[0], $+[0] - $-[0]);
329		unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". _revbracket($1) /oes)
330		{
331			_failmsg "Unable to construct closing tag to match: $rdel",
332				 pos $$textref;
333			goto failed;
334		}
335	}
336	else
337	{
338		$rdelspec = eval "qq{$rdel}" || do {
339			my $del;
340			for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',)
341				{ next if $rdel =~ /\Q$_/; $del = $_; last }
342			unless ($del) {
343				use Carp;
344				croak "Can't interpolate right delimiter $rdel"
345			}
346			eval "qq$del$rdel$del";
347		};
348	}
349
350	while (pos($$textref) < length($$textref))
351	{
352		next if $$textref =~ m/\G\\./gc;
353
354		if ($$textref =~ m/\G(\n[ \t]*\n)/gc )
355		{
356			$parapos = pos($$textref) - length($1)
357				unless defined $parapos;
358		}
359		elsif ($$textref =~ m/\G($rdelspec)/gc )
360		{
361			$closetagpos = pos($$textref)-length($1);
362			goto matched;
363		}
364		elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc)
365		{
366			next;
367		}
368		elsif ($bad && $$textref =~ m/\G($bad)/gcs)
369		{
370			pos($$textref) -= length($1);	# CUT OFF WHATEVER CAUSED THE SHORTNESS
371			goto short if ($omode eq 'PARA' || $omode eq 'MAX');
372			_failmsg "Found invalid nested tag: $1", pos $$textref;
373			goto failed;
374		}
375		elsif ($$textref =~ m/\G($ldel)/gc)
376		{
377			my $tag = $1;
378			pos($$textref) -= length($tag);	# REWIND TO NESTED TAG
379			unless (_match_tagged(@_))	# MATCH NESTED TAG
380			{
381				goto short if $omode eq 'PARA' || $omode eq 'MAX';
382				_failmsg "Found unbalanced nested tag: $tag",
383					 pos $$textref;
384				goto failed;
385			}
386		}
387		else { $$textref =~ m/./gcs }
388	}
389
390short:
391	$closetagpos = pos($$textref);
392	goto matched if $omode eq 'MAX';
393	goto failed unless $omode eq 'PARA';
394
395	if (defined $parapos) { pos($$textref) = $parapos }
396	else		      { $parapos = pos($$textref) }
397
398	return (
399		$startpos,    $opentagpos-$startpos,		# PREFIX
400		$opentagpos,  $textpos-$opentagpos,		# OPENING TAG
401		$textpos,     $parapos-$textpos,		# TEXT
402		$parapos,     0,				# NO CLOSING TAG
403		$parapos,     length($$textref)-$parapos,	# REMAINDER
404	       );
405
406matched:
407	$endpos = pos($$textref);
408	return (
409		$startpos,    $opentagpos-$startpos,		# PREFIX
410		$opentagpos,  $textpos-$opentagpos,		# OPENING TAG
411		$textpos,     $closetagpos-$textpos,		# TEXT
412		$closetagpos, $endpos-$closetagpos,		# CLOSING TAG
413		$endpos,      length($$textref)-$endpos,	# REMAINDER
414	       );
415
416failed:
417	_failmsg "Did not find closing tag", pos $$textref unless $@;
418	pos($$textref) = $startpos;
419	return;
420}
421
422sub extract_variable (;$$)
423{
424	my $textref = defined $_[0] ? \$_[0] : \$_;
425	return ("","","") unless defined $$textref;
426	my $pre  = defined $_[1] ? $_[1] : '\s*';
427
428	my @match = _match_variable($textref,$pre);
429
430	return _fail wantarray, $textref unless @match;
431
432	return _succeed wantarray, $textref,
433			@match[2..3,4..5,0..1];		# MATCH, REMAINDER, PREFIX
434}
435
436sub _match_variable($$)
437{
438#  $#
439#  $^
440#  $$
441	my ($textref, $pre) = @_;
442	my $startpos = pos($$textref) = pos($$textref)||0;
443	unless ($$textref =~ m/\G($pre)/gc)
444	{
445		_failmsg "Did not find prefix: /$pre/", pos $$textref;
446		return;
447	}
448	my $varpos = pos($$textref);
449        unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci)
450	{
451	    unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc)
452	    {
453		_failmsg "Did not find leading dereferencer", pos $$textref;
454		pos $$textref = $startpos;
455		return;
456	    }
457	    my $deref = $1;
458
459	    unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
460	    	or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0)
461		or $deref eq '$#' or $deref eq '$$' )
462	    {
463		_failmsg "Bad identifier after dereferencer", pos $$textref;
464		pos $$textref = $startpos;
465		return;
466	    }
467	}
468
469	while (1)
470	{
471		next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc;
472		next if _match_codeblock($textref,
473					 qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/,
474					 qr/[({[]/, qr/[)}\]]/,
475					 qr/[({[]/, qr/[)}\]]/, 0);
476		next if _match_codeblock($textref,
477					 qr/\s*/, qr/[{[]/, qr/[}\]]/,
478					 qr/[{[]/, qr/[}\]]/, 0);
479		next if _match_variable($textref,'\s*->\s*');
480		next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc;
481		last;
482	}
483
484	my $endpos = pos($$textref);
485	return ($startpos, $varpos-$startpos,
486		$varpos,   $endpos-$varpos,
487		$endpos,   length($$textref)-$endpos
488		);
489}
490
491sub extract_codeblock (;$$$$$)
492{
493	my $textref = defined $_[0] ? \$_[0] : \$_;
494	my $wantarray = wantarray;
495	my $ldel_inner = defined $_[1] ? $_[1] : '{';
496	my $pre        = defined $_[2] ? $_[2] : '\s*';
497	my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner;
498	my $rd         = $_[4];
499	my $rdel_inner = $ldel_inner;
500	my $rdel_outer = $ldel_outer;
501	my $posbug = pos;
502	for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds }
503	for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds }
504	for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer)
505	{
506		$_ = '('.join('|',map { quotemeta $_ } split('',$_)).')'
507	}
508	pos = $posbug;
509
510	my @match = _match_codeblock($textref, $pre,
511				     $ldel_outer, $rdel_outer,
512				     $ldel_inner, $rdel_inner,
513				     $rd);
514	return _fail($wantarray, $textref) unless @match;
515	return _succeed($wantarray, $textref,
516			@match[2..3,4..5,0..1]	# MATCH, REMAINDER, PREFIX
517		       );
518
519}
520
521sub _match_codeblock($$$$$$$)
522{
523	my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_;
524	my $startpos = pos($$textref) = pos($$textref) || 0;
525	unless ($$textref =~ m/\G($pre)/gc)
526	{
527		_failmsg qq{Did not match prefix /$pre/ at"} .
528			    substr($$textref,pos($$textref),20) .
529			    q{..."},
530		         pos $$textref;
531		return;
532	}
533	my $codepos = pos($$textref);
534	unless ($$textref =~ m/\G($ldel_outer)/gc)	# OUTERMOST DELIMITER
535	{
536		_failmsg qq{Did not find expected opening bracket at "} .
537			     substr($$textref,pos($$textref),20) .
538			     q{..."},
539		         pos $$textref;
540		pos $$textref = $startpos;
541		return;
542	}
543	my $closing = $1;
544	   $closing =~ tr/([<{/)]>}/;
545	my $matched;
546	my $patvalid = 1;
547	while (pos($$textref) < length($$textref))
548	{
549		$matched = '';
550		if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc)
551		{
552			$patvalid = 0;
553			next;
554		}
555
556		if ($$textref =~ m/\G\s*#.*/gc)
557		{
558			next;
559		}
560
561		if ($$textref =~ m/\G\s*($rdel_outer)/gc)
562		{
563			unless ($matched = ($closing && $1 eq $closing) )
564			{
565				next if $1 eq '>';	# MIGHT BE A "LESS THAN"
566				_failmsg q{Mismatched closing bracket at "} .
567					     substr($$textref,pos($$textref),20) .
568					     qq{...". Expected '$closing'},
569					 pos $$textref;
570			}
571			last;
572		}
573
574		if (_match_variable($textref,'\s*') ||
575		    _match_quotelike($textref,'\s*',$patvalid,$patvalid) )
576		{
577			$patvalid = 0;
578			next;
579		}
580
581
582		# NEED TO COVER MANY MORE CASES HERE!!!
583		if ($$textref =~ m#\G\s*(?!$ldel_inner)
584					( [-+*x/%^&|.]=?
585					| [!=]~
586					| =(?!>)
587					| (\*\*|&&|\|\||<<|>>)=?
588					| split|grep|map|return
589					| [([]
590					)#gcx)
591		{
592			$patvalid = 1;
593			next;
594		}
595
596		if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) )
597		{
598			$patvalid = 1;
599			next;
600		}
601
602		if ($$textref =~ m/\G\s*$ldel_outer/gc)
603		{
604			_failmsg q{Improperly nested codeblock at "} .
605				     substr($$textref,pos($$textref),20) .
606				     q{..."},
607				 pos $$textref;
608			last;
609		}
610
611		$patvalid = 0;
612		$$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc;
613	}
614	continue { $@ = undef }
615
616	unless ($matched)
617	{
618		_failmsg 'No match found for opening bracket', pos $$textref
619			unless $@;
620		return;
621	}
622
623	my $endpos = pos($$textref);
624	return ( $startpos, $codepos-$startpos,
625		 $codepos, $endpos-$codepos,
626		 $endpos,  length($$textref)-$endpos,
627	       );
628}
629
630
631my %mods   = (
632		'none'	=> '[cgimsox]*',
633		'm'	=> '[cgimsox]*',
634		's'	=> '[cegimsox]*',
635		'tr'	=> '[cds]*',
636		'y'	=> '[cds]*',
637		'qq'	=> '',
638		'qx'	=> '',
639		'qw'	=> '',
640		'qr'	=> '[imsx]*',
641		'q'	=> '',
642	     );
643
644sub extract_quotelike (;$$)
645{
646	my $textref = $_[0] ? \$_[0] : \$_;
647	my $wantarray = wantarray;
648	my $pre  = defined $_[1] ? $_[1] : '\s*';
649
650	my @match = _match_quotelike($textref,$pre,1,0);
651	return _fail($wantarray, $textref) unless @match;
652	return _succeed($wantarray, $textref,
653			$match[2], $match[18]-$match[2],	# MATCH
654			@match[18,19],				# REMAINDER
655			@match[0,1],				# PREFIX
656			@match[2..17],				# THE BITS
657			@match[20,21],				# ANY FILLET?
658		       );
659};
660
661sub _match_quotelike($$$$)	# ($textref, $prepat, $allow_raw_match)
662{
663	my ($textref, $pre, $rawmatch, $qmark) = @_;
664
665	my ($textlen,$startpos,
666	    $oppos,
667	    $preld1pos,$ld1pos,$str1pos,$rd1pos,
668	    $preld2pos,$ld2pos,$str2pos,$rd2pos,
669	    $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 );
670
671	unless ($$textref =~ m/\G($pre)/gc)
672	{
673		_failmsg qq{Did not find prefix /$pre/ at "} .
674			     substr($$textref, pos($$textref), 20) .
675			     q{..."},
676		         pos $$textref;
677		return;
678	}
679	$oppos = pos($$textref);
680
681	my $initial = substr($$textref,$oppos,1);
682
683	if ($initial && $initial =~ m|^[\"\'\`]|
684		     || $rawmatch && $initial =~ m|^/|
685		     || $qmark && $initial =~ m|^\?|)
686	{
687		unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx)
688		{
689			_failmsg qq{Did not find closing delimiter to match '$initial' at "} .
690				     substr($$textref, $oppos, 20) .
691				     q{..."},
692				 pos $$textref;
693			pos $$textref = $startpos;
694			return;
695		}
696		$modpos= pos($$textref);
697		$rd1pos = $modpos-1;
698
699		if ($initial eq '/' || $initial eq '?')
700		{
701			$$textref =~ m/\G$mods{none}/gc
702		}
703
704		my $endpos = pos($$textref);
705		return (
706			$startpos,	$oppos-$startpos,	# PREFIX
707			$oppos,		0,			# NO OPERATOR
708			$oppos,		1,			# LEFT DEL
709			$oppos+1,	$rd1pos-$oppos-1,	# STR/PAT
710			$rd1pos,	1,			# RIGHT DEL
711			$modpos,	0,			# NO 2ND LDEL
712			$modpos,	0,			# NO 2ND STR
713			$modpos,	0,			# NO 2ND RDEL
714			$modpos,	$endpos-$modpos,	# MODIFIERS
715			$endpos, 	$textlen-$endpos,	# REMAINDER
716		       );
717	}
718
719	unless ($$textref =~ m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc)
720	{
721		_failmsg q{No quotelike operator found after prefix at "} .
722			     substr($$textref, pos($$textref), 20) .
723			     q{..."},
724		         pos $$textref;
725		pos $$textref = $startpos;
726		return;
727	}
728
729	my $op = $1;
730	$preld1pos = pos($$textref);
731	if ($op eq '<<') {
732		$ld1pos = pos($$textref);
733		my $label;
734		if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) {
735			$label = $1;
736		}
737		elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) '
738				     | \G " ([^"\\]* (?:\\.[^"\\]*)*) "
739				     | \G ` ([^`\\]* (?:\\.[^`\\]*)*) `
740				     }gcsx) {
741			$label = $+;
742		}
743		else {
744			$label = "";
745		}
746		my $extrapos = pos($$textref);
747		$$textref =~ m{.*\n}gc;
748		$str1pos = pos($$textref)--;
749		unless ($$textref =~ m{.*?\n(?=\Q$label\E\n)}gc) {
750			_failmsg qq{Missing here doc terminator ('$label') after "} .
751				     substr($$textref, $startpos, 20) .
752				     q{..."},
753				 pos $$textref;
754			pos $$textref = $startpos;
755			return;
756		}
757		$rd1pos = pos($$textref);
758        $$textref =~ m{\Q$label\E\n}gc;
759		$ld2pos = pos($$textref);
760		return (
761			$startpos,	$oppos-$startpos,	# PREFIX
762			$oppos,		length($op),		# OPERATOR
763			$ld1pos,	$extrapos-$ld1pos,	# LEFT DEL
764			$str1pos,	$rd1pos-$str1pos,	# STR/PAT
765			$rd1pos,	$ld2pos-$rd1pos,	# RIGHT DEL
766			$ld2pos,	0,			# NO 2ND LDEL
767			$ld2pos,	0,                	# NO 2ND STR
768			$ld2pos,	0,	                # NO 2ND RDEL
769			$ld2pos,	0,                      # NO MODIFIERS
770			$ld2pos,	$textlen-$ld2pos,	# REMAINDER
771			$extrapos,      $str1pos-$extrapos,	# FILLETED BIT
772		       );
773	}
774
775	$$textref =~ m/\G\s*/gc;
776	$ld1pos = pos($$textref);
777	$str1pos = $ld1pos+1;
778
779	unless ($$textref =~ m/\G(\S)/gc)	# SHOULD USE LOOKAHEAD
780	{
781		_failmsg "No block delimiter found after quotelike $op",
782		         pos $$textref;
783		pos $$textref = $startpos;
784		return;
785	}
786	pos($$textref) = $ld1pos;	# HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN
787	my ($ldel1, $rdel1) = ("\Q$1","\Q$1");
788	if ($ldel1 =~ /[[(<{]/)
789	{
790		$rdel1 =~ tr/[({</])}>/;
791		defined(_match_bracketed($textref,"",$ldel1,"","",$rdel1))
792		|| do { pos $$textref = $startpos; return };
793        $ld2pos = pos($$textref);
794        $rd1pos = $ld2pos-1;
795	}
796	else
797	{
798		$$textref =~ /\G$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs
799		|| do { pos $$textref = $startpos; return };
800        $ld2pos = $rd1pos = pos($$textref)-1;
801	}
802
803	my $second_arg = $op =~ /s|tr|y/ ? 1 : 0;
804	if ($second_arg)
805	{
806		my ($ldel2, $rdel2);
807		if ($ldel1 =~ /[[(<{]/)
808		{
809			unless ($$textref =~ /\G\s*(\S)/gc)	# SHOULD USE LOOKAHEAD
810			{
811				_failmsg "Missing second block for quotelike $op",
812					 pos $$textref;
813				pos $$textref = $startpos;
814				return;
815			}
816			$ldel2 = $rdel2 = "\Q$1";
817			$rdel2 =~ tr/[({</])}>/;
818		}
819		else
820		{
821			$ldel2 = $rdel2 = $ldel1;
822		}
823		$str2pos = $ld2pos+1;
824
825		if ($ldel2 =~ /[[(<{]/)
826		{
827			pos($$textref)--;	# OVERCOME BROKEN LOOKAHEAD
828			defined(_match_bracketed($textref,"",$ldel2,"","",$rdel2))
829			|| do { pos $$textref = $startpos; return };
830		}
831		else
832		{
833			$$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs
834			|| do { pos $$textref = $startpos; return };
835		}
836		$rd2pos = pos($$textref)-1;
837	}
838	else
839	{
840		$ld2pos = $str2pos = $rd2pos = $rd1pos;
841	}
842
843	$modpos = pos $$textref;
844
845	$$textref =~ m/\G($mods{$op})/gc;
846	my $endpos = pos $$textref;
847
848	return (
849		$startpos,	$oppos-$startpos,	# PREFIX
850		$oppos,		length($op),		# OPERATOR
851		$ld1pos,	1,			# LEFT DEL
852		$str1pos,	$rd1pos-$str1pos,	# STR/PAT
853		$rd1pos,	1,			# RIGHT DEL
854		$ld2pos,	$second_arg,		# 2ND LDEL (MAYBE)
855		$str2pos,	$rd2pos-$str2pos,	# 2ND STR (MAYBE)
856		$rd2pos,	$second_arg,		# 2ND RDEL (MAYBE)
857		$modpos,	$endpos-$modpos,	# MODIFIERS
858		$endpos,	$textlen-$endpos,	# REMAINDER
859	       );
860}
861
862my $def_func = [
863	sub { extract_variable($_[0], '') },
864	sub { extract_quotelike($_[0],'') },
865	sub { extract_codeblock($_[0],'{}','') },
866];
867
868sub extract_multiple (;$$$$)	# ($text, $functions_ref, $max_fields, $ignoreunknown)
869{
870	my $textref = defined($_[0]) ? \$_[0] : \$_;
871	my $posbug = pos;
872	my ($lastpos, $firstpos);
873	my @fields = ();
874
875	#for ($$textref)
876	{
877		my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
878		my $max  = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
879		my $igunk = $_[3];
880
881		pos $$textref ||= 0;
882
883		unless (wantarray)
884		{
885			use Carp;
886			carp "extract_multiple reset maximal count to 1 in scalar context"
887				if $^W && defined($_[2]) && $max > 1;
888			$max = 1
889		}
890
891		my $unkpos;
892		my $func;
893		my $class;
894
895		my @class;
896		foreach $func ( @func )
897		{
898			if (ref($func) eq 'HASH')
899			{
900				push @class, (keys %$func)[0];
901				$func = (values %$func)[0];
902			}
903			else
904			{
905				push @class, undef;
906			}
907		}
908
909		FIELD: while (pos($$textref) < length($$textref))
910		{
911			my ($field, $rem);
912			my @bits;
913			foreach my $i ( 0..$#func )
914			{
915				my $pref;
916				$func = $func[$i];
917				$class = $class[$i];
918				$lastpos = pos $$textref;
919				if (ref($func) eq 'CODE')
920					{ ($field,$rem,$pref) = @bits = $func->($$textref) }
921				elsif (ref($func) eq 'Text::Balanced::Extractor')
922					{ @bits = $field = $func->extract($$textref) }
923				elsif( $$textref =~ m/\G$func/gc )
924					{ @bits = $field = defined($1)
925                                ? $1
926                                : substr($$textref, $-[0], $+[0] - $-[0])
927                    }
928				$pref ||= "";
929				if (defined($field) && length($field))
930				{
931					if (!$igunk) {
932						$unkpos = $lastpos
933							if length($pref) && !defined($unkpos);
934						if (defined $unkpos)
935						{
936							push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref;
937							$firstpos = $unkpos unless defined $firstpos;
938							undef $unkpos;
939							last FIELD if @fields == $max;
940						}
941					}
942					push @fields, $class
943						? bless (\$field, $class)
944						: $field;
945					$firstpos = $lastpos unless defined $firstpos;
946					$lastpos = pos $$textref;
947					last FIELD if @fields == $max;
948					next FIELD;
949				}
950			}
951			if ($$textref =~ /\G(.)/gcs)
952			{
953				$unkpos = pos($$textref)-1
954					unless $igunk || defined $unkpos;
955			}
956		}
957
958		if (defined $unkpos)
959		{
960			push @fields, substr($$textref, $unkpos);
961			$firstpos = $unkpos unless defined $firstpos;
962			$lastpos = length $$textref;
963		}
964		last;
965	}
966
967	pos $$textref = $lastpos;
968	return @fields if wantarray;
969
970	$firstpos ||= 0;
971	eval { substr($$textref,$firstpos,$lastpos-$firstpos)="";
972	       pos $$textref = $firstpos };
973	return $fields[0];
974}
975
976sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options)
977{
978	my $ldel    = $_[0];
979	my $rdel    = $_[1];
980	my $pre     = defined $_[2] ? $_[2] : '\s*';
981	my %options = defined $_[3] ? %{$_[3]} : ();
982	my $omode   = defined $options{fail} ? $options{fail} : '';
983	my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
984		    : defined($options{reject})	       ? $options{reject}
985		    :					 ''
986		    ;
987	my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
988		    : defined($options{ignore})	       ? $options{ignore}
989		    :					 ''
990		    ;
991
992	if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
993
994	my $posbug = pos;
995	for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ }
996	pos = $posbug;
997
998	my $closure = sub
999	{
1000		my $textref = defined $_[0] ? \$_[0] : \$_;
1001		my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
1002
1003		return _fail(wantarray, $textref) unless @match;
1004		return _succeed wantarray, $textref,
1005				$match[2], $match[3]+$match[5]+$match[7],	# MATCH
1006				@match[8..9,0..1,2..7];				# REM, PRE, BITS
1007	};
1008
1009	bless $closure, 'Text::Balanced::Extractor';
1010}
1011
1012package Text::Balanced::Extractor;
1013
1014sub extract($$)	# ($self, $text)
1015{
1016	&{$_[0]}($_[1]);
1017}
1018
1019package Text::Balanced::ErrorMsg;
1020
1021use overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" };
1022
10231;
1024
1025__END__
1026
1027=pod
1028
1029=head1 NAME
1030
1031Text::Balanced - Extract delimited text sequences from strings.
1032
1033=head1 SYNOPSIS
1034
1035 use Text::Balanced qw (
1036			extract_delimited
1037			extract_bracketed
1038			extract_quotelike
1039			extract_codeblock
1040			extract_variable
1041			extract_tagged
1042			extract_multiple
1043			gen_delimited_pat
1044			gen_extract_tagged
1045		       );
1046
1047 # Extract the initial substring of $text that is delimited by
1048 # two (unescaped) instances of the first character in $delim.
1049
1050	($extracted, $remainder) = extract_delimited($text,$delim);
1051
1052
1053 # Extract the initial substring of $text that is bracketed
1054 # with a delimiter(s) specified by $delim (where the string
1055 # in $delim contains one or more of '(){}[]<>').
1056
1057	($extracted, $remainder) = extract_bracketed($text,$delim);
1058
1059
1060 # Extract the initial substring of $text that is bounded by
1061 # an XML tag.
1062
1063	($extracted, $remainder) = extract_tagged($text);
1064
1065
1066 # Extract the initial substring of $text that is bounded by
1067 # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags
1068
1069	($extracted, $remainder) =
1070		extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]});
1071
1072
1073 # Extract the initial substring of $text that represents a
1074 # Perl "quote or quote-like operation"
1075
1076	($extracted, $remainder) = extract_quotelike($text);
1077
1078
1079 # Extract the initial substring of $text that represents a block
1080 # of Perl code, bracketed by any of character(s) specified by $delim
1081 # (where the string $delim contains one or more of '(){}[]<>').
1082
1083	($extracted, $remainder) = extract_codeblock($text,$delim);
1084
1085
1086 # Extract the initial substrings of $text that would be extracted by
1087 # one or more sequential applications of the specified functions
1088 # or regular expressions
1089
1090	@extracted = extract_multiple($text,
1091				      [ \&extract_bracketed,
1092					\&extract_quotelike,
1093					\&some_other_extractor_sub,
1094					qr/[xyz]*/,
1095					'literal',
1096				      ]);
1097
1098# Create a string representing an optimized pattern (a la Friedl)
1099# that matches a substring delimited by any of the specified characters
1100# (in this case: any type of quote or a slash)
1101
1102	$patstring = gen_delimited_pat(q{'"`/});
1103
1104# Generate a reference to an anonymous sub that is just like extract_tagged
1105# but pre-compiled and optimized for a specific pair of tags, and consequently
1106# much faster (i.e. 3 times faster). It uses qr// for better performance on
1107# repeated calls, so it only works under Perl 5.005 or later.
1108
1109	$extract_head = gen_extract_tagged('<HEAD>','</HEAD>');
1110
1111	($extracted, $remainder) = $extract_head->($text);
1112
1113=head1 DESCRIPTION
1114
1115The various C<extract_...> subroutines may be used to
1116extract a delimited substring, possibly after skipping a
1117specified prefix string. By default, that prefix is
1118optional whitespace (C</\s*/>), but you can change it to whatever
1119you wish (see below).
1120
1121The substring to be extracted must appear at the
1122current C<pos> location of the string's variable
1123(or at index zero, if no C<pos> position is defined).
1124In other words, the C<extract_...> subroutines I<don't>
1125extract the first occurrence of a substring anywhere
1126in a string (like an unanchored regex would). Rather,
1127they extract an occurrence of the substring appearing
1128immediately at the current matching position in the
1129string (like a C<\G>-anchored regex would).
1130
1131=head2 General behaviour in list contexts
1132
1133In a list context, all the subroutines return a list, the first three
1134elements of which are always:
1135
1136=over 4
1137
1138=item [0]
1139
1140The extracted string, including the specified delimiters.
1141If the extraction fails C<undef> is returned.
1142
1143=item [1]
1144
1145The remainder of the input string (i.e. the characters after the
1146extracted string). On failure, the entire string is returned.
1147
1148=item [2]
1149
1150The skipped prefix (i.e. the characters before the extracted string).
1151On failure, C<undef> is returned.
1152
1153=back
1154
1155Note that in a list context, the contents of the original input text (the first
1156argument) are not modified in any way.
1157
1158However, if the input text was passed in a variable, that variable's
1159C<pos> value is updated to point at the first character after the
1160extracted text. That means that in a list context the various
1161subroutines can be used much like regular expressions. For example:
1162
1163	while ( $next = (extract_quotelike($text))[0] )
1164	{
1165		# process next quote-like (in $next)
1166	}
1167
1168=head2 General behaviour in scalar and void contexts
1169
1170In a scalar context, the extracted string is returned, having first been
1171removed from the input text. Thus, the following code also processes
1172each quote-like operation, but actually removes them from $text:
1173
1174	while ( $next = extract_quotelike($text) )
1175	{
1176		# process next quote-like (in $next)
1177	}
1178
1179Note that if the input text is a read-only string (i.e. a literal),
1180no attempt is made to remove the extracted text.
1181
1182In a void context the behaviour of the extraction subroutines is
1183exactly the same as in a scalar context, except (of course) that the
1184extracted substring is not returned.
1185
1186=head2 A note about prefixes
1187
1188Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.)
1189This can bite you if you're expecting a prefix specification like
1190'.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix
1191pattern will only succeed if the <H1> tag is on the current line, since
1192. normally doesn't match newlines.
1193
1194To overcome this limitation, you need to turn on /s matching within
1195the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)'
1196
1197=head2 C<extract_delimited>
1198
1199The C<extract_delimited> function formalizes the common idiom
1200of extracting a single-character-delimited substring from the start of
1201a string. For example, to extract a single-quote delimited string, the
1202following code is typically used:
1203
1204	($remainder = $text) =~ s/\A('(\\.|[^'])*')//s;
1205	$extracted = $1;
1206
1207but with C<extract_delimited> it can be simplified to:
1208
1209	($extracted,$remainder) = extract_delimited($text, "'");
1210
1211C<extract_delimited> takes up to four scalars (the input text, the
1212delimiters, a prefix pattern to be skipped, and any escape characters)
1213and extracts the initial substring of the text that
1214is appropriately delimited. If the delimiter string has multiple
1215characters, the first one encountered in the text is taken to delimit
1216the substring.
1217The third argument specifies a prefix pattern that is to be skipped
1218(but must be present!) before the substring is extracted.
1219The final argument specifies the escape character to be used for each
1220delimiter.
1221
1222All arguments are optional. If the escape characters are not specified,
1223every delimiter is escaped with a backslash (C<\>).
1224If the prefix is not specified, the
1225pattern C<'\s*'> - optional whitespace - is used. If the delimiter set
1226is also not specified, the set C</["'`]/> is used. If the text to be processed
1227is not specified either, C<$_> is used.
1228
1229In list context, C<extract_delimited> returns a array of three
1230elements, the extracted substring (I<including the surrounding
1231delimiters>), the remainder of the text, and the skipped prefix (if
1232any). If a suitable delimited substring is not found, the first
1233element of the array is the empty string, the second is the complete
1234original text, and the prefix returned in the third element is an
1235empty string.
1236
1237In a scalar context, just the extracted substring is returned. In
1238a void context, the extracted substring (and any prefix) are simply
1239removed from the beginning of the first argument.
1240
1241Examples:
1242
1243	# Remove a single-quoted substring from the very beginning of $text:
1244
1245		$substring = extract_delimited($text, "'", '');
1246
1247	# Remove a single-quoted Pascalish substring (i.e. one in which
1248	# doubling the quote character escapes it) from the very
1249	# beginning of $text:
1250
1251		$substring = extract_delimited($text, "'", '', "'");
1252
1253	# Extract a single- or double- quoted substring from the
1254	# beginning of $text, optionally after some whitespace
1255	# (note the list context to protect $text from modification):
1256
1257		($substring) = extract_delimited $text, q{"'};
1258
1259	# Delete the substring delimited by the first '/' in $text:
1260
1261		$text = join '', (extract_delimited($text,'/','[^/]*')[2,1];
1262
1263Note that this last example is I<not> the same as deleting the first
1264quote-like pattern. For instance, if C<$text> contained the string:
1265
1266	"if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }"
1267
1268then after the deletion it would contain:
1269
1270	"if ('.$UNIXCMD/s) { $cmd = $1; }"
1271
1272not:
1273
1274	"if ('./cmd' =~ ms) { $cmd = $1; }"
1275
1276See L<"extract_quotelike"> for a (partial) solution to this problem.
1277
1278=head2 C<extract_bracketed>
1279
1280Like C<"extract_delimited">, the C<extract_bracketed> function takes
1281up to three optional scalar arguments: a string to extract from, a delimiter
1282specifier, and a prefix pattern. As before, a missing prefix defaults to
1283optional whitespace and a missing text defaults to C<$_>. However, a missing
1284delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below).
1285
1286C<extract_bracketed> extracts a balanced-bracket-delimited
1287substring (using any one (or more) of the user-specified delimiter
1288brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also
1289respect quoted unbalanced brackets (see below).
1290
1291A "delimiter bracket" is a bracket in list of delimiters passed as
1292C<extract_bracketed>'s second argument. Delimiter brackets are
1293specified by giving either the left or right (or both!) versions
1294of the required bracket(s). Note that the order in which
1295two or more delimiter brackets are specified is not significant.
1296
1297A "balanced-bracket-delimited substring" is a substring bounded by
1298matched brackets, such that any other (left or right) delimiter
1299bracket I<within> the substring is also matched by an opposite
1300(right or left) delimiter bracket I<at the same level of nesting>. Any
1301type of bracket not in the delimiter list is treated as an ordinary
1302character.
1303
1304In other words, each type of bracket specified as a delimiter must be
1305balanced and correctly nested within the substring, and any other kind of
1306("non-delimiter") bracket in the substring is ignored.
1307
1308For example, given the string:
1309
1310	$text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }";
1311
1312then a call to C<extract_bracketed> in a list context:
1313
1314	@result = extract_bracketed( $text, '{}' );
1315
1316would return:
1317
1318	( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" )
1319
1320since both sets of C<'{..}'> brackets are properly nested and evenly balanced.
1321(In a scalar context just the first element of the array would be returned. In
1322a void context, C<$text> would be replaced by an empty string.)
1323
1324Likewise the call in:
1325
1326	@result = extract_bracketed( $text, '{[' );
1327
1328would return the same result, since all sets of both types of specified
1329delimiter brackets are correctly nested and balanced.
1330
1331However, the call in:
1332
1333	@result = extract_bracketed( $text, '{([<' );
1334
1335would fail, returning:
1336
1337	( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }"  );
1338
1339because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and
1340the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would
1341return an empty string. In a void context, C<$text> would be unchanged.)
1342
1343Note that the embedded single-quotes in the string don't help in this
1344case, since they have not been specified as acceptable delimiters and are
1345therefore treated as non-delimiter characters (and ignored).
1346
1347However, if a particular species of quote character is included in the
1348delimiter specification, then that type of quote will be correctly handled.
1349for example, if C<$text> is:
1350
1351	$text = '<A HREF=">>>>">link</A>';
1352
1353then
1354
1355	@result = extract_bracketed( $text, '<">' );
1356
1357returns:
1358
1359	( '<A HREF=">>>>">', 'link</A>', "" )
1360
1361as expected. Without the specification of C<"> as an embedded quoter:
1362
1363	@result = extract_bracketed( $text, '<>' );
1364
1365the result would be:
1366
1367	( '<A HREF=">', '>>>">link</A>', "" )
1368
1369In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like
1370quoting (i.e. q{string}, qq{string}, etc) can be specified by including the
1371letter 'q' as a delimiter. Hence:
1372
1373	@result = extract_bracketed( $text, '<q>' );
1374
1375would correctly match something like this:
1376
1377	$text = '<leftop: conj /and/ conj>';
1378
1379See also: C<"extract_quotelike"> and C<"extract_codeblock">.
1380
1381=head2 C<extract_variable>
1382
1383C<extract_variable> extracts any valid Perl variable or
1384variable-involved expression, including scalars, arrays, hashes, array
1385accesses, hash look-ups, method calls through objects, subroutine calls
1386through subroutine references, etc.
1387
1388The subroutine takes up to two optional arguments:
1389
1390=over 4
1391
1392=item 1.
1393
1394A string to be processed (C<$_> if the string is omitted or C<undef>)
1395
1396=item 2.
1397
1398A string specifying a pattern to be matched as a prefix (which is to be
1399skipped). If omitted, optional whitespace is skipped.
1400
1401=back
1402
1403On success in a list context, an array of 3 elements is returned. The
1404elements are:
1405
1406=over 4
1407
1408=item [0]
1409
1410the extracted variable, or variablish expression
1411
1412=item [1]
1413
1414the remainder of the input text,
1415
1416=item [2]
1417
1418the prefix substring (if any),
1419
1420=back
1421
1422On failure, all of these values (except the remaining text) are C<undef>.
1423
1424In a scalar context, C<extract_variable> returns just the complete
1425substring that matched a variablish expression. C<undef> is returned on
1426failure. In addition, the original input text has the returned substring
1427(and any prefix) removed from it.
1428
1429In a void context, the input text just has the matched substring (and
1430any specified prefix) removed.
1431
1432
1433=head2 C<extract_tagged>
1434
1435C<extract_tagged> extracts and segments text between (balanced)
1436specified tags.
1437
1438The subroutine takes up to five optional arguments:
1439
1440=over 4
1441
1442=item 1.
1443
1444A string to be processed (C<$_> if the string is omitted or C<undef>)
1445
1446=item 2.
1447
1448A string specifying a pattern to be matched as the opening tag.
1449If the pattern string is omitted (or C<undef>) then a pattern
1450that matches any standard XML tag is used.
1451
1452=item 3.
1453
1454A string specifying a pattern to be matched at the closing tag.
1455If the pattern string is omitted (or C<undef>) then the closing
1456tag is constructed by inserting a C</> after any leading bracket
1457characters in the actual opening tag that was matched (I<not> the pattern
1458that matched the tag). For example, if the opening tag pattern
1459is specified as C<'{{\w+}}'> and actually matched the opening tag
1460C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">.
1461
1462=item 4.
1463
1464A string specifying a pattern to be matched as a prefix (which is to be
1465skipped). If omitted, optional whitespace is skipped.
1466
1467=item 5.
1468
1469A hash reference containing various parsing options (see below)
1470
1471=back
1472
1473The various options that can be specified are:
1474
1475=over 4
1476
1477=item C<reject =E<gt> $listref>
1478
1479The list reference contains one or more strings specifying patterns
1480that must I<not> appear within the tagged text.
1481
1482For example, to extract
1483an HTML link (which should not contain nested links) use:
1484
1485        extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
1486
1487=item C<ignore =E<gt> $listref>
1488
1489The list reference contains one or more strings specifying patterns
1490that are I<not> be be treated as nested tags within the tagged text
1491(even if they would match the start tag pattern).
1492
1493For example, to extract an arbitrary XML tag, but ignore "empty" elements:
1494
1495        extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
1496
1497(also see L<"gen_delimited_pat"> below).
1498
1499=item C<fail =E<gt> $str>
1500
1501The C<fail> option indicates the action to be taken if a matching end
1502tag is not encountered (i.e. before the end of the string or some
1503C<reject> pattern matches). By default, a failure to match a closing
1504tag causes C<extract_tagged> to immediately fail.
1505
1506However, if the string value associated with <reject> is "MAX", then
1507C<extract_tagged> returns the complete text up to the point of failure.
1508If the string is "PARA", C<extract_tagged> returns only the first paragraph
1509after the tag (up to the first line that is either empty or contains
1510only whitespace characters).
1511If the string is "", the default behaviour (i.e. failure) is reinstated.
1512
1513For example, suppose the start tag "/para" introduces a paragraph, which then
1514continues until the next "/endpara" tag or until another "/para" tag is
1515encountered:
1516
1517        $text = "/para line 1\n\nline 3\n/para line 4";
1518
1519        extract_tagged($text, '/para', '/endpara', undef,
1520                                {reject => '/para', fail => MAX );
1521
1522        # EXTRACTED: "/para line 1\n\nline 3\n"
1523
1524Suppose instead, that if no matching "/endpara" tag is found, the "/para"
1525tag refers only to the immediately following paragraph:
1526
1527        $text = "/para line 1\n\nline 3\n/para line 4";
1528
1529        extract_tagged($text, '/para', '/endpara', undef,
1530                        {reject => '/para', fail => MAX );
1531
1532        # EXTRACTED: "/para line 1\n"
1533
1534Note that the specified C<fail> behaviour applies to nested tags as well.
1535
1536=back
1537
1538On success in a list context, an array of 6 elements is returned. The elements are:
1539
1540=over 4
1541
1542=item [0]
1543
1544the extracted tagged substring (including the outermost tags),
1545
1546=item [1]
1547
1548the remainder of the input text,
1549
1550=item [2]
1551
1552the prefix substring (if any),
1553
1554=item [3]
1555
1556the opening tag
1557
1558=item [4]
1559
1560the text between the opening and closing tags
1561
1562=item [5]
1563
1564the closing tag (or "" if no closing tag was found)
1565
1566=back
1567
1568On failure, all of these values (except the remaining text) are C<undef>.
1569
1570In a scalar context, C<extract_tagged> returns just the complete
1571substring that matched a tagged text (including the start and end
1572tags). C<undef> is returned on failure. In addition, the original input
1573text has the returned substring (and any prefix) removed from it.
1574
1575In a void context, the input text just has the matched substring (and
1576any specified prefix) removed.
1577
1578=head2 C<gen_extract_tagged>
1579
1580(Note: This subroutine is only available under Perl5.005)
1581
1582C<gen_extract_tagged> generates a new anonymous subroutine which
1583extracts text between (balanced) specified tags. In other words,
1584it generates a function identical in function to C<extract_tagged>.
1585
1586The difference between C<extract_tagged> and the anonymous
1587subroutines generated by
1588C<gen_extract_tagged>, is that those generated subroutines:
1589
1590=over 4
1591
1592=item *
1593
1594do not have to reparse tag specification or parsing options every time
1595they are called (whereas C<extract_tagged> has to effectively rebuild
1596its tag parser on every call);
1597
1598=item *
1599
1600make use of the new qr// construct to pre-compile the regexes they use
1601(whereas C<extract_tagged> uses standard string variable interpolation
1602to create tag-matching patterns).
1603
1604=back
1605
1606The subroutine takes up to four optional arguments (the same set as
1607C<extract_tagged> except for the string to be processed). It returns
1608a reference to a subroutine which in turn takes a single argument (the text to
1609be extracted from).
1610
1611In other words, the implementation of C<extract_tagged> is exactly
1612equivalent to:
1613
1614        sub extract_tagged
1615        {
1616                my $text = shift;
1617                $extractor = gen_extract_tagged(@_);
1618                return $extractor->($text);
1619        }
1620
1621(although C<extract_tagged> is not currently implemented that way, in order
1622to preserve pre-5.005 compatibility).
1623
1624Using C<gen_extract_tagged> to create extraction functions for specific tags
1625is a good idea if those functions are going to be called more than once, since
1626their performance is typically twice as good as the more general-purpose
1627C<extract_tagged>.
1628
1629
1630=head2 C<extract_quotelike>
1631
1632C<extract_quotelike> attempts to recognize, extract, and segment any
1633one of the various Perl quotes and quotelike operators (see
1634L<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket
1635delimiters (for the quotelike operators), and trailing modifiers are
1636all caught. For example, in:
1637
1638        extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
1639
1640        extract_quotelike '  "You said, \"Use sed\"."  '
1641
1642        extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
1643
1644        extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
1645
1646the full Perl quotelike operations are all extracted correctly.
1647
1648Note too that, when using the /x modifier on a regex, any comment
1649containing the current pattern delimiter will cause the regex to be
1650immediately terminated. In other words:
1651
1652        'm /
1653                (?i)            # CASE INSENSITIVE
1654                [a-z_]          # LEADING ALPHABETIC/UNDERSCORE
1655                [a-z0-9]*       # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
1656           /x'
1657
1658will be extracted as if it were:
1659
1660        'm /
1661                (?i)            # CASE INSENSITIVE
1662                [a-z_]          # LEADING ALPHABETIC/'
1663
1664This behaviour is identical to that of the actual compiler.
1665
1666C<extract_quotelike> takes two arguments: the text to be processed and
1667a prefix to be matched at the very beginning of the text. If no prefix
1668is specified, optional whitespace is the default. If no text is given,
1669C<$_> is used.
1670
1671In a list context, an array of 11 elements is returned. The elements are:
1672
1673=over 4
1674
1675=item [0]
1676
1677the extracted quotelike substring (including trailing modifiers),
1678
1679=item [1]
1680
1681the remainder of the input text,
1682
1683=item [2]
1684
1685the prefix substring (if any),
1686
1687=item [3]
1688
1689the name of the quotelike operator (if any),
1690
1691=item [4]
1692
1693the left delimiter of the first block of the operation,
1694
1695=item [5]
1696
1697the text of the first block of the operation
1698(that is, the contents of
1699a quote, the regex of a match or substitution or the target list of a
1700translation),
1701
1702=item [6]
1703
1704the right delimiter of the first block of the operation,
1705
1706=item [7]
1707
1708the left delimiter of the second block of the operation
1709(that is, if it is a C<s>, C<tr>, or C<y>),
1710
1711=item [8]
1712
1713the text of the second block of the operation
1714(that is, the replacement of a substitution or the translation list
1715of a translation),
1716
1717=item [9]
1718
1719the right delimiter of the second block of the operation (if any),
1720
1721=item [10]
1722
1723the trailing modifiers on the operation (if any).
1724
1725=back
1726
1727For each of the fields marked "(if any)" the default value on success is
1728an empty string.
1729On failure, all of these values (except the remaining text) are C<undef>.
1730
1731In a scalar context, C<extract_quotelike> returns just the complete substring
1732that matched a quotelike operation (or C<undef> on failure). In a scalar or
1733void context, the input text has the same substring (and any specified
1734prefix) removed.
1735
1736Examples:
1737
1738        # Remove the first quotelike literal that appears in text
1739
1740                $quotelike = extract_quotelike($text,'.*?');
1741
1742        # Replace one or more leading whitespace-separated quotelike
1743        # literals in $_ with "<QLL>"
1744
1745                do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
1746
1747
1748        # Isolate the search pattern in a quotelike operation from $text
1749
1750                ($op,$pat) = (extract_quotelike $text)[3,5];
1751                if ($op =~ /[ms]/)
1752                {
1753                        print "search pattern: $pat\n";
1754                }
1755                else
1756                {
1757                        print "$op is not a pattern matching operation\n";
1758                }
1759
1760=head2 C<extract_quotelike> and "here documents"
1761
1762C<extract_quotelike> can successfully extract "here documents" from an input
1763string, but with an important caveat in list contexts.
1764
1765Unlike other types of quote-like literals, a here document is rarely
1766a contiguous substring. For example, a typical piece of code using
1767here document might look like this:
1768
1769        <<'EOMSG' || die;
1770        This is the message.
1771        EOMSG
1772        exit;
1773
1774Given this as an input string in a scalar context, C<extract_quotelike>
1775would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG",
1776leaving the string " || die;\nexit;" in the original variable. In other words,
1777the two separate pieces of the here document are successfully extracted and
1778concatenated.
1779
1780In a list context, C<extract_quotelike> would return the list
1781
1782=over 4
1783
1784=item [0]
1785
1786"<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document,
1787including fore and aft delimiters),
1788
1789=item [1]
1790
1791" || die;\nexit;" (i.e. the remainder of the input text, concatenated),
1792
1793=item [2]
1794
1795"" (i.e. the prefix substring -- trivial in this case),
1796
1797=item [3]
1798
1799"<<" (i.e. the "name" of the quotelike operator)
1800
1801=item [4]
1802
1803"'EOMSG'" (i.e. the left delimiter of the here document, including any quotes),
1804
1805=item [5]
1806
1807"This is the message.\n" (i.e. the text of the here document),
1808
1809=item [6]
1810
1811"EOMSG" (i.e. the right delimiter of the here document),
1812
1813=item [7..10]
1814
1815"" (a here document has no second left delimiter, second text, second right
1816delimiter, or trailing modifiers).
1817
1818=back
1819
1820However, the matching position of the input variable would be set to
1821"exit;" (i.e. I<after> the closing delimiter of the here document),
1822which would cause the earlier " || die;\nexit;" to be skipped in any
1823sequence of code fragment extractions.
1824
1825To avoid this problem, when it encounters a here document whilst
1826extracting from a modifiable string, C<extract_quotelike> silently
1827rearranges the string to an equivalent piece of Perl:
1828
1829        <<'EOMSG'
1830        This is the message.
1831        EOMSG
1832        || die;
1833        exit;
1834
1835in which the here document I<is> contiguous. It still leaves the
1836matching position after the here document, but now the rest of the line
1837on which the here document starts is not skipped.
1838
1839To prevent <extract_quotelike> from mucking about with the input in this way
1840(this is the only case where a list-context C<extract_quotelike> does so),
1841you can pass the input variable as an interpolated literal:
1842
1843        $quotelike = extract_quotelike("$var");
1844
1845=head2 C<extract_codeblock>
1846
1847C<extract_codeblock> attempts to recognize and extract a balanced
1848bracket delimited substring that may contain unbalanced brackets
1849inside Perl quotes or quotelike operations. That is, C<extract_codeblock>
1850is like a combination of C<"extract_bracketed"> and
1851C<"extract_quotelike">.
1852
1853C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>:
1854a text to process, a set of delimiter brackets to look for, and a prefix to
1855match first. It also takes an optional fourth parameter, which allows the
1856outermost delimiter brackets to be specified separately (see below).
1857
1858Omitting the first argument (input text) means process C<$_> instead.
1859Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used.
1860Omitting the third argument (prefix argument) implies optional whitespace at the start.
1861Omitting the fourth argument (outermost delimiter brackets) indicates that the
1862value of the second argument is to be used for the outermost delimiters.
1863
1864Once the prefix an dthe outermost opening delimiter bracket have been
1865recognized, code blocks are extracted by stepping through the input text and
1866trying the following alternatives in sequence:
1867
1868=over 4
1869
1870=item 1.
1871
1872Try and match a closing delimiter bracket. If the bracket was the same
1873species as the last opening bracket, return the substring to that
1874point. If the bracket was mismatched, return an error.
1875
1876=item 2.
1877
1878Try to match a quote or quotelike operator. If found, call
1879C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return
1880the error it returned. Otherwise go back to step 1.
1881
1882=item 3.
1883
1884Try to match an opening delimiter bracket. If found, call
1885C<extract_codeblock> recursively to eat the embedded block. If the
1886recursive call fails, return an error. Otherwise, go back to step 1.
1887
1888=item 4.
1889
1890Unconditionally match a bareword or any other single character, and
1891then go back to step 1.
1892
1893=back
1894
1895Examples:
1896
1897        # Find a while loop in the text
1898
1899                if ($text =~ s/.*?while\s*\{/{/)
1900                {
1901                        $loop = "while " . extract_codeblock($text);
1902                }
1903
1904        # Remove the first round-bracketed list (which may include
1905        # round- or curly-bracketed code blocks or quotelike operators)
1906
1907                extract_codeblock $text, "(){}", '[^(]*';
1908
1909
1910The ability to specify a different outermost delimiter bracket is useful
1911in some circumstances. For example, in the Parse::RecDescent module,
1912parser actions which are to be performed only on a successful parse
1913are specified using a C<E<lt>defer:...E<gt>> directive. For example:
1914
1915        sentence: subject verb object
1916                        <defer: {$::theVerb = $item{verb}} >
1917
1918Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code
1919within the C<E<lt>defer:...E<gt>> directive, but there's a problem.
1920
1921A deferred action like this:
1922
1923                        <defer: {if ($count>10) {$count--}} >
1924
1925will be incorrectly parsed as:
1926
1927                        <defer: {if ($count>
1928
1929because the "less than" operator is interpreted as a closing delimiter.
1930
1931But, by extracting the directive using
1932S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>>
1933the '>' character is only treated as a delimited at the outermost
1934level of the code block, so the directive is parsed correctly.
1935
1936=head2 C<extract_multiple>
1937
1938The C<extract_multiple> subroutine takes a string to be processed and a
1939list of extractors (subroutines or regular expressions) to apply to that string.
1940
1941In an array context C<extract_multiple> returns an array of substrings
1942of the original string, as extracted by the specified extractors.
1943In a scalar context, C<extract_multiple> returns the first
1944substring successfully extracted from the original string. In both
1945scalar and void contexts the original string has the first successfully
1946extracted substring removed from it. In all contexts
1947C<extract_multiple> starts at the current C<pos> of the string, and
1948sets that C<pos> appropriately after it matches.
1949
1950Hence, the aim of of a call to C<extract_multiple> in a list context
1951is to split the processed string into as many non-overlapping fields as
1952possible, by repeatedly applying each of the specified extractors
1953to the remainder of the string. Thus C<extract_multiple> is
1954a generalized form of Perl's C<split> subroutine.
1955
1956The subroutine takes up to four optional arguments:
1957
1958=over 4
1959
1960=item 1.
1961
1962A string to be processed (C<$_> if the string is omitted or C<undef>)
1963
1964=item 2.
1965
1966A reference to a list of subroutine references and/or qr// objects and/or
1967literal strings and/or hash references, specifying the extractors
1968to be used to split the string. If this argument is omitted (or
1969C<undef>) the list:
1970
1971        [
1972                sub { extract_variable($_[0], '') },
1973                sub { extract_quotelike($_[0],'') },
1974                sub { extract_codeblock($_[0],'{}','') },
1975        ]
1976
1977is used.
1978
1979=item 3.
1980
1981An number specifying the maximum number of fields to return. If this
1982argument is omitted (or C<undef>), split continues as long as possible.
1983
1984If the third argument is I<N>, then extraction continues until I<N> fields
1985have been successfully extracted, or until the string has been completely
1986processed.
1987
1988Note that in scalar and void contexts the value of this argument is
1989automatically reset to 1 (under C<-w>, a warning is issued if the argument
1990has to be reset).
1991
1992=item 4.
1993
1994A value indicating whether unmatched substrings (see below) within the
1995text should be skipped or returned as fields. If the value is true,
1996such substrings are skipped. Otherwise, they are returned.
1997
1998=back
1999
2000The extraction process works by applying each extractor in
2001sequence to the text string.
2002
2003If the extractor is a subroutine it is called in a list context and is
2004expected to return a list of a single element, namely the extracted
2005text. It may optionally also return two further arguments: a string
2006representing the text left after extraction (like $' for a pattern
2007match), and a string representing any prefix skipped before the
2008extraction (like $` in a pattern match). Note that this is designed
2009to facilitate the use of other Text::Balanced subroutines with
2010C<extract_multiple>. Note too that the value returned by an extractor
2011subroutine need not bear any relationship to the corresponding substring
2012of the original text (see examples below).
2013
2014If the extractor is a precompiled regular expression or a string,
2015it is matched against the text in a scalar context with a leading
2016'\G' and the gc modifiers enabled. The extracted value is either
2017$1 if that variable is defined after the match, or else the
2018complete match (i.e. $&).
2019
2020If the extractor is a hash reference, it must contain exactly one element.
2021The value of that element is one of the
2022above extractor types (subroutine reference, regular expression, or string).
2023The key of that element is the name of a class into which the successful
2024return value of the extractor will be blessed.
2025
2026If an extractor returns a defined value, that value is immediately
2027treated as the next extracted field and pushed onto the list of fields.
2028If the extractor was specified in a hash reference, the field is also
2029blessed into the appropriate class,
2030
2031If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is
2032assumed to have failed to extract.
2033If none of the extractor subroutines succeeds, then one
2034character is extracted from the start of the text and the extraction
2035subroutines reapplied. Characters which are thus removed are accumulated and
2036eventually become the next field (unless the fourth argument is true, in which
2037case they are discarded).
2038
2039For example, the following extracts substrings that are valid Perl variables:
2040
2041        @fields = extract_multiple($text,
2042                                   [ sub { extract_variable($_[0]) } ],
2043                                   undef, 1);
2044
2045This example separates a text into fields which are quote delimited,
2046curly bracketed, and anything else. The delimited and bracketed
2047parts are also blessed to identify them (the "anything else" is unblessed):
2048
2049        @fields = extract_multiple($text,
2050                   [
2051                        { Delim => sub { extract_delimited($_[0],q{'"}) } },
2052                        { Brack => sub { extract_bracketed($_[0],'{}') } },
2053                   ]);
2054
2055This call extracts the next single substring that is a valid Perl quotelike
2056operator (and removes it from $text):
2057
2058        $quotelike = extract_multiple($text,
2059                                      [
2060                                        sub { extract_quotelike($_[0]) },
2061                                      ], undef, 1);
2062
2063Finally, here is yet another way to do comma-separated value parsing:
2064
2065        @fields = extract_multiple($csv_text,
2066                                  [
2067                                        sub { extract_delimited($_[0],q{'"}) },
2068                                        qr/([^,]+)(.*)/,
2069                                  ],
2070                                  undef,1);
2071
2072The list in the second argument means:
2073I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">.
2074The undef third argument means:
2075I<"...as many times as possible...">,
2076and the true value in the fourth argument means
2077I<"...discarding anything else that appears (i.e. the commas)">.
2078
2079If you wanted the commas preserved as separate fields (i.e. like split
2080does if your split pattern has capturing parentheses), you would
2081just make the last parameter undefined (or remove it).
2082
2083=head2 C<gen_delimited_pat>
2084
2085The C<gen_delimited_pat> subroutine takes a single (string) argument and
2086   > builds a Friedl-style optimized regex that matches a string delimited
2087by any one of the characters in the single argument. For example:
2088
2089        gen_delimited_pat(q{'"})
2090
2091returns the regex:
2092
2093        (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\')
2094
2095Note that the specified delimiters are automatically quotemeta'd.
2096
2097A typical use of C<gen_delimited_pat> would be to build special purpose tags
2098for C<extract_tagged>. For example, to properly ignore "empty" XML elements
2099(which might contain quoted strings):
2100
2101        my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>';
2102
2103        extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} );
2104
2105C<gen_delimited_pat> may also be called with an optional second argument,
2106which specifies the "escape" character(s) to be used for each delimiter.
2107For example to match a Pascal-style string (where ' is the delimiter
2108and '' is a literal ' within the string):
2109
2110        gen_delimited_pat(q{'},q{'});
2111
2112Different escape characters can be specified for different delimiters.
2113For example, to specify that '/' is the escape for single quotes
2114and '%' is the escape for double quotes:
2115
2116        gen_delimited_pat(q{'"},q{/%});
2117
2118If more delimiters than escape chars are specified, the last escape char
2119is used for the remaining delimiters.
2120If no escape char is specified for a given specified delimiter, '\' is used.
2121
2122=head2 C<delimited_pat>
2123
2124Note that C<gen_delimited_pat> was previously called C<delimited_pat>.
2125That name may still be used, but is now deprecated.
2126
2127
2128=head1 DIAGNOSTICS
2129
2130In a list context, all the functions return C<(undef,$original_text)>
2131on failure. In a scalar context, failure is indicated by returning C<undef>
2132(in this case the input text is not modified in any way).
2133
2134In addition, on failure in I<any> context, the C<$@> variable is set.
2135Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed
2136below.
2137Accessing C<$@-E<gt>{pos}> returns the offset into the original string at
2138which the error was detected (although not necessarily where it occurred!)
2139Printing C<$@> directly produces the error message, with the offset appended.
2140On success, the C<$@> variable is guaranteed to be C<undef>.
2141
2142The available diagnostics are:
2143
2144=over 4
2145
2146=item  C<Did not find a suitable bracket: "%s">
2147
2148The delimiter provided to C<extract_bracketed> was not one of
2149C<'()[]E<lt>E<gt>{}'>.
2150
2151=item  C<Did not find prefix: /%s/>
2152
2153A non-optional prefix was specified but wasn't found at the start of the text.
2154
2155=item  C<Did not find opening bracket after prefix: "%s">
2156
2157C<extract_bracketed> or C<extract_codeblock> was expecting a
2158particular kind of bracket at the start of the text, and didn't find it.
2159
2160=item  C<No quotelike operator found after prefix: "%s">
2161
2162C<extract_quotelike> didn't find one of the quotelike operators C<q>,
2163C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring
2164it was extracting.
2165
2166=item  C<Unmatched closing bracket: "%c">
2167
2168C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered
2169a closing bracket where none was expected.
2170
2171=item  C<Unmatched opening bracket(s): "%s">
2172
2173C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran
2174out of characters in the text before closing one or more levels of nested
2175brackets.
2176
2177=item C<Unmatched embedded quote (%s)>
2178
2179C<extract_bracketed> attempted to match an embedded quoted substring, but
2180failed to find a closing quote to match it.
2181
2182=item C<Did not find closing delimiter to match '%s'>
2183
2184C<extract_quotelike> was unable to find a closing delimiter to match the
2185one that opened the quote-like operation.
2186
2187=item  C<Mismatched closing bracket: expected "%c" but found "%s">
2188
2189C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found
2190a valid bracket delimiter, but it was the wrong species. This usually
2191indicates a nesting error, but may indicate incorrect quoting or escaping.
2192
2193=item  C<No block delimiter found after quotelike "%s">
2194
2195C<extract_quotelike> or C<extract_codeblock> found one of the
2196quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y>
2197without a suitable block after it.
2198
2199=item C<Did not find leading dereferencer>
2200
2201C<extract_variable> was expecting one of '$', '@', or '%' at the start of
2202a variable, but didn't find any of them.
2203
2204=item C<Bad identifier after dereferencer>
2205
2206C<extract_variable> found a '$', '@', or '%' indicating a variable, but that
2207character was not followed by a legal Perl identifier.
2208
2209=item C<Did not find expected opening bracket at %s>
2210
2211C<extract_codeblock> failed to find any of the outermost opening brackets
2212that were specified.
2213
2214=item C<Improperly nested codeblock at %s>
2215
2216A nested code block was found that started with a delimiter that was specified
2217as being only to be used as an outermost bracket.
2218
2219=item  C<Missing second block for quotelike "%s">
2220
2221C<extract_codeblock> or C<extract_quotelike> found one of the
2222quotelike operators C<s>, C<tr> or C<y> followed by only one block.
2223
2224=item C<No match found for opening bracket>
2225
2226C<extract_codeblock> failed to find a closing bracket to match the outermost
2227opening bracket.
2228
2229=item C<Did not find opening tag: /%s/>
2230
2231C<extract_tagged> did not find a suitable opening tag (after any specified
2232prefix was removed).
2233
2234=item C<Unable to construct closing tag to match: /%s/>
2235
2236C<extract_tagged> matched the specified opening tag and tried to
2237modify the matched text to produce a matching closing tag (because
2238none was specified). It failed to generate the closing tag, almost
2239certainly because the opening tag did not start with a
2240bracket of some kind.
2241
2242=item C<Found invalid nested tag: %s>
2243
2244C<extract_tagged> found a nested tag that appeared in the "reject" list
2245(and the failure mode was not "MAX" or "PARA").
2246
2247=item C<Found unbalanced nested tag: %s>
2248
2249C<extract_tagged> found a nested opening tag that was not matched by a
2250corresponding nested closing tag (and the failure mode was not "MAX" or "PARA").
2251
2252=item C<Did not find closing tag>
2253
2254C<extract_tagged> reached the end of the text without finding a closing tag
2255to match the original opening tag (and the failure mode was not
2256"MAX" or "PARA").
2257
2258=back
2259
2260=head1 AUTHOR
2261
2262Damian Conway (damian@conway.org)
2263
2264=head1 BUGS AND IRRITATIONS
2265
2266There are undoubtedly serious bugs lurking somewhere in this code, if
2267only because parts of it give the impression of understanding a great deal
2268more about Perl than they really do.
2269
2270Bug reports and other feedback are most welcome.
2271
2272=head1 COPYRIGHT
2273
2274Copyright 1997 - 2001 Damian Conway. All Rights Reserved.
2275
2276Some (minor) parts copyright 2009 Adam Kennedy.
2277
2278This module is free software. It may be used, redistributed
2279and/or modified under the same terms as Perl itself.
2280
2281=cut
2282