1/* __gmp_doscan -- formatted input internals.
2
3   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
4   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
5   FUTURE GNU MP RELEASES.
6
7Copyright 2001, 2002, 2003 Free Software Foundation, Inc.
8
9This file is part of the GNU MP Library.
10
11The GNU MP Library is free software; you can redistribute it and/or modify
12it under the terms of the GNU Lesser General Public License as published by
13the Free Software Foundation; either version 3 of the License, or (at your
14option) any later version.
15
16The GNU MP Library is distributed in the hope that it will be useful, but
17WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
18or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
19License for more details.
20
21You should have received a copy of the GNU Lesser General Public License
22along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
23
24#define _GNU_SOURCE    /* for DECIMAL_POINT in langinfo.h */
25
26#include "config.h"
27
28#if HAVE_STDARG
29#include <stdarg.h>
30#else
31#include <varargs.h>
32#endif
33
34#include <ctype.h>
35#include <stddef.h>    /* for ptrdiff_t */
36#include <stdio.h>
37#include <stdlib.h>    /* for strtol */
38#include <string.h>
39
40#if HAVE_LANGINFO_H
41#include <langinfo.h>  /* for nl_langinfo */
42#endif
43
44#if HAVE_LOCALE_H
45#include <locale.h>    /* for localeconv */
46#endif
47
48#if HAVE_INTTYPES_H
49# include <inttypes.h> /* for intmax_t */
50#else
51# if HAVE_STDINT_H
52#  include <stdint.h>
53# endif
54#endif
55
56#if HAVE_SYS_TYPES_H
57#include <sys/types.h> /* for quad_t */
58#endif
59
60#include "gmp.h"
61#include "gmp-impl.h"
62
63
/* Debug tracing hook.  As defined here TRACE(x) expands to nothing, so the
   statements passed to it are compiled out.  Change this to
   "#define TRACE(x) x" for some traces. */
#define TRACE(x)
66
67
68/* General:
69
70       It's necessary to parse up the format string to recognise the GMP
71       extra types F, Q and Z.  Other types and conversions are passed
72       across to the standard sscanf or fscanf via funs->scan, for ease of
73       implementation.  This is essential in the case of something like glibc
74       %p where the pointer format isn't actually documented.
75
76       Because funs->scan doesn't get the whole input it can't put the right
77       values in for %n, so that's handled in __gmp_doscan.  Neither sscanf
78       nor fscanf directly indicate how many characters were read, so an
79       extra %n is appended to each run for that.  For fscanf this merely
80       supports our %n output, but for sscanf it lets funs->step move us
81       along the input string.
82
83       Whitespace and literal matches in the format string, including %%,
84       are handled directly within __gmp_doscan.  This is reasonably
85       efficient, and avoids some suspicious behaviour observed in various
86       system libc's.  GLIBC 2.2.4 for instance returns 0 on
87
88	   sscanf(" ", " x")
89       or
90	   sscanf(" ", " x%d",&n)
91
92       whereas we think they should return EOF, since end-of-string is
93       reached when a match of "x" is required.
94
95       For standard % conversions, funs->scan is called once for each
96       conversion.  If we had vfscanf and vsscanf and could rely on their
97       fixed text matching behaviour then we could call them with multiple
98       consecutive standard conversions.  But plain fscanf and sscanf work
99       fine, and parsing one field at a time shouldn't be too much of a
100       slowdown.
101
102   gmpscan:
103
104       gmpscan reads a gmp type.  It's only used from one place, but is a
105       separate subroutine to avoid a big chunk of complicated code in the
106       middle of __gmp_doscan.  Within gmpscan a couple of loopbacks make it
107       possible to share code for parsing integers, rationals and floats.
108
109       In gmpscan normally one char of lookahead is maintained, but when width
110       is reached that stops, on the principle that an fgetc/ungetc of a char
111       past where we're told to stop would be undesirable.  "chars" is how many
112       characters have been read so far, including the current c.  When
113       chars==width and another character is desired then a jump is done to the
114       "convert" stage.  c is invalid and mustn't be unget'ed in this case;
115       chars is set to width+1 to indicate that.
116
117       gmpscan normally returns the number of characters read.  -1 means an
118       invalid field, -2 means EOF reached before any matching characters
119       were read.
120
       For hex floats, the mantissa part is passed to mpf_set_str, then the
       exponent is applied with mpf_mul_2exp or mpf_div_2exp.  This is easier
       than teaching mpf_set_str about an exponent factor (ie. 2) differing
       from the mantissa radix point factor (ie. 16).  mpf_mul_2exp and
       mpf_div_2exp will preserve the application requested precision, so
       nothing in that respect is lost by making this a two-step process.
127
128   Matching and errors:
129
130       C99 7.19.6.2 paras 9 and 10 say an input item is read as the longest
131       string which is a match for the appropriate type, or a prefix of a
132       match.  With that done, if it's only a prefix then the result is a
133       matching failure, ie. invalid input.
134
135       This rule seems fairly clear, but doesn't seem to be universally
136       applied in system C libraries.  Even GLIBC doesn't seem to get it
137       right, insofar as it seems to accept some apparently invalid forms.
138       Eg. glibc 2.3.1 accepts "0x" for a "%i", where a reading of the
139       standard would suggest a non-empty sequence of digits should be
140       required after an "0x".
141
142       A footnote to 7.19.6.2 para 17 notes how this input item reading can
143       mean inputs acceptable to strtol are not acceptable to fscanf.  We
144       think this confirms our reading of "0x" as invalid.
145
146       Clearly gmp_sscanf could backtrack to a longest input which was a
147       valid match for a given item, but this is not done, since C99 says
148       sscanf is identical to fscanf, so we make gmp_sscanf identical to
149       gmp_fscanf.
150
151   Types:
152
153       C99 says "ll" is for long long, and "L" is for long double floats.
154       Unfortunately in GMP 4.1.1 we documented the two as equivalent.  This
155       doesn't affect us directly, since both are passed through to plain
156       scanf.  It seems wisest not to try to enforce the C99 rule.  This is
157       consistent with what we said before, though whether it actually
158       worked was always up to the C library.
159
160   Alternatives:
161
       Consideration was given to using separate code for gmp_fscanf and
       gmp_sscanf.  The sscanf case could zip across a string doing literal
       matches or recognising digits in gmpscan, rather than making a
       function call funs->get per character.  The fscanf could use getc
       rather than fgetc too, which might help those systems where getc is a
       macro or otherwise inlined.  But none of this scanning and converting
       will be particularly fast, so the two are done together to keep it a
       little simpler for now.
170
171       Various multibyte string issues are not addressed, for a start C99
172       scanf says the format string is multibyte.  Since we pass %c, %s and
173       %[ to the system scanf, they might do multibyte reads already, but
174       it's another matter whether or not that can be used, since our digit
175       and whitespace parsing is only unibyte.  The plan is to quietly
176       ignore multibyte locales for now.  This is not as bad as it sounds,
177       since GMP is presumably used mostly on numbers, which can be
178       perfectly adequately treated in plain ASCII.
179
180*/
181
182
/* The parsed form of one "%" conversion.  __gmp_doscan fills this in while
   walking the format string and hands it to gmpscan for the GMP types.  */
struct gmp_doscan_params_t {
  /* Radix.  __gmp_doscan leaves this 0 for the e,f,g,i conversions, and
     sets 10 for d and u, 8 for o, 16 for x and X.  gmpscan treats 0 as
     "work it out from a leading 0 or 0x".  */
  int	base;
  /* Non-zero when "*" appeared, meaning parse the field but store nothing
     and consume no argument.  */
  int	ignore;
  /* Type modifier character, or '\0' if none was given.  'H' is the
     internal code for "hh", and "ll" is recorded as 'L'.  */
  char	type;
  /* Field width from the format string.  0 means no width was given, which
     gmpscan turns into INT_MAX-1.  */
  int	width;
};
189
190
/* Fetch the next input character into "c" for gmpscan.  The running count
   "chars" is bumped first, and if that takes it beyond "width" nothing is
   read: control jumps to the "convert" label with "c" left as it was.
   Relies on chars, width, funs and data being in the enclosing scope.  */
#define GET(c)				\
  do {					\
    ASSERT (chars <= width);		\
    if (++chars > width)		\
      goto convert;			\
    (c) = (*funs->get) (data);		\
  } while (0)
199
/* How many chars the digit buffer "s" starts with and grows by. */
#define S_ALLOC_STEP  512

/* Append "c" to the buffer "s", enlarging it by S_ALLOC_STEP first when it
   is already full.  Relies on s, s_upto and s_alloc being in the enclosing
   scope. */
#define STORE(c)							\
  do {									\
    ASSERT (s_upto <= s_alloc);						\
    if (s_upto >= s_alloc)						\
      {									\
	size_t	grown = s_alloc + S_ALLOC_STEP;				\
	s = __GMP_REALLOCATE_FUNC_TYPE (s, s_alloc, grown, char);	\
	s_alloc = grown;						\
      }									\
    s[s_upto] = (c);							\
    s_upto++;								\
  } while (0)
214
/* Read one GMP number from the input and, unless p->ignore is set, convert
   it into dst.  p->type selects what dst is: 'F' for mpf_ptr, 'Q' for
   mpq_ptr, 'Z' for mpz_ptr.

   funs and data are the input source; characters come from funs->get and a
   lookahead character is given back with funs->unget.  p->base is the
   radix, 0 meaning determine it from a leading "0" or "0x".  p->width is
   the most characters to read, 0 meaning no limit.

   The return value is the number of characters consumed, or -1 if the
   input was not a valid number, or -2 if EOF was the first thing read.  */
static int
gmpscan (const struct gmp_doscan_funs_t *funs, void *data,
	 const struct gmp_doscan_params_t *p, void *dst)
{
  int	  chars, c, base, first, width, seen_point, seen_digit, hexfloat;
  size_t  s_upto, s_alloc, hexexp;
  char	  *s;
  int	  invalid = 0;

  TRACE (printf ("gmpscan\n"));

  ASSERT (p->type == 'F' || p->type == 'Q' || p->type == 'Z');

  /* Nothing has been allocated yet, so EOF here can return directly. */
  c = (*funs->get) (data);
  if (c == EOF)
    return -2;

  chars = 1;	  /* characters read so far, including the lookahead c */
  first = 1;	  /* still in the mantissa / numerator part */
  seen_point = 0;
  /* INT_MAX-1 rather than INT_MAX so that width+1 is representable */
  width = (p->width == 0 ? INT_MAX-1 : p->width);
  base = p->base;
  s_alloc = S_ALLOC_STEP;
  s = __GMP_ALLOCATE_FUNC_TYPE (s_alloc, char);
  s_upto = 0;
  hexfloat = 0;
  hexexp = 0;	  /* index in s of a hex float's 'p', 0 if there isn't one */

  /* Parse an optionally signed run of digits.  Come back here for the
     second part of the number: an exponent or a denominator.  */
 another:
  seen_digit = 0;
  if (c == '-')
    {
      STORE (c);
      goto get_for_sign;
    }
  else if (c == '+')
    {
      /* don't store '+', it's not accepted by mpz_set_str etc */
    get_for_sign:
      GET (c);
    }

  if (base == 0)
    {
      base = 10;		  /* decimal if no base indicator */
      if (c == '0')
	{
	  seen_digit = 1;	  /* 0 alone is a valid number */
	  if (p->type != 'F')
	    base = 8;		  /* leading 0 is octal, for non-floats */
	  STORE (c);
	  GET (c);
	  if (c == 'x' || c == 'X')
	    {
	      base = 16;
	      seen_digit = 0;	  /* must have digits after an 0x */
	      if (p->type == 'F') /* don't pass 'x' to mpf_set_str_point */
		hexfloat = 1;
	      else
		STORE (c);
	      GET (c);
	    }
	}
    }

  /* Accumulate digits valid in the current base.  The decimal point code
     below jumps back here for the fraction digits.  */
 digits:
  for (;;)
    {
      if (base == 16)
	{
	  if (! isxdigit (c))
	    break;
	}
      else
	{
	  if (! isdigit (c))
	    break;
	  if (base == 8 && (c == '8' || c == '9'))
	    break;
	}

      seen_digit = 1;
      STORE (c);
      GET (c);
    }

  /* Decimal point, exponent and denominator are only looked for in the
     first part of the number.  */
  if (first)
    {
      /* decimal point */
      if (p->type == 'F' && ! seen_point)
	{
	  /* For a multi-character decimal point, if the first character is
	     present then all of it must be, otherwise the input is
	     considered invalid.  */
	  const char  *point = GMP_DECIMAL_POINT;
	  int	      pc = (unsigned char) *point++;
	  if (c == pc)
	    {
	      for (;;)
		{
		  STORE (c);
		  GET (c);
		  pc = (unsigned char) *point++;
		  if (pc == '\0')
		    break;
		  if (c != pc)
		    goto set_invalid;
		}
	      seen_point = 1;
	      goto digits;
	    }
	}

      /* exponent */
      if (p->type == 'F')
	{
	  if (hexfloat && (c == 'p' || c == 'P'))
	    {
	      hexexp = s_upto; /* exponent location */
	      base = 10;       /* exponent in decimal */
	      goto exponent;
	    }
	  else if (! hexfloat && (c == 'e' || c == 'E'))
	    {
	    exponent:
	      /* must have at least one digit in the mantissa, just an exponent
		 is not good enough */
	      if (! seen_digit)
		goto set_invalid;

	      /* Store the separator (exponent letter or '/') and go round
		 again for the second signed digit string.  Clearing "first"
		 keeps this section from being entered a second time.  */
	    do_second:
	      first = 0;
	      STORE (c);
	      GET (c);
	      goto another;
	    }
	}

      /* denominator */
      if (p->type == 'Q' && c == '/')
	{
	  /* must have at least one digit in the numerator */
	  if (! seen_digit)
	    goto set_invalid;

	  /* now look for at least one digit in the denominator */
	  seen_digit = 0;

	  /* allow the base to be redetermined for "%i" */
	  base = p->base;
	  goto do_second;
	}
    }

  /* Reached by falling through at the end of the number, or from GET when
     the width has been used up.  */
 convert:
  if (! seen_digit)
    {
    set_invalid:
      invalid = 1;
      goto done;
    }

  if (! p->ignore)
    {
      STORE ('\0');
      TRACE (printf ("	convert \"%s\"\n", s));

      /* We ought to have parsed out a valid string above, so just test
	 mpz_set_str etc with an ASSERT.  */
      switch (p->type) {
      case 'F':
	{
	  mpf_ptr  f = (mpf_ptr) dst;
	  /* For a hex float cut the string at the 'p', so mpf_set_str sees
	     only the mantissa.  */
	  if (hexexp != 0)
	    s[hexexp] = '\0';
	  ASSERT_NOCARRY (mpf_set_str (f, s, hexfloat ? 16 : 10));
	  if (hexexp != 0)
	    {
	      /* The decimal exponent follows the 'p'; apply it as a power
		 of 2.  */
	      char *dummy;
	      long  exp;
	      exp = strtol (s + hexexp + 1, &dummy, 10);
	      if (exp >= 0)
		mpf_mul_2exp (f, f, (unsigned long) exp);
	      else
		mpf_div_2exp (f, f, - (unsigned long) exp);
	    }
	}
	break;
      case 'Q':
	ASSERT_NOCARRY (mpq_set_str ((mpq_ptr) dst, s, p->base));
	break;
      case 'Z':
	ASSERT_NOCARRY (mpz_set_str ((mpz_ptr) dst, s, p->base));
	break;
      default:
	ASSERT (0);
	/*FALLTHRU*/
	break;
      }
    }

 done:
  ASSERT (chars <= width+1);
  /* chars==width+1 means GET stopped at the width without reading, so
     there's no lookahead to give back.  Otherwise c is one character past
     the number and is returned to the input.  */
  if (chars != width+1)
    {
      (*funs->unget) (c, data);
      TRACE (printf ("	ungetc %d, to give %d chars\n", c, chars-1));
    }
  /* chars counted one more than was actually consumed in either case */
  chars--;

  (*__gmp_free_func) (s, s_alloc);

  if (invalid)
    {
      TRACE (printf ("	invalid\n"));
      return -1;
    }

  TRACE (printf ("  return %d chars (cf width %d)\n", chars, width));
  return chars;
}
436
437
438/* Read and discard whitespace, if any.  Return number of chars skipped.
439   Whitespace skipping never provokes the EOF return from __gmp_doscan, so
440   it's not necessary to watch for EOF from funs->get, */
441static int
442skip_white (const struct gmp_doscan_funs_t *funs, void *data)
443{
444  int  c;
445  int  ret = 0;
446
447  do
448    {
449      c = (funs->get) (data);
450      ret++;
451    }
452  while (isspace (c));
453
454  (funs->unget) (c, data);
455  ret--;
456
457  TRACE (printf ("  skip white %d\n", ret));
458  return ret;
459}
460
461
462int
463__gmp_doscan (const struct gmp_doscan_funs_t *funs, void *data,
464	      const char *orig_fmt, va_list orig_ap)
465{
466  struct gmp_doscan_params_t  param;
467  va_list     ap;
468  char	      *alloc_fmt;
469  const char  *fmt, *this_fmt, *end_fmt;
470  size_t      orig_fmt_len, alloc_fmt_size, len;
471  int	      new_fields, new_chars;
472  char	      fchar;
473  int	      fields = 0;
474  int	      chars = 0;
475
476  TRACE (printf ("__gmp_doscan \"%s\"\n", orig_fmt);
477	 if (funs->scan == (gmp_doscan_scan_t) sscanf)
478	   printf ("  s=\"%s\"\n", * (const char **) data));
479
480  /* Don't modify orig_ap, if va_list is actually an array and hence call by
481     reference.  It could be argued that it'd be more efficient to leave
482     callers to make a copy if they care, but doing so here is going to be a
483     very small part of the total work, and we may as well keep applications
484     out of trouble.  */
485  va_copy (ap, orig_ap);
486
487  /* Parts of the format string are going to be copied so that a " %n" can
488     be appended.  alloc_fmt is some space for that.  orig_fmt_len+4 will be
489     needed if fmt consists of a single "%" specifier, but otherwise is an
490     overestimate.  We're not going to be very fast here, so use
491     __gmp_allocate_func rather than TMP_ALLOC.  */
492  orig_fmt_len = strlen (orig_fmt);
493  alloc_fmt_size = orig_fmt_len + 4;
494  alloc_fmt = __GMP_ALLOCATE_FUNC_TYPE (alloc_fmt_size, char);
495
496  fmt = orig_fmt;
497  end_fmt = orig_fmt + orig_fmt_len;
498
499  for (;;)
500    {
501    next:
502      fchar = *fmt++;
503
504      if (fchar == '\0')
505	break;
506
507      if (isspace (fchar))
508	{
509	  chars += skip_white (funs, data);
510	  continue;
511	}
512
513      if (fchar != '%')
514	{
515	  int  c;
516	literal:
517	  c = (funs->get) (data);
518	  if (c != fchar)
519	    {
520	      (funs->unget) (c, data);
521	      if (c == EOF)
522		{
523		eof_no_match:
524		  if (fields == 0)
525		    fields = EOF;
526		}
527	      goto done;
528	    }
529	  chars++;
530	  continue;
531	}
532
533      param.type = '\0';
534      param.base = 0;	 /* for e,f,g,i */
535      param.ignore = 0;
536      param.width = 0;
537
538      this_fmt = fmt-1;
539      TRACE (printf ("	this_fmt \"%s\"\n", this_fmt));
540
541      for (;;)
542	{
543	  ASSERT (fmt <= end_fmt);
544
545	  fchar = *fmt++;
546	  switch (fchar) {
547
548	  case '\0':  /* unterminated % sequence */
549	    ASSERT (0);
550	    goto done;
551
552	  case '%':   /* literal % */
553	    goto literal;
554
555	  case '[':   /* character range */
556	    fchar = *fmt++;
557	    if (fchar == '^')
558	      fchar = *fmt++;
559	    /* ']' allowed as the first char (possibly after '^') */
560	    if (fchar == ']')
561	      fchar = *fmt++;
562	    for (;;)
563	      {
564		ASSERT (fmt <= end_fmt);
565		if (fchar == '\0')
566		  {
567		    /* unterminated % sequence */
568		    ASSERT (0);
569		    goto done;
570		  }
571		if (fchar == ']')
572		  break;
573		fchar = *fmt++;
574	      }
575	    /*FALLTHRU*/
576	  case 'c':   /* characters */
577	  case 's':   /* string of non-whitespace */
578	  case 'p':   /* pointer */
579	  libc_type:
580	    len = fmt - this_fmt;
581	    memcpy (alloc_fmt, this_fmt, len);
582	    alloc_fmt[len++] = '%';
583	    alloc_fmt[len++] = 'n';
584	    alloc_fmt[len] = '\0';
585
586	    TRACE (printf ("  scan \"%s\"\n", alloc_fmt);
587		   if (funs->scan == (gmp_doscan_scan_t) sscanf)
588		     printf ("	s=\"%s\"\n", * (const char **) data));
589
590	    new_chars = -1;
591	    if (param.ignore)
592	      {
593		new_fields = (*funs->scan) (data, alloc_fmt, &new_chars, NULL);
594		ASSERT (new_fields == 0 || new_fields == EOF);
595	      }
596	    else
597	      {
598		void *arg = va_arg (ap, void *);
599		new_fields = (*funs->scan) (data, alloc_fmt, arg, &new_chars);
600		ASSERT (new_fields==0 || new_fields==1 || new_fields==EOF);
601
602		if (new_fields == 0)
603		  goto done;  /* invalid input */
604
605		if (new_fields == 1)
606		  ASSERT (new_chars != -1);
607	      }
608	    TRACE (printf ("  new_fields %d   new_chars %d\n",
609			   new_fields, new_chars));
610
611	    if (new_fields == -1)
612	      goto eof_no_match;  /* EOF before anything matched */
613
614	    /* Under param.ignore, when new_fields==0 we don't know if
615	       it's a successful match or an invalid field.  new_chars
616	       won't have been assigned if it was an invalid field.  */
617	    if (new_chars == -1)
618	      goto done;  /* invalid input */
619
620	    chars += new_chars;
621	    (*funs->step) (data, new_chars);
622
623	  increment_fields:
624	    if (! param.ignore)
625	      fields++;
626	    goto next;
627
628	  case 'd':   /* decimal */
629	  case 'u':   /* decimal */
630	    param.base = 10;
631	    goto numeric;
632
633	  case 'e':   /* float */
634	  case 'E':   /* float */
635	  case 'f':   /* float */
636	  case 'g':   /* float */
637	  case 'G':   /* float */
638	  case 'i':   /* integer with base marker */
639	  numeric:
640	    if (param.type != 'F' && param.type != 'Q' && param.type != 'Z')
641	      goto libc_type;
642
643	    chars += skip_white (funs, data);
644
645	    new_chars = gmpscan (funs, data, &param,
646				 param.ignore ? NULL : va_arg (ap, void*));
647	    if (new_chars == -2)
648	      goto eof_no_match;
649	    if (new_chars == -1)
650	      goto done;
651
652	    ASSERT (new_chars >= 0);
653	    chars += new_chars;
654	    goto increment_fields;
655
656	  case 'a':   /* glibc allocate string */
657	  case '\'':  /* glibc digit groupings */
658	    break;
659
660	  case 'F':   /* mpf_t */
661	  case 'j':   /* intmax_t */
662	  case 'L':   /* long long */
663	  case 'q':   /* quad_t */
664	  case 'Q':   /* mpq_t */
665	  case 't':   /* ptrdiff_t */
666	  case 'z':   /* size_t */
667	  case 'Z':   /* mpz_t */
668	  set_type:
669	    param.type = fchar;
670	    break;
671
672	  case 'h':   /* short or char */
673	    if (param.type != 'h')
674	      goto set_type;
675	    param.type = 'H';	/* internal code for "hh" */
676	    break;
677
678	    goto numeric;
679
680	  case 'l':   /* long, long long, double or long double */
681	    if (param.type != 'l')
682	      goto set_type;
683	    param.type = 'L';	/* "ll" means "L" */
684	    break;
685
686	  case 'n':
687	    if (! param.ignore)
688	      {
689		void  *p;
690		p = va_arg (ap, void *);
691		TRACE (printf ("  store %%n to %p\n", p));
692		switch (param.type) {
693		case '\0': * (int	*) p = chars; break;
694		case 'F':  mpf_set_si ((mpf_ptr) p, (long) chars); break;
695		case 'H':  * (char	*) p = chars; break;
696		case 'h':  * (short	*) p = chars; break;
697#if HAVE_INTMAX_T
698		case 'j':  * (intmax_t	*) p = chars; break;
699#else
700		case 'j':  ASSERT_FAIL (intmax_t not available); break;
701#endif
702		case 'l':  * (long	*) p = chars; break;
703#if HAVE_QUAD_T && HAVE_LONG_LONG
704		case 'q':
705		  ASSERT_ALWAYS (sizeof (quad_t) == sizeof (long long));
706		  /*FALLTHRU*/
707#else
708		case 'q':  ASSERT_FAIL (quad_t not available); break;
709#endif
710#if HAVE_LONG_LONG
711		case 'L':  * (long long *) p = chars; break;
712#else
713		case 'L':  ASSERT_FAIL (long long not available); break;
714#endif
715		case 'Q':  mpq_set_si ((mpq_ptr) p, (long) chars, 1L); break;
716#if HAVE_PTRDIFF_T
717		case 't':  * (ptrdiff_t *) p = chars; break;
718#else
719		case 't':  ASSERT_FAIL (ptrdiff_t not available); break;
720#endif
721		case 'z':  * (size_t	*) p = chars; break;
722		case 'Z':  mpz_set_si ((mpz_ptr) p, (long) chars); break;
723		default: ASSERT (0); break;
724		}
725	      }
726	    goto next;
727
728	  case 'o':
729	    param.base = 8;
730	    goto numeric;
731
732	  case 'x':
733	  case 'X':
734	    param.base = 16;
735	    goto numeric;
736
737	  case '0': case '1': case '2': case '3': case '4':
738	  case '5': case '6': case '7': case '8': case '9':
739	    param.width = 0;
740	    do {
741	      param.width = param.width * 10 + (fchar-'0');
742	      fchar = *fmt++;
743	    } while (isdigit (fchar));
744	    fmt--; /* unget the non-digit */
745	    break;
746
747	  case '*':
748	    param.ignore = 1;
749	    break;
750
751	  default:
752	    /* something invalid in a % sequence */
753	    ASSERT (0);
754	    goto next;
755	  }
756	}
757    }
758
759 done:
760  (*__gmp_free_func) (alloc_fmt, alloc_fmt_size);
761  return fields;
762}
763