1/* $Id: iptcutil.c 276 2010-06-30 12:18:30Z nijtmans $ */
2
3#include "tif_config.h"
4
5#include <stdio.h>
6#include <stdlib.h>
7#include <string.h>
8#include <memory.h>
9#include <ctype.h>
10
11#ifdef HAVE_STRINGS_H
12# include <strings.h>
13#endif
14
15#ifdef HAVE_IO_H
16# include <io.h>
17#endif
18
19#ifdef HAVE_FCNTL_H
20# include <fcntl.h>
21#endif
22
23#ifdef WIN32
24#define STRNICMP strnicmp
25#else
26#define STRNICMP strncasecmp
27#endif
28
/* Pairs an IPTC record number with the name printed for it in text form. */
typedef struct _tag_spec
{
  short id;    /* record number as it appears in the binary stream */
  char *name;  /* label emitted next to the record number */
} tag_spec;
37
/*
 * Record numbers that formatIPTC() can print with a name.  The table is
 * searched linearly; a record number that is not listed here is written
 * without a name.
 */
static tag_spec tags[] = {
    { 5,"Image Name" },
    { 7,"Edit Status" },
    { 10,"Priority" },
    { 15,"Category" },
    { 20,"Supplemental Category" },
    { 22,"Fixture Identifier" },
    { 25,"Keyword" },
    { 30,"Release Date" },
    { 35,"Release Time" },
    { 40,"Special Instructions" },
    { 45,"Reference Service" },
    { 47,"Reference Date" },
    { 50,"Reference Number" },
    { 55,"Created Date" },
    { 60,"Created Time" },
    { 65,"Originating Program" },
    { 70,"Program Version" },
    { 75,"Object Cycle" },
    { 80,"Byline" },
    { 85,"Byline Title" },
    { 90,"City" },
    { 95,"Province State" },
    { 100,"Country Code" },
    { 101,"Country" },
    { 103,"Original Transmission Reference" },
    { 105,"Headline" },
    { 110,"Credit" },
    { 115,"Source" },
    { 116,"Copyright String" },
    { 120,"Caption" },
    { 121,"Local Caption" },
    { 122,"Caption Writer" },
    { 200,"Custom Field 1" },
    { 201,"Custom Field 2" },
    { 202,"Custom Field 3" },
    { 203,"Custom Field 4" },
    { 204,"Custom Field 5" },
    { 205,"Custom Field 6" },
    { 206,"Custom Field 7" },
    { 207,"Custom Field 8" },
    { 208,"Custom Field 9" },
    { 209,"Custom Field 10" },
    { 210,"Custom Field 11" },
    { 211,"Custom Field 12" },
    { 212,"Custom Field 13" },
    { 213,"Custom Field 14" },
    { 214,"Custom Field 15" },
    { 215,"Custom Field 16" },
    { 216,"Custom Field 17" },
    { 217,"Custom Field 18" },
    { 218,"Custom Field 19" },
    { 219,"Custom Field 20" }
};
92
/*
 * Write the LEN bytes starting at S to OFILE as one double-quoted line.
 *
 * We format the output using HTML conventions to preserve control
 * characters and such: '&' and '"' are written as named entities (and so
 * are '<' and '>' when HANDLE_GT_LT is defined), control characters are
 * written as decimal "&#N;" references, and every other byte is copied
 * through unchanged.  The closing quote is followed by a newline.
 *
 * Nothing is returned; a LEN of zero or less produces an empty quoted
 * string.
 */
void formatString(FILE *ofile, const char *s, int len)
{
  putc('"', ofile);
  for (; len > 0; --len, ++s) {
    /*
     * Work on the byte as an unsigned value.  Plain char may be signed,
     * and passing a negative value to iscntrl() is undefined behaviour.
     */
    int c = (unsigned char) *s;
    switch (c) {
    case '&':
      fputs("&amp;", ofile);
      break;
#ifdef HANDLE_GT_LT
    case '<':
      fputs("&lt;", ofile);
      break;
    case '>':
      fputs("&gt;", ofile);
      break;
#endif
    case '"':
      fputs("&quot;", ofile);
      break;
    default:
      if (iscntrl(c))
        fprintf(ofile, "&#%d;", c);
      else
        putc(c, ofile);
      break;
    }
  }
  fputs("\"\n", ofile);
}
127
/* One named HTML entity together with the character it stands for. */
typedef struct _html_code
{
  short len;         /* number of characters in code */
  const char *code;  /* entity text, e.g. "&amp;" */
  const char val;    /* character the entity is replaced with */
} html_code;
136
/*
 * Named entities that convertHTMLcodes() turns back into characters.
 * The '<' and '>' entries exist only when HANDLE_GT_LT is defined.
 */
static html_code html_codes[] = {
#ifdef HANDLE_GT_LT
    { 4,"&lt;",'<' },
    { 4,"&gt;",'>' },
#endif
    { 5,"&amp;",'&' },
    { 6,"&quot;",'"' }
};
145
146/*
147 * This routine converts HTML escape sequence
148 * back to the original ASCII representation.
149 * - returns the number of characters dropped.
150 */
151int convertHTMLcodes(char *s, int len)
152{
153  if (len <=0 || s==(char*)NULL || *s=='\0')
154    return 0;
155
156  if (s[1] == '#')
157    {
158      int val, o;
159
160      if (sscanf(s,"&#%d;",&val) == 1)
161      {
162        o = 3;
163        while (s[o] != ';')
164        {
165          o++;
166          if (o > 5)
167            break;
168        }
169        if (o < 5)
170          strcpy(s+1, s+1+o);
171        *s = val;
172        return o;
173      }
174    }
175  else
176    {
177      int
178        i,
179        codes = sizeof(html_codes) / sizeof(html_code);
180
181      for (i=0; i < codes; i++)
182      {
183        if (html_codes[i].len <= len)
184          if (STRNICMP(s, html_codes[i].code, html_codes[i].len) == 0)
185            {
186              strcpy(s+1, s+html_codes[i].len);
187              *s = html_codes[i].val;
188              return html_codes[i].len-1;
189            }
190      }
191    }
192
193  return 0;
194}
195
196int formatIPTC(FILE *ifile, FILE *ofile)
197{
198  unsigned int
199    foundiptc,
200    tagsfound;
201
202  unsigned char
203    recnum,
204    dataset;
205
206  char
207    *readable,
208    *str;
209
210  long
211    tagindx,
212    taglen;
213
214  int
215    i,
216    tagcount = sizeof(tags) / sizeof(tag_spec);
217
218  char
219    c;
220
221  foundiptc = 0; /* found the IPTC-Header */
222  tagsfound = 0; /* number of tags found */
223
224  c = getc(ifile);
225  while (c != EOF)
226  {
227	  if (c == 0x1c)
228	    foundiptc = 1;
229	  else
230      {
231        if (foundiptc)
232	        return -1;
233        else
234	        continue;
235	    }
236
237    /* we found the 0x1c tag and now grab the dataset and record number tags */
238    dataset = getc(ifile);
239	  if ((char) dataset == EOF)
240	    return -1;
241    recnum = getc(ifile);
242	  if ((char) recnum == EOF)
243	    return -1;
244    /* try to match this record to one of the ones in our named table */
245    for (i=0; i< tagcount; i++)
246    {
247      if (tags[i].id == recnum)
248          break;
249    }
250    if (i < tagcount)
251      readable = tags[i].name;
252    else
253      readable = "";
254
255    /* then we decode the length of the block that follows - long or short fmt */
256    c = getc(ifile);
257	  if (c == EOF)
258	    return 0;
259	  if (c & (unsigned char) 0x80)
260      {
261        unsigned char
262          buffer[4];
263
264        for (i=0; i<4; i++)
265        {
266          c = buffer[i] = getc(ifile);
267          if (c == EOF)
268            return -1;
269        }
270        taglen = (((long) buffer[ 0 ]) << 24) |
271                 (((long) buffer[ 1 ]) << 16) |
272	               (((long) buffer[ 2 ]) <<  8) |
273                 (((long) buffer[ 3 ]));
274	    }
275    else
276      {
277        unsigned char
278          x = c;
279
280        taglen = ((long) x) << 8;
281        x = getc(ifile);
282        if ((char)x == EOF)
283          return -1;
284        taglen |= (long) x;
285	    }
286    /* make a buffer to hold the tag data and snag it from the input stream */
287    str = (char *) malloc((unsigned int) (taglen+1));
288    if (str == (char *) NULL)
289      {
290        printf("Memory allocation failed");
291        return 0;
292      }
293    for (tagindx=0; tagindx<taglen; tagindx++)
294    {
295      c = str[tagindx] = getc(ifile);
296      if (c == EOF)
297        return -1;
298    }
299    str[ taglen ] = 0;
300
301    /* now finish up by formatting this binary data into ASCII equivalent */
302    if (strlen(readable) > 0)
303	    fprintf(ofile, "%d#%d#%s=",(unsigned int)dataset, (unsigned int) recnum, readable);
304    else
305	    fprintf(ofile, "%d#%d=",(unsigned int)dataset, (unsigned int) recnum);
306    formatString( ofile, str, taglen );
307    free(str);
308
309	  tagsfound++;
310
311    c = getc(ifile);
312  }
313  return tagsfound;
314}
315
/*
 * Forward declaration: main() calls tokenizer() before its definition,
 * which appears further down in this file.
 */
int tokenizer(unsigned inflag,char *token,int tokmax,char *line,
char *white,char *brkchar,char *quote,char eschar,char *brkused,
int *next,char *quoted);
319
/*
 * Read one line of any length from FILE into a heap buffer.
 *
 *  b    - buffer obtained from malloc()/realloc(); may be NULL, in which
 *         case a buffer is allocated on demand.
 *  blen - in: number of bytes available in B; out: length of the line
 *         plus one for the terminator, or 0 when NULL is returned.
 *  file - stream to read from.
 *
 * Characters are stored up to, but not including, the next newline or the
 * end of the stream, and the result is NUL-terminated.  The buffer is
 * grown with realloc() as needed, so callers must continue with the
 * returned pointer and never with the one they passed in.
 *
 * Returns the buffer holding the line.  Returns NULL when no character
 * was stored (an empty line or the end of the stream; B is left
 * allocated in that case) and when the buffer cannot be grown (the
 * buffer has then been released).
 */
char *super_fgets(char *b, int *blen, FILE *file)
{
  int
    c,
    len,
    pos;   /* number of characters stored so far */

  char
    *grown;

  len = (b != (char *) NULL) ? *blen : 0;
  pos = 0;
  for ( ; ; )
  {
    c=fgetc(file);
    if (c == EOF || c == '\n')
      break;
    if (pos + 1 >= len)
      {
        /* double the size; a size of zero or less starts from 2 */
        len = (len > 0) ? len << 1 : 2;
        grown=(char *) realloc(b, (size_t) len + 2);
        if (grown == (char *) NULL)
          {
            /* realloc() left the old block allocated: release it */
            free(b);
            b=(char *) NULL;
            break;
          }
        b=grown;
      }
    b[pos++]=(char) c;
  }
  *blen=0;
  if (b != (char *) NULL)
    {
      if (pos == 0)
        return (char *) NULL;
      b[pos] = '\0';
      *blen=pos + 1;
    }
  return b;
}
363
364#define BUFFER_SZ 4096
365
366int main(int argc, char *argv[])
367{
368  unsigned int
369    length;
370
371  unsigned char
372    *buffer;
373
374  int
375    i,
376    mode; /* iptc binary, or iptc text */
377
378  FILE
379    *ifile = stdin,
380    *ofile = stdout;
381
382  char
383    c,
384    *usage = "usage: iptcutil -t | -b [-i file] [-o file] <input >output";
385
386  if( argc < 2 )
387    {
388      printf("%s\n", usage);
389	    return 1;
390    }
391
392  mode = 0;
393  length = -1;
394  buffer = (unsigned char *)NULL;
395
396  for (i=1; i<argc; i++)
397  {
398    c = argv[i][0];
399    if (c == '-' || c == '/')
400      {
401        c = argv[i][1];
402        switch( c )
403        {
404        case 't':
405	        mode = 1;
406#ifdef WIN32
407          /* Set "stdout" to binary mode: */
408          _setmode( _fileno( ofile ), _O_BINARY );
409#endif
410	        break;
411        case 'b':
412	        mode = 0;
413#ifdef WIN32
414          /* Set "stdin" to binary mode: */
415          _setmode( _fileno( ifile ), _O_BINARY );
416#endif
417	        break;
418        case 'i':
419          if (mode == 0)
420            ifile = fopen(argv[++i], "rb");
421          else
422            ifile = fopen(argv[++i], "rt");
423          if (ifile == (FILE *)NULL)
424            {
425	            printf("Unable to open: %s\n", argv[i]);
426              return 1;
427            }
428	        break;
429        case 'o':
430          if (mode == 0)
431            ofile = fopen(argv[++i], "wt");
432          else
433            ofile = fopen(argv[++i], "wb");
434          if (ofile == (FILE *)NULL)
435            {
436	            printf("Unable to open: %s\n", argv[i]);
437              return 1;
438            }
439	        break;
440        default:
441	        printf("Unknown option: %s\n", argv[i]);
442	        return 1;
443        }
444      }
445    else
446      {
447        printf("%s\n", usage);
448	      return 1;
449      }
450  }
451
452  if (mode == 0) /* handle binary iptc info */
453    formatIPTC(ifile, ofile);
454
455  if (mode == 1) /* handle text form of iptc info */
456    {
457      char
458        brkused,
459        quoted,
460        *line,
461        *token,
462        *newstr;
463
464      int
465        state,
466        next;
467
468      unsigned char
469        recnum = 0,
470        dataset = 0;
471
472      int
473        inputlen = BUFFER_SZ;
474
475      line = (char *) malloc(inputlen);
476      token = (char *)NULL;
477      while((line = super_fgets(line,&inputlen,ifile))!=NULL)
478      {
479        state=0;
480        next=0;
481
482        token = (char *) malloc(inputlen);
483        newstr = (char *) malloc(inputlen);
484        while(tokenizer(0, token, inputlen, line, "", "=", "\"", 0,
485          &brkused,&next,&quoted)==0)
486        {
487          if (state == 0)
488            {
489              int
490                state,
491                next;
492
493              char
494                brkused,
495                quoted;
496
497              state=0;
498              next=0;
499              while(tokenizer(0, newstr, inputlen, token, "", "#", "", 0,
500                &brkused, &next, &quoted)==0)
501              {
502                if (state == 0)
503                  dataset = (unsigned char) atoi(newstr);
504                else
505                   if (state == 1)
506                     recnum = (unsigned char) atoi(newstr);
507                state++;
508              }
509            }
510          else
511            if (state == 1)
512              {
513                int
514                  next;
515
516                unsigned long
517                  len;
518
519                char
520                  brkused,
521                  quoted;
522
523                next=0;
524                len = strlen(token);
525                while(tokenizer(0, newstr, inputlen, token, "", "&", "", 0,
526                  &brkused, &next, &quoted)==0)
527                {
528                  if (brkused && next > 0)
529                    {
530                      char
531                        *s = &token[next-1];
532
533                      len -= convertHTMLcodes(s, strlen(s));
534                    }
535                }
536
537                fputc(0x1c, ofile);
538                fputc(dataset, ofile);
539                fputc(recnum, ofile);
540                if (len < 0x10000)
541                  {
542                    fputc((len >> 8) & 255, ofile);
543                    fputc(len & 255, ofile);
544                  }
545                else
546                  {
547                    fputc(((len >> 24) & 255) | 0x80, ofile);
548                    fputc((len >> 16) & 255, ofile);
549                    fputc((len >> 8) & 255, ofile);
550                    fputc(len & 255, ofile);
551                  }
552                next=0;
553                while (len--)
554                  fputc(token[next++], ofile);
555              }
556          state++;
557        }
558        free(token);
559        token = (char *)NULL;
560        free(newstr);
561        newstr = (char *)NULL;
562      }
563      free(line);
564
565      fclose( ifile );
566      fclose( ofile );
567    }
568
569  return 0;
570}
571
572/*
573	This routine is a generalized, finite state token parser. It allows
    you to extract tokens one at a time from a string of characters.  The
575    characters used for white space, for break characters, and for quotes
576    can be specified. Also, characters in the string can be preceded by
577    a specifiable escape character which removes any special meaning the
578    character may have.
579
580	There are a lot of formal parameters in this subroutine call, but
581	once you get familiar with them, this routine is fairly easy to use.
582	"#define" macros can be used to generate simpler looking calls for
583	commonly used applications of this routine.
584
585	First, some terminology:
586
587	token:		used here, a single unit of information in
588				the form of a group of characters.
589
590	white space:	space that gets ignored (except within quotes
591				or when escaped), like blanks and tabs.  in
592				addition, white space terminates a non-quoted
593				token.
594
595	break character: a character that separates non-quoted tokens.
596				commas are a common break character.  the
597				usage of break characters to signal the end
598				of a token is the same as that of white space,
599				except multiple break characters with nothing
600				or only white space between generate a null
601				token for each two break characters together.
602
603				for example, if blank is set to be the white
604				space and comma is set to be the break
605				character, the line ...
606
607				A, B, C ,  , DEF
608
609				... consists of 5 tokens:
610
611				1)	"A"
612				2)	"B"
613				3)	"C"
614				4)	""      (the null string)
615				5)	"DEF"
616
617	quote character: 	a character that, when surrounding a group
618				of other characters, causes the group of
619				characters to be treated as a single token,
620				no matter how many white spaces or break
621				characters exist in the group.	also, a
622				token always terminates after the closing
623				quote.	for example, if ' is the quote
624				character, blank is white space, and comma
625				is the break character, the following
626				string ...
627
628				A, ' B, CD'EF GHI
629
630				... consists of 4 tokens:
631
632				1)	"A"
633				2)	" B, CD" (note the blanks & comma)
634				3)	"EF"
635				4)	"GHI"
636
637				the quote characters themselves do
638				not appear in the resultant tokens.  the
639				double quotes are delimiters i use here for
640				documentation purposes only.
641
642	escape character:	a character which itself is ignored but
643				which causes the next character to be
644				used as is.  ^ and \ are often used as
645				escape characters.  an escape in the last
646				position of the string gets treated as a
647				"normal" (i.e., non-quote, non-white,
648				non-break, and non-escape) character.
649				for example, assume white space, break
650				character, and quote are the same as in the
651				above examples, and further, assume that
652				^ is the escape character.  then, in the
653				string ...
654
655				ABC, ' DEF ^' GH' I ^ J K^ L ^
656
657				... there are 7 tokens:
658
659				1)	"ABC"
660				2)	" DEF ' GH"
661				3)	"I"
662				4)	" "     (a lone blank)
663				5)	"J"
664				6)	"K L"
665				7)	"^"     (passed as is at end of line)
666
667
668	OK, now that you have this background, here's how to call "tokenizer":
669
670	result=tokenizer(flag,token,maxtok,string,white,break,quote,escape,
671		      brkused,next,quoted)
672
673	result: 	0 if we haven't reached EOS (end of string), and
674			1 if we have (this is an "int").
675
	flag:		right now, only the low order 2 bits are used.
			1 => convert non-quoted tokens to upper case
			2 => convert non-quoted tokens to lower case
			0 => do not convert non-quoted tokens
			(this is an "unsigned").
681
682	token:		a character string containing the returned next token
683			(this is a "char[]").
684
685	maxtok: 	the maximum size of "token".  characters beyond
686			"maxtok" are truncated (this is an "int").
687
688	string: 	the string to be parsed (this is a "char[]").
689
690	white:		a string of the valid white spaces.  example:
691
692			char whitesp[]={" \t"};
693
694			blank and tab will be valid white space (this is
695			a "char[]").
696
697	break:		a string of the valid break characters.  example:
698
699			char breakch[]={";,"};
700
701			semicolon and comma will be valid break characters
702			(this is a "char[]").
703
704			IMPORTANT:  do not use the name "break" as a C
705			variable, as this is a reserved word in C.
706
707	quote:		a string of the valid quote characters.  an example
708			would be
709
			char quotech[]={"'\""};
711
712			(this causes single and double quotes to be valid)
713			note that a token starting with one of these characters
714			needs the same quote character to terminate it.
715
716			for example,
717
718			"ABC '
719
720			is unterminated, but
721
722			"DEF" and 'GHI'
723
724			are properly terminated.  note that different quote
725			characters can appear on the same line; only for
726			a given token do the quote characters have to be
727			the same (this is a "char[]").
728
729	escape: 	the escape character (NOT a string ... only one
730			allowed).  use zero if none is desired (this is
731			a "char").
732
733	brkused:	the break character used to terminate the current
734			token.	if the token was quoted, this will be the
735			quote used.  if the token is the last one on the
736			line, this will be zero (this is a pointer to a
737			"char").
738
739	next:		this variable points to the first character of the
740			next token.  it gets reset by "tokenizer" as it steps
741			through the string.  set it to 0 upon initialization,
742			and leave it alone after that.	you can change it
743			if you want to jump around in the string or re-parse
744			from the beginning, but be careful (this is a
745			pointer to an "int").
746
747	quoted: 	set to 1 (true) if the token was quoted and 0 (false)
748			if not.  you may need this information (for example:
749			in C, a string with quotes around it is a character
750			string, while one without is an identifier).
751
752			(this is a pointer to a "char").
753*/
754
/*
 * states of the tokenizer() state machine
 *
 * IN_WHITE is the state every tokenizer() call starts in, IN_TOKEN is
 * entered on the first ordinary character, IN_QUOTE on an opening quote,
 * and IN_OZONE once a closing quote or white space has ended the token.
 */

#define IN_WHITE 0
#define IN_TOKEN 1
#define IN_QUOTE 2
#define IN_OZONE 3

/*
 * Parser state shared between tokenizer() and chstore().  tokenizer()
 * re-initialises all four at the start of every call that does not return
 * at once for an exhausted line.
 */
int _p_state;	   /* current state	 */
unsigned _p_flag;  /* option flag	 */
char _p_curquote;  /* current quote char */
int _p_tokpos;	   /* current token pos  */
766
/*
 * routine to find character in string ... used only by "tokenizer"
 *
 * Returns the zero-based position of the first occurrence of ch in
 * string, or -1 when the terminator is reached without a match (the
 * terminator itself is never matched).
 */

int sindex(char ch,char *string)
{
  int pos = 0;

  while (string[pos] != '\0')
  {
    if (string[pos] == ch)
      return pos;	/* position of the character */
    pos++;
  }
  return -1;		/* end of string ... no match found */
}
777
778/* routine to store a character in a string ... used only by "tokenizer" */
779
780void chstore(char *string,int max,char ch)
781{
782  char c;
783  if(_p_tokpos>=0&&_p_tokpos<max-1)
784  {
785    if(_p_state==IN_QUOTE)
786      c=ch;
787    else
788      switch(_p_flag&3)
789      {
790	    case 1: 	    /* convert to upper */
791	      c=toupper(ch);
792	      break;
793
794	    case 2: 	    /* convert to lower */
795	      c=tolower(ch);
796	      break;
797
798	    default:	    /* use as is */
799	      c=ch;
800	      break;
801      }
802    string[_p_tokpos++]=c;
803  }
804  return;
805}
806
/*
 * Extract the next token of line, starting at index *next, into token.
 *
 * Returns 1 without touching token when line[*next] is already the
 * terminator; otherwise returns 0 with token NUL-terminated and *next
 * advanced.  *brkused receives the break or quote character that ended
 * the token, or 0 when none did; *quoted is set to 1 when a quoted
 * section was entered.  Characters are stored through chstore(), which
 * keeps the token within tokmax bytes.
 */
int tokenizer(unsigned inflag,char *token,int tokmax,char *line,
  char *white,char *brkchar,char *quote,char eschar,char *brkused,
    int *next,char *quoted)
{
  int qp;
  char c,nc;

  *brkused=0;		/* initialize to null */
  *quoted=0;		/* assume not quoted  */

  if(!line[*next])	/* if we're at end of line, indicate such */
    return 1;

  _p_state=IN_WHITE;   /* initialize state */
  _p_curquote=0;	   /* initialize previous quote char */
  _p_flag=inflag;	   /* set option flag */

  /* each character is classified as break, quote, white, escape or plain - in that order */
  for(_p_tokpos=0;(c=line[*next]);++(*next))	/* main loop */
  {
    if((qp=sindex(c,brkchar))>=0)  /* break */
    {
      switch(_p_state)
      {
	    case IN_WHITE:		/* these are the same here ...	*/
	    case IN_TOKEN:		/* ... just get out		*/
	    case IN_OZONE:		/* ditto			*/
	      ++(*next);		/* consume the break character itself */
	      *brkused=brkchar[qp];
	      goto byebye;

	    case IN_QUOTE:		 /* just keep going */
	      chstore(token,tokmax,c);
	      break;
      }
    }
    else if((qp=sindex(c,quote))>=0)  /* quote */
    {
      switch(_p_state)
      {
	    case IN_WHITE:	 /* opening quote */
	      _p_state=IN_QUOTE; /* change states   */
	      _p_curquote=quote[qp]; /* save quote char */
	      *quoted=1;	/* set to true as long as something is in quotes */
	      break;

	    case IN_QUOTE:
	      if(quote[qp]==_p_curquote) /* same as the beginning quote? */
	      {
	        _p_state=IN_OZONE;
	        _p_curquote=0;
	      }
	      else
	        chstore(token,tokmax,c); /* treat as regular char */
	      break;

	    case IN_TOKEN:
	    case IN_OZONE:
	      /* *next is not advanced: the quote is seen again by the next call */
	      *brkused=c; /* uses quote as break char */
	      goto byebye;
      }
    }
    else if((qp=sindex(c,white))>=0) /* white */
    {
      switch(_p_state)
      {
	    case IN_WHITE:
	    case IN_OZONE:
	      break;		/* keep going */

	    case IN_TOKEN:
	      _p_state=IN_OZONE;
	      break;

	    case IN_QUOTE:
	      chstore(token,tokmax,c); /* it's valid here */
	      break;
      }
    }
    else if(c==eschar)  /* escape */
    {
      nc=line[(*next)+1];
      if(nc==0) 		/* end of line */
      {
	    *brkused=0;
	    chstore(token,tokmax,c);	/* a trailing escape is stored as is */
	    ++(*next);
	    goto byebye;
      }
      switch(_p_state)
      {
	    case IN_WHITE:
	      /* step back so the loop increment revisits the escape in IN_TOKEN */
	      --(*next);
	      _p_state=IN_TOKEN;
	      break;

	    case IN_TOKEN:
	    case IN_QUOTE:
	      ++(*next);		/* skip the escape and store what follows it */
	      chstore(token,tokmax,nc);
	      break;

	    case IN_OZONE:
	      goto byebye;
      }
    }
    else	/* anything else is just a real character */
    {
      switch(_p_state)
      {
	    case IN_WHITE:
	      _p_state=IN_TOKEN; /* switch states */
	      /* intentional fall through: the character is stored below */

	    case IN_TOKEN:		 /* these 2 are     */
	    case IN_QUOTE:		 /*  identical here */
	      chstore(token,tokmax,c);
	      break;

	    case IN_OZONE:
	      goto byebye;	/* token already ended; leave c for the next call */
      }
    }
  }		/* end of main loop */

byebye:
  token[_p_tokpos]=0;	/* make sure token ends with EOS */

  return 0;
}
935/*
936 * Local Variables:
937 * mode: c
938 * c-basic-offset: 8
939 * fill-column: 78
940 * End:
941 */
942