1/* $Id: iptcutil.c 276 2010-06-30 12:18:30Z nijtmans $ */ 2 3#include "tif_config.h" 4 5#include <stdio.h> 6#include <stdlib.h> 7#include <string.h> 8#include <memory.h> 9#include <ctype.h> 10 11#ifdef HAVE_STRINGS_H 12# include <strings.h> 13#endif 14 15#ifdef HAVE_IO_H 16# include <io.h> 17#endif 18 19#ifdef HAVE_FCNTL_H 20# include <fcntl.h> 21#endif 22 23#ifdef WIN32 24#define STRNICMP strnicmp 25#else 26#define STRNICMP strncasecmp 27#endif 28 29typedef struct _tag_spec 30{ 31 short 32 id; 33 34 char 35 *name; 36} tag_spec; 37 38static tag_spec tags[] = { 39 { 5,"Image Name" }, 40 { 7,"Edit Status" }, 41 { 10,"Priority" }, 42 { 15,"Category" }, 43 { 20,"Supplemental Category" }, 44 { 22,"Fixture Identifier" }, 45 { 25,"Keyword" }, 46 { 30,"Release Date" }, 47 { 35,"Release Time" }, 48 { 40,"Special Instructions" }, 49 { 45,"Reference Service" }, 50 { 47,"Reference Date" }, 51 { 50,"Reference Number" }, 52 { 55,"Created Date" }, 53 { 60,"Created Time" }, 54 { 65,"Originating Program" }, 55 { 70,"Program Version" }, 56 { 75,"Object Cycle" }, 57 { 80,"Byline" }, 58 { 85,"Byline Title" }, 59 { 90,"City" }, 60 { 95,"Province State" }, 61 { 100,"Country Code" }, 62 { 101,"Country" }, 63 { 103,"Original Transmission Reference" }, 64 { 105,"Headline" }, 65 { 110,"Credit" }, 66 { 115,"Source" }, 67 { 116,"Copyright String" }, 68 { 120,"Caption" }, 69 { 121,"Local Caption" }, 70 { 122,"Caption Writer" }, 71 { 200,"Custom Field 1" }, 72 { 201,"Custom Field 2" }, 73 { 202,"Custom Field 3" }, 74 { 203,"Custom Field 4" }, 75 { 204,"Custom Field 5" }, 76 { 205,"Custom Field 6" }, 77 { 206,"Custom Field 7" }, 78 { 207,"Custom Field 8" }, 79 { 208,"Custom Field 9" }, 80 { 209,"Custom Field 10" }, 81 { 210,"Custom Field 11" }, 82 { 211,"Custom Field 12" }, 83 { 212,"Custom Field 13" }, 84 { 213,"Custom Field 14" }, 85 { 214,"Custom Field 15" }, 86 { 215,"Custom Field 16" }, 87 { 216,"Custom Field 17" }, 88 { 217,"Custom Field 18" }, 89 { 218,"Custom Field 19" }, 90 { 219,"Custom Field 20" } 91}; 92 93/* 94 * We format the output using HTML conventions 95 * to preserve control characters and such. 96 */ 97void formatString(FILE *ofile, const char *s, int len) 98{ 99 putc('"', ofile); 100 for (; len > 0; --len, ++s) { 101 int c = *s; 102 switch (c) { 103 case '&': 104 fputs("&", ofile); 105 break; 106#ifdef HANDLE_GT_LT 107 case '<': 108 fputs("<", ofile); 109 break; 110 case '>': 111 fputs(">", ofile); 112 break; 113#endif 114 case '"': 115 fputs(""", ofile); 116 break; 117 default: 118 if (iscntrl(c)) 119 fprintf(ofile, "&#%d;", c); 120 else 121 putc(*s, ofile); 122 break; 123 } 124 } 125 fputs("\"\n", ofile); 126} 127 128typedef struct _html_code 129{ 130 short 131 len; 132 const char 133 *code, 134 val; 135} html_code; 136 137static html_code html_codes[] = { 138#ifdef HANDLE_GT_LT 139 { 4,"<",'<' }, 140 { 4,">",'>' }, 141#endif 142 { 5,"&",'&' }, 143 { 6,""",'"' } 144}; 145 146/* 147 * This routine converts HTML escape sequence 148 * back to the original ASCII representation. 149 * - returns the number of characters dropped. 150 */ 151int convertHTMLcodes(char *s, int len) 152{ 153 if (len <=0 || s==(char*)NULL || *s=='\0') 154 return 0; 155 156 if (s[1] == '#') 157 { 158 int val, o; 159 160 if (sscanf(s,"&#%d;",&val) == 1) 161 { 162 o = 3; 163 while (s[o] != ';') 164 { 165 o++; 166 if (o > 5) 167 break; 168 } 169 if (o < 5) 170 strcpy(s+1, s+1+o); 171 *s = val; 172 return o; 173 } 174 } 175 else 176 { 177 int 178 i, 179 codes = sizeof(html_codes) / sizeof(html_code); 180 181 for (i=0; i < codes; i++) 182 { 183 if (html_codes[i].len <= len) 184 if (STRNICMP(s, html_codes[i].code, html_codes[i].len) == 0) 185 { 186 strcpy(s+1, s+html_codes[i].len); 187 *s = html_codes[i].val; 188 return html_codes[i].len-1; 189 } 190 } 191 } 192 193 return 0; 194} 195 196int formatIPTC(FILE *ifile, FILE *ofile) 197{ 198 unsigned int 199 foundiptc, 200 tagsfound; 201 202 unsigned char 203 recnum, 204 dataset; 205 206 char 207 *readable, 208 *str; 209 210 long 211 tagindx, 212 taglen; 213 214 int 215 i, 216 tagcount = sizeof(tags) / sizeof(tag_spec); 217 218 char 219 c; 220 221 foundiptc = 0; /* found the IPTC-Header */ 222 tagsfound = 0; /* number of tags found */ 223 224 c = getc(ifile); 225 while (c != EOF) 226 { 227 if (c == 0x1c) 228 foundiptc = 1; 229 else 230 { 231 if (foundiptc) 232 return -1; 233 else 234 continue; 235 } 236 237 /* we found the 0x1c tag and now grab the dataset and record number tags */ 238 dataset = getc(ifile); 239 if ((char) dataset == EOF) 240 return -1; 241 recnum = getc(ifile); 242 if ((char) recnum == EOF) 243 return -1; 244 /* try to match this record to one of the ones in our named table */ 245 for (i=0; i< tagcount; i++) 246 { 247 if (tags[i].id == recnum) 248 break; 249 } 250 if (i < tagcount) 251 readable = tags[i].name; 252 else 253 readable = ""; 254 255 /* then we decode the length of the block that follows - long or short fmt */ 256 c = getc(ifile); 257 if (c == EOF) 258 return 0; 259 if (c & (unsigned char) 0x80) 260 { 261 unsigned char 262 buffer[4]; 263 264 for (i=0; i<4; i++) 265 { 266 c = buffer[i] = getc(ifile); 267 if (c == EOF) 268 return -1; 269 } 270 taglen = (((long) buffer[ 0 ]) << 24) | 271 (((long) buffer[ 1 ]) << 16) | 272 (((long) buffer[ 2 ]) << 8) | 273 (((long) buffer[ 3 ])); 274 } 275 else 276 { 277 unsigned char 278 x = c; 279 280 taglen = ((long) x) << 8; 281 x = getc(ifile); 282 if ((char)x == EOF) 283 return -1; 284 taglen |= (long) x; 285 } 286 /* make a buffer to hold the tag data and snag it from the input stream */ 287 str = (char *) malloc((unsigned int) (taglen+1)); 288 if (str == (char *) NULL) 289 { 290 printf("Memory allocation failed"); 291 return 0; 292 } 293 for (tagindx=0; tagindx<taglen; tagindx++) 294 { 295 c = str[tagindx] = getc(ifile); 296 if (c == EOF) 297 return -1; 298 } 299 str[ taglen ] = 0; 300 301 /* now finish up by formatting this binary data into ASCII equivalent */ 302 if (strlen(readable) > 0) 303 fprintf(ofile, "%d#%d#%s=",(unsigned int)dataset, (unsigned int) recnum, readable); 304 else 305 fprintf(ofile, "%d#%d=",(unsigned int)dataset, (unsigned int) recnum); 306 formatString( ofile, str, taglen ); 307 free(str); 308 309 tagsfound++; 310 311 c = getc(ifile); 312 } 313 return tagsfound; 314} 315 316int tokenizer(unsigned inflag,char *token,int tokmax,char *line, 317char *white,char *brkchar,char *quote,char eschar,char *brkused, 318int *next,char *quoted); 319 320char *super_fgets(char *b, int *blen, FILE *file) 321{ 322 int 323 c, 324 len; 325 326 char 327 *q; 328 329 len=*blen; 330 for (q=b; ; q++) 331 { 332 c=fgetc(file); 333 if (c == EOF || c == '\n') 334 break; 335 if (((int)q - (int)b + 1 ) >= (int) len) 336 { 337 int 338 tlen; 339 340 tlen=(int)q-(int)b; 341 len<<=1; 342 b=(char *) realloc((char *) b,(len+2)); 343 if ((char *) b == (char *) NULL) 344 break; 345 q=b+tlen; 346 } 347 *q=(unsigned char) c; 348 } 349 *blen=0; 350 if ((unsigned char *)b != (unsigned char *) NULL) 351 { 352 int 353 tlen; 354 355 tlen=(int)q - (int)b; 356 if (tlen == 0) 357 return (char *) NULL; 358 b[tlen] = '\0'; 359 *blen=++tlen; 360 } 361 return b; 362} 363 364#define BUFFER_SZ 4096 365 366int main(int argc, char *argv[]) 367{ 368 unsigned int 369 length; 370 371 unsigned char 372 *buffer; 373 374 int 375 i, 376 mode; /* iptc binary, or iptc text */ 377 378 FILE 379 *ifile = stdin, 380 *ofile = stdout; 381 382 char 383 c, 384 *usage = "usage: iptcutil -t | -b [-i file] [-o file] <input >output"; 385 386 if( argc < 2 ) 387 { 388 printf("%s\n", usage); 389 return 1; 390 } 391 392 mode = 0; 393 length = -1; 394 buffer = (unsigned char *)NULL; 395 396 for (i=1; i<argc; i++) 397 { 398 c = argv[i][0]; 399 if (c == '-' || c == '/') 400 { 401 c = argv[i][1]; 402 switch( c ) 403 { 404 case 't': 405 mode = 1; 406#ifdef WIN32 407 /* Set "stdout" to binary mode: */ 408 _setmode( _fileno( ofile ), _O_BINARY ); 409#endif 410 break; 411 case 'b': 412 mode = 0; 413#ifdef WIN32 414 /* Set "stdin" to binary mode: */ 415 _setmode( _fileno( ifile ), _O_BINARY ); 416#endif 417 break; 418 case 'i': 419 if (mode == 0) 420 ifile = fopen(argv[++i], "rb"); 421 else 422 ifile = fopen(argv[++i], "rt"); 423 if (ifile == (FILE *)NULL) 424 { 425 printf("Unable to open: %s\n", argv[i]); 426 return 1; 427 } 428 break; 429 case 'o': 430 if (mode == 0) 431 ofile = fopen(argv[++i], "wt"); 432 else 433 ofile = fopen(argv[++i], "wb"); 434 if (ofile == (FILE *)NULL) 435 { 436 printf("Unable to open: %s\n", argv[i]); 437 return 1; 438 } 439 break; 440 default: 441 printf("Unknown option: %s\n", argv[i]); 442 return 1; 443 } 444 } 445 else 446 { 447 printf("%s\n", usage); 448 return 1; 449 } 450 } 451 452 if (mode == 0) /* handle binary iptc info */ 453 formatIPTC(ifile, ofile); 454 455 if (mode == 1) /* handle text form of iptc info */ 456 { 457 char 458 brkused, 459 quoted, 460 *line, 461 *token, 462 *newstr; 463 464 int 465 state, 466 next; 467 468 unsigned char 469 recnum = 0, 470 dataset = 0; 471 472 int 473 inputlen = BUFFER_SZ; 474 475 line = (char *) malloc(inputlen); 476 token = (char *)NULL; 477 while((line = super_fgets(line,&inputlen,ifile))!=NULL) 478 { 479 state=0; 480 next=0; 481 482 token = (char *) malloc(inputlen); 483 newstr = (char *) malloc(inputlen); 484 while(tokenizer(0, token, inputlen, line, "", "=", "\"", 0, 485 &brkused,&next,"ed)==0) 486 { 487 if (state == 0) 488 { 489 int 490 state, 491 next; 492 493 char 494 brkused, 495 quoted; 496 497 state=0; 498 next=0; 499 while(tokenizer(0, newstr, inputlen, token, "", "#", "", 0, 500 &brkused, &next, "ed)==0) 501 { 502 if (state == 0) 503 dataset = (unsigned char) atoi(newstr); 504 else 505 if (state == 1) 506 recnum = (unsigned char) atoi(newstr); 507 state++; 508 } 509 } 510 else 511 if (state == 1) 512 { 513 int 514 next; 515 516 unsigned long 517 len; 518 519 char 520 brkused, 521 quoted; 522 523 next=0; 524 len = strlen(token); 525 while(tokenizer(0, newstr, inputlen, token, "", "&", "", 0, 526 &brkused, &next, "ed)==0) 527 { 528 if (brkused && next > 0) 529 { 530 char 531 *s = &token[next-1]; 532 533 len -= convertHTMLcodes(s, strlen(s)); 534 } 535 } 536 537 fputc(0x1c, ofile); 538 fputc(dataset, ofile); 539 fputc(recnum, ofile); 540 if (len < 0x10000) 541 { 542 fputc((len >> 8) & 255, ofile); 543 fputc(len & 255, ofile); 544 } 545 else 546 { 547 fputc(((len >> 24) & 255) | 0x80, ofile); 548 fputc((len >> 16) & 255, ofile); 549 fputc((len >> 8) & 255, ofile); 550 fputc(len & 255, ofile); 551 } 552 next=0; 553 while (len--) 554 fputc(token[next++], ofile); 555 } 556 state++; 557 } 558 free(token); 559 token = (char *)NULL; 560 free(newstr); 561 newstr = (char *)NULL; 562 } 563 free(line); 564 565 fclose( ifile ); 566 fclose( ofile ); 567 } 568 569 return 0; 570} 571 572/* 573 This routine is a generalized, finite state token parser. It allows 574 you extract tokens one at a time from a string of characters. The 575 characters used for white space, for break characters, and for quotes 576 can be specified. Also, characters in the string can be preceded by 577 a specifiable escape character which removes any special meaning the 578 character may have. 579 580 There are a lot of formal parameters in this subroutine call, but 581 once you get familiar with them, this routine is fairly easy to use. 582 "#define" macros can be used to generate simpler looking calls for 583 commonly used applications of this routine. 584 585 First, some terminology: 586 587 token: used here, a single unit of information in 588 the form of a group of characters. 589 590 white space: space that gets ignored (except within quotes 591 or when escaped), like blanks and tabs. in 592 addition, white space terminates a non-quoted 593 token. 594 595 break character: a character that separates non-quoted tokens. 596 commas are a common break character. the 597 usage of break characters to signal the end 598 of a token is the same as that of white space, 599 except multiple break characters with nothing 600 or only white space between generate a null 601 token for each two break characters together. 602 603 for example, if blank is set to be the white 604 space and comma is set to be the break 605 character, the line ... 606 607 A, B, C , , DEF 608 609 ... consists of 5 tokens: 610 611 1) "A" 612 2) "B" 613 3) "C" 614 4) "" (the null string) 615 5) "DEF" 616 617 quote character: a character that, when surrounding a group 618 of other characters, causes the group of 619 characters to be treated as a single token, 620 no matter how many white spaces or break 621 characters exist in the group. also, a 622 token always terminates after the closing 623 quote. for example, if ' is the quote 624 character, blank is white space, and comma 625 is the break character, the following 626 string ... 627 628 A, ' B, CD'EF GHI 629 630 ... consists of 4 tokens: 631 632 1) "A" 633 2) " B, CD" (note the blanks & comma) 634 3) "EF" 635 4) "GHI" 636 637 the quote characters themselves do 638 not appear in the resultant tokens. the 639 double quotes are delimiters i use here for 640 documentation purposes only. 641 642 escape character: a character which itself is ignored but 643 which causes the next character to be 644 used as is. ^ and \ are often used as 645 escape characters. an escape in the last 646 position of the string gets treated as a 647 "normal" (i.e., non-quote, non-white, 648 non-break, and non-escape) character. 649 for example, assume white space, break 650 character, and quote are the same as in the 651 above examples, and further, assume that 652 ^ is the escape character. then, in the 653 string ... 654 655 ABC, ' DEF ^' GH' I ^ J K^ L ^ 656 657 ... there are 7 tokens: 658 659 1) "ABC" 660 2) " DEF ' GH" 661 3) "I" 662 4) " " (a lone blank) 663 5) "J" 664 6) "K L" 665 7) "^" (passed as is at end of line) 666 667 668 OK, now that you have this background, here's how to call "tokenizer": 669 670 result=tokenizer(flag,token,maxtok,string,white,break,quote,escape, 671 brkused,next,quoted) 672 673 result: 0 if we haven't reached EOS (end of string), and 674 1 if we have (this is an "int"). 675 676 flag: right now, only the low order 3 bits are used. 677 1 => convert non-quoted tokens to upper case 678 2 => convert non-quoted tokens to lower case 679 0 => do not convert non-quoted tokens 680 (this is a "char"). 681 682 token: a character string containing the returned next token 683 (this is a "char[]"). 684 685 maxtok: the maximum size of "token". characters beyond 686 "maxtok" are truncated (this is an "int"). 687 688 string: the string to be parsed (this is a "char[]"). 689 690 white: a string of the valid white spaces. example: 691 692 char whitesp[]={" \t"}; 693 694 blank and tab will be valid white space (this is 695 a "char[]"). 696 697 break: a string of the valid break characters. example: 698 699 char breakch[]={";,"}; 700 701 semicolon and comma will be valid break characters 702 (this is a "char[]"). 703 704 IMPORTANT: do not use the name "break" as a C 705 variable, as this is a reserved word in C. 706 707 quote: a string of the valid quote characters. an example 708 would be 709 710 char whitesp[]={"'\""); 711 712 (this causes single and double quotes to be valid) 713 note that a token starting with one of these characters 714 needs the same quote character to terminate it. 715 716 for example, 717 718 "ABC ' 719 720 is unterminated, but 721 722 "DEF" and 'GHI' 723 724 are properly terminated. note that different quote 725 characters can appear on the same line; only for 726 a given token do the quote characters have to be 727 the same (this is a "char[]"). 728 729 escape: the escape character (NOT a string ... only one 730 allowed). use zero if none is desired (this is 731 a "char"). 732 733 brkused: the break character used to terminate the current 734 token. if the token was quoted, this will be the 735 quote used. if the token is the last one on the 736 line, this will be zero (this is a pointer to a 737 "char"). 738 739 next: this variable points to the first character of the 740 next token. it gets reset by "tokenizer" as it steps 741 through the string. set it to 0 upon initialization, 742 and leave it alone after that. you can change it 743 if you want to jump around in the string or re-parse 744 from the beginning, but be careful (this is a 745 pointer to an "int"). 746 747 quoted: set to 1 (true) if the token was quoted and 0 (false) 748 if not. you may need this information (for example: 749 in C, a string with quotes around it is a character 750 string, while one without is an identifier). 751 752 (this is a pointer to a "char"). 753*/ 754 755/* states */ 756 757#define IN_WHITE 0 758#define IN_TOKEN 1 759#define IN_QUOTE 2 760#define IN_OZONE 3 761 762int _p_state; /* current state */ 763unsigned _p_flag; /* option flag */ 764char _p_curquote; /* current quote char */ 765int _p_tokpos; /* current token pos */ 766 767/* routine to find character in string ... used only by "tokenizer" */ 768 769int sindex(char ch,char *string) 770{ 771 char *cp; 772 for(cp=string;*cp;++cp) 773 if(ch==*cp) 774 return (int)(cp-string); /* return postion of character */ 775 return -1; /* eol ... no match found */ 776} 777 778/* routine to store a character in a string ... used only by "tokenizer" */ 779 780void chstore(char *string,int max,char ch) 781{ 782 char c; 783 if(_p_tokpos>=0&&_p_tokpos<max-1) 784 { 785 if(_p_state==IN_QUOTE) 786 c=ch; 787 else 788 switch(_p_flag&3) 789 { 790 case 1: /* convert to upper */ 791 c=toupper(ch); 792 break; 793 794 case 2: /* convert to lower */ 795 c=tolower(ch); 796 break; 797 798 default: /* use as is */ 799 c=ch; 800 break; 801 } 802 string[_p_tokpos++]=c; 803 } 804 return; 805} 806 807int tokenizer(unsigned inflag,char *token,int tokmax,char *line, 808 char *white,char *brkchar,char *quote,char eschar,char *brkused, 809 int *next,char *quoted) 810{ 811 int qp; 812 char c,nc; 813 814 *brkused=0; /* initialize to null */ 815 *quoted=0; /* assume not quoted */ 816 817 if(!line[*next]) /* if we're at end of line, indicate such */ 818 return 1; 819 820 _p_state=IN_WHITE; /* initialize state */ 821 _p_curquote=0; /* initialize previous quote char */ 822 _p_flag=inflag; /* set option flag */ 823 824 for(_p_tokpos=0;(c=line[*next]);++(*next)) /* main loop */ 825 { 826 if((qp=sindex(c,brkchar))>=0) /* break */ 827 { 828 switch(_p_state) 829 { 830 case IN_WHITE: /* these are the same here ... */ 831 case IN_TOKEN: /* ... just get out */ 832 case IN_OZONE: /* ditto */ 833 ++(*next); 834 *brkused=brkchar[qp]; 835 goto byebye; 836 837 case IN_QUOTE: /* just keep going */ 838 chstore(token,tokmax,c); 839 break; 840 } 841 } 842 else if((qp=sindex(c,quote))>=0) /* quote */ 843 { 844 switch(_p_state) 845 { 846 case IN_WHITE: /* these are identical, */ 847 _p_state=IN_QUOTE; /* change states */ 848 _p_curquote=quote[qp]; /* save quote char */ 849 *quoted=1; /* set to true as long as something is in quotes */ 850 break; 851 852 case IN_QUOTE: 853 if(quote[qp]==_p_curquote) /* same as the beginning quote? */ 854 { 855 _p_state=IN_OZONE; 856 _p_curquote=0; 857 } 858 else 859 chstore(token,tokmax,c); /* treat as regular char */ 860 break; 861 862 case IN_TOKEN: 863 case IN_OZONE: 864 *brkused=c; /* uses quote as break char */ 865 goto byebye; 866 } 867 } 868 else if((qp=sindex(c,white))>=0) /* white */ 869 { 870 switch(_p_state) 871 { 872 case IN_WHITE: 873 case IN_OZONE: 874 break; /* keep going */ 875 876 case IN_TOKEN: 877 _p_state=IN_OZONE; 878 break; 879 880 case IN_QUOTE: 881 chstore(token,tokmax,c); /* it's valid here */ 882 break; 883 } 884 } 885 else if(c==eschar) /* escape */ 886 { 887 nc=line[(*next)+1]; 888 if(nc==0) /* end of line */ 889 { 890 *brkused=0; 891 chstore(token,tokmax,c); 892 ++(*next); 893 goto byebye; 894 } 895 switch(_p_state) 896 { 897 case IN_WHITE: 898 --(*next); 899 _p_state=IN_TOKEN; 900 break; 901 902 case IN_TOKEN: 903 case IN_QUOTE: 904 ++(*next); 905 chstore(token,tokmax,nc); 906 break; 907 908 case IN_OZONE: 909 goto byebye; 910 } 911 } 912 else /* anything else is just a real character */ 913 { 914 switch(_p_state) 915 { 916 case IN_WHITE: 917 _p_state=IN_TOKEN; /* switch states */ 918 919 case IN_TOKEN: /* these 2 are */ 920 case IN_QUOTE: /* identical here */ 921 chstore(token,tokmax,c); 922 break; 923 924 case IN_OZONE: 925 goto byebye; 926 } 927 } 928 } /* end of main loop */ 929 930byebye: 931 token[_p_tokpos]=0; /* make sure token ends with EOS */ 932 933 return 0; 934} 935/* 936 * Local Variables: 937 * mode: c 938 * c-basic-offset: 8 939 * fill-column: 78 940 * End: 941 */ 942