dosbuf.c revision 1.1.1.1
1/* $NetBSD: dosbuf.c,v 1.1.1.1 2016/01/10 21:36:21 christos Exp $ */ 2 3/* Messy DOS-specific code for correctly treating binary, Unix text 4 and DOS text files. 5 6 This has several aspects: 7 8 * Guessing the file type (unless the user tells us); 9 * Stripping CR characters from DOS text files (otherwise regex 10 functions won't work correctly); 11 * Reporting correct byte count with -b for any kind of file. 12 13*/ 14 15typedef enum { 16 UNKNOWN, DOS_BINARY, DOS_TEXT, UNIX_TEXT 17} File_type; 18 19struct dos_map { 20 off_t pos; /* position in buffer passed to matcher */ 21 off_t add; /* how much to add when reporting char position */ 22}; 23 24static int dos_report_unix_offset = 0; 25 26static File_type dos_file_type = UNKNOWN; 27static File_type dos_use_file_type = UNKNOWN; 28static off_t dos_stripped_crs = 0; 29static struct dos_map *dos_pos_map; 30static int dos_pos_map_size = 0; 31static int dos_pos_map_used = 0; 32static int inp_map_idx = 0, out_map_idx = 1; 33 34/* Guess DOS file type by looking at its contents. */ 35static inline File_type 36guess_type (char *buf, register size_t buflen) 37{ 38 int crlf_seen = 0; 39 register char *bp = buf; 40 41 while (buflen--) 42 { 43 /* Treat a file as binary if it has a NUL character. */ 44 if (!*bp) 45 return DOS_BINARY; 46 47 /* CR before LF means DOS text file (unless we later see 48 binary characters). */ 49 else if (*bp == '\r' && buflen && bp[1] == '\n') 50 crlf_seen = 1; 51 52 bp++; 53 } 54 55 return crlf_seen ? DOS_TEXT : UNIX_TEXT; 56} 57 58/* Convert external DOS file representation to internal. 59 Return the count of characters left in the buffer. 60 Build table to map character positions when reporting byte counts. */ 61static inline int 62undossify_input (register char *buf, size_t buflen) 63{ 64 int chars_left = 0; 65 66 if (totalcc == 0) 67 { 68 /* New file: forget everything we knew about character 69 position mapping table and file type. */ 70 inp_map_idx = 0; 71 out_map_idx = 1; 72 dos_pos_map_used = 0; 73 dos_stripped_crs = 0; 74 dos_file_type = dos_use_file_type; 75 } 76 77 /* Guess if this file is binary, unless we already know that. */ 78 if (dos_file_type == UNKNOWN) 79 dos_file_type = guess_type(buf, buflen); 80 81 /* If this file is to be treated as DOS Text, strip the CR characters 82 and maybe build the table for character position mapping on output. */ 83 if (dos_file_type == DOS_TEXT) 84 { 85 char *destp = buf; 86 87 while (buflen--) 88 { 89 if (*buf != '\r') 90 { 91 *destp++ = *buf++; 92 chars_left++; 93 } 94 else 95 { 96 buf++; 97 if (out_byte && !dos_report_unix_offset) 98 { 99 dos_stripped_crs++; 100 while (buflen && *buf == '\r') 101 { 102 dos_stripped_crs++; 103 buflen--; 104 buf++; 105 } 106 if (inp_map_idx >= dos_pos_map_size - 1) 107 { 108 dos_pos_map_size = inp_map_idx ? inp_map_idx * 2 : 1000; 109 dos_pos_map = 110 (struct dos_map *)xrealloc((char *)dos_pos_map, 111 dos_pos_map_size * 112 sizeof(struct dos_map)); 113 } 114 115 if (!inp_map_idx) 116 { 117 /* Add sentinel entry. */ 118 dos_pos_map[inp_map_idx].pos = 0; 119 dos_pos_map[inp_map_idx++].add = 0; 120 121 /* Initialize first real entry. */ 122 dos_pos_map[inp_map_idx].add = 0; 123 } 124 125 /* Put the new entry. If the stripped CR characters 126 precede a Newline (the usual case), pretend that 127 they were found *after* the Newline. This makes 128 displayed byte offsets more reasonable in some 129 cases, and fits better the intuitive notion that 130 the line ends *before* the CR, not *after* it. */ 131 inp_map_idx++; 132 dos_pos_map[inp_map_idx-1].pos = 133 (*buf == '\n' ? destp + 1 : destp ) - bufbeg + totalcc; 134 dos_pos_map[inp_map_idx].add = dos_stripped_crs; 135 dos_pos_map_used = inp_map_idx; 136 137 /* The following will be updated on the next pass. */ 138 dos_pos_map[inp_map_idx].pos = destp - bufbeg + totalcc + 1; 139 } 140 } 141 } 142 143 return chars_left; 144 } 145 146 return buflen; 147} 148 149/* Convert internal byte count into external. */ 150static inline off_t 151dossified_pos (off_t byteno) 152{ 153 off_t pos_lo; 154 off_t pos_hi; 155 156 if (dos_file_type != DOS_TEXT || dos_report_unix_offset) 157 return byteno; 158 159 /* Optimization: usually the file will be scanned sequentially. 160 So in most cases, this byte position will be found in the 161 table near the previous one, as recorded in `out_map_idx'. */ 162 pos_lo = dos_pos_map[out_map_idx-1].pos; 163 pos_hi = dos_pos_map[out_map_idx].pos; 164 165 /* If the initial guess failed, search up or down, as 166 appropriate, beginning with the previous place. */ 167 if (byteno >= pos_hi) 168 { 169 out_map_idx++; 170 while (out_map_idx < dos_pos_map_used && 171 byteno >= dos_pos_map[out_map_idx].pos) 172 out_map_idx++; 173 } 174 175 else if (byteno < pos_lo) 176 { 177 out_map_idx--; 178 while (out_map_idx > 1 && byteno < dos_pos_map[out_map_idx-1].pos) 179 out_map_idx--; 180 } 181 182 return byteno + dos_pos_map[out_map_idx].add; 183} 184