1/*	$NetBSD$	*/
2
3/* Messy DOS-specific code for correctly treating binary, Unix text
4   and DOS text files.
5
6   This has several aspects:
7
8     * Guessing the file type (unless the user tells us);
9     * Stripping CR characters from DOS text files (otherwise regex
10       functions won't work correctly);
11     * Reporting correct byte count with -b for any kind of file.
12
13*/
14
/* How the contents of an input file are classified.  The order of
   the enumerators is significant: UNKNOWN must stay first so that it
   has the value zero.  */
typedef enum
{
  UNKNOWN,      /* not decided yet; to be guessed from the contents */
  DOS_BINARY,   /* binary data, e.g. a NUL byte was found */
  DOS_TEXT,     /* text in which a CR-LF pair was found */
  UNIX_TEXT     /* text in which no CR-LF pair was found */
} File_type;
18
/* One entry of the table used to translate a position in the
   CR-stripped text into a position in the file as stored on disk.  */
struct dos_map
{
  /* position in buffer passed to matcher */
  off_t pos;

  /* how much to add when reporting char position */
  off_t add;
};
23
24static int       dos_report_unix_offset = 0;
25
26static File_type dos_file_type     = UNKNOWN;
27static File_type dos_use_file_type = UNKNOWN;
28static off_t     dos_stripped_crs  = 0;
29static struct dos_map *dos_pos_map;
30static int       dos_pos_map_size  = 0;
31static int       dos_pos_map_used  = 0;
32static int       inp_map_idx = 0, out_map_idx = 1;
33
34/* Guess DOS file type by looking at its contents.  */
35static inline File_type
36guess_type (char *buf, register size_t buflen)
37{
38  int crlf_seen = 0;
39  register char *bp = buf;
40
41  while (buflen--)
42    {
43      /* Treat a file as binary if it has a NUL character.  */
44      if (!*bp)
45        return DOS_BINARY;
46
47      /* CR before LF means DOS text file (unless we later see
48         binary characters).  */
49      else if (*bp == '\r' && buflen && bp[1] == '\n')
50        crlf_seen = 1;
51
52      bp++;
53    }
54
55  return crlf_seen ? DOS_TEXT : UNIX_TEXT;
56}
57
58/* Convert external DOS file representation to internal.
59   Return the count of characters left in the buffer.
60   Build table to map character positions when reporting byte counts.  */
61static inline int
62undossify_input (register char *buf, size_t buflen)
63{
64  int chars_left = 0;
65
66  if (totalcc == 0)
67    {
68      /* New file: forget everything we knew about character
69         position mapping table and file type.  */
70      inp_map_idx = 0;
71      out_map_idx = 1;
72      dos_pos_map_used = 0;
73      dos_stripped_crs = 0;
74      dos_file_type = dos_use_file_type;
75    }
76
77  /* Guess if this file is binary, unless we already know that.  */
78  if (dos_file_type == UNKNOWN)
79    dos_file_type = guess_type(buf, buflen);
80
81  /* If this file is to be treated as DOS Text, strip the CR characters
82     and maybe build the table for character position mapping on output.  */
83  if (dos_file_type == DOS_TEXT)
84    {
85      char   *destp   = buf;
86
87      while (buflen--)
88        {
89          if (*buf != '\r')
90            {
91              *destp++ = *buf++;
92              chars_left++;
93            }
94          else
95            {
96              buf++;
97              if (out_byte && !dos_report_unix_offset)
98                {
99                  dos_stripped_crs++;
100                  while (buflen && *buf == '\r')
101                    {
102                      dos_stripped_crs++;
103                      buflen--;
104                      buf++;
105                    }
106                  if (inp_map_idx >= dos_pos_map_size - 1)
107                    {
108                      dos_pos_map_size = inp_map_idx ? inp_map_idx * 2 : 1000;
109                      dos_pos_map =
110                        (struct dos_map *)xrealloc((char *)dos_pos_map,
111						   dos_pos_map_size *
112						   sizeof(struct dos_map));
113                    }
114
115                  if (!inp_map_idx)
116                    {
117                      /* Add sentinel entry.  */
118                      dos_pos_map[inp_map_idx].pos = 0;
119                      dos_pos_map[inp_map_idx++].add = 0;
120
121                      /* Initialize first real entry.  */
122                      dos_pos_map[inp_map_idx].add = 0;
123                    }
124
125                  /* Put the new entry.  If the stripped CR characters
126                     precede a Newline (the usual case), pretend that
127                     they were found *after* the Newline.  This makes
128                     displayed byte offsets more reasonable in some
129                     cases, and fits better the intuitive notion that
130                     the line ends *before* the CR, not *after* it.  */
131                  inp_map_idx++;
132                  dos_pos_map[inp_map_idx-1].pos =
133                    (*buf == '\n' ? destp + 1 : destp ) - bufbeg + totalcc;
134                  dos_pos_map[inp_map_idx].add = dos_stripped_crs;
135                  dos_pos_map_used = inp_map_idx;
136
137                  /* The following will be updated on the next pass.  */
138                  dos_pos_map[inp_map_idx].pos = destp - bufbeg + totalcc + 1;
139                }
140            }
141        }
142
143      return chars_left;
144    }
145
146  return buflen;
147}
148
149/* Convert internal byte count into external.  */
150static inline off_t
151dossified_pos (off_t byteno)
152{
153  off_t pos_lo;
154  off_t pos_hi;
155
156  if (dos_file_type != DOS_TEXT || dos_report_unix_offset)
157    return byteno;
158
159  /* Optimization: usually the file will be scanned sequentially.
160     So in most cases, this byte position will be found in the
161     table near the previous one, as recorded in `out_map_idx'.  */
162  pos_lo = dos_pos_map[out_map_idx-1].pos;
163  pos_hi = dos_pos_map[out_map_idx].pos;
164
165  /* If the initial guess failed, search up or down, as
166     appropriate, beginning with the previous place.  */
167  if (byteno >= pos_hi)
168    {
169      out_map_idx++;
170      while (out_map_idx < dos_pos_map_used &&
171             byteno >= dos_pos_map[out_map_idx].pos)
172        out_map_idx++;
173    }
174
175  else if (byteno < pos_lo)
176    {
177      out_map_idx--;
178      while (out_map_idx > 1 && byteno < dos_pos_map[out_map_idx-1].pos)
179        out_map_idx--;
180    }
181
182  return byteno + dos_pos_map[out_map_idx].add;
183}
184