mbutil.c revision 136647
1/* mbutil.c -- readline multibyte character utility functions */
2
3/* Copyright (C) 2001-2004 Free Software Foundation, Inc.
4
5   This file is part of the GNU Readline Library, a library for
6   reading lines of text with interactive input and history editing.
7
8   The GNU Readline Library is free software; you can redistribute it
9   and/or modify it under the terms of the GNU General Public License
10   as published by the Free Software Foundation; either version 2, or
11   (at your option) any later version.
12
13   The GNU Readline Library is distributed in the hope that it will be
14   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
15   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   GNU General Public License for more details.
17
18   The GNU General Public License is often shipped with GNU software, and
19   is generally kept in a file called COPYING or LICENSE.  If you do not
20   have a copy of the license, write to the Free Software Foundation,
21   59 Temple Place, Suite 330, Boston, MA 02111 USA. */
22#define READLINE_LIBRARY
23
24#if defined (HAVE_CONFIG_H)
25#  include <config.h>
26#endif
27
28#include <sys/types.h>
29#include <fcntl.h>
30#include "posixjmp.h"
31
32#if defined (HAVE_UNISTD_H)
33#  include <unistd.h>	   /* for _POSIX_VERSION */
34#endif /* HAVE_UNISTD_H */
35
36#if defined (HAVE_STDLIB_H)
37#  include <stdlib.h>
38#else
39#  include "ansi_stdlib.h"
40#endif /* HAVE_STDLIB_H */
41
42#include <stdio.h>
43#include <ctype.h>
44
45/* System-specific feature definitions and include files. */
46#include "rldefs.h"
47#include "rlmbutil.h"
48
49#if defined (TIOCSTAT_IN_SYS_IOCTL)
50#  include <sys/ioctl.h>
51#endif /* TIOCSTAT_IN_SYS_IOCTL */
52
53/* Some standard library routines. */
54#include "readline.h"
55
56#include "rlprivate.h"
57#include "xmalloc.h"
58
/* Defined here so it can be shared between the readline and history
   libraries.  The initial value is 0 when the library is compiled with
   HANDLE_MULTIBYTE and 1 otherwise. */
#if defined (HANDLE_MULTIBYTE)
int rl_byte_oriented = 0;
#else
int rl_byte_oriented = 1;
#endif
66
67/* **************************************************************** */
68/*								    */
69/*		Multibyte Character Utility Functions		    */
70/*								    */
71/* **************************************************************** */
72
73#if defined(HANDLE_MULTIBYTE)
74
/* Return the byte index reached by moving forward COUNT characters in
   STRING, starting at byte index SEED.

   A negative SEED is treated as 0.  If COUNT <= 0, SEED is returned
   unchanged.  If SEED does not fall on a character boundary (as reported
   by _rl_adjust_point), the position is first advanced to the next
   boundary and that advance consumes one unit of COUNT.

   Bytes that mbrtowc reports as invalid or incomplete are each counted as
   one single-byte character.  If FIND_NON_ZERO is non-zero, characters
   whose wcwidth is 0 are stepped over without being counted, and any
   zero-width characters that immediately follow the final position are
   skipped as well.

   The scan never moves past the terminating NUL of STRING.

   NOTE(review): _rl_adjust_point returns -1 when SEED is greater than
   strlen (STRING); that case is not handled specially here, so callers
   are expected to pass a SEED that lies inside STRING. */
static int
_rl_find_next_mbchar_internal (string, seed, count, find_non_zero)
     char *string;
     int seed, count, find_non_zero;
{
  size_t tmp = 0;
  size_t len;
  mbstate_t ps;
  int point = 0;
  wchar_t wc;

  memset(&ps, 0, sizeof (mbstate_t));
  if (seed < 0)
    seed = 0;
  if (count <= 0)
    return seed;

  point = seed + _rl_adjust_point(string, seed, &ps);
  /* If this is true, seed did not point at the first byte of a
     character.  Use the corrected point and consume one count. */
  if (seed < point)
    count--;

  while (count > 0)
    {
      /* Stop at the end of the string.  Calling mbrtowc with a length of
         zero yields (size_t)-2, which would otherwise be mistaken for an
         invalid byte and push point beyond the terminating NUL. */
      len = strlen (string + point);
      if (len == 0)
        break;

      tmp = mbrtowc (&wc, string + point, len, &ps);
      if (MB_INVALIDCH ((size_t)tmp))
        {
          /* Invalid bytes.  Assume a byte represents a character. */
          point++;
          count--;
          /* Reset the conversion state. */
          memset(&ps, 0, sizeof(mbstate_t));
        }
      else if (MB_NULLWCH (tmp))
        break;			/* found wide '\0' */
      else
        {
          /* Valid bytes. */
          point += tmp;
          if (find_non_zero)
            {
              /* Zero-width characters do not use up any of count. */
              if (wcwidth (wc) == 0)
                continue;
              else
                count--;
            }
          else
            count--;
        }
    }

  if (find_non_zero)
    {
      /* Skip zero-width characters that follow the final position.  The
         return value of mbrtowc must be checked before wc is examined:
         on an error return wc is not written, and adding (size_t)-1 or
         (size_t)-2 to point would move it backwards. */
      tmp = mbrtowc (&wc, string + point, strlen (string + point), &ps);
      while (MB_NULLWCH (tmp) == 0 && MB_INVALIDCH (tmp) == 0 && wcwidth (wc) == 0)
        {
          point += tmp;
          tmp = mbrtowc (&wc, string + point, strlen (string + point), &ps);
        }
    }

  return point;
}
139
/* Return the byte index at which the character preceding byte index SEED
   in STRING begins.  STRING is scanned from its start with mbrtowc, and
   the start of the last character that begins before SEED is remembered.

   Returns 0 if SEED is negative and strlen (STRING) if SEED lies past the
   end of STRING.  A byte that mbrtowc reports as invalid or incomplete is
   treated as a one-byte, non-zero-width character.  If FIND_NON_ZERO is
   non-zero, characters whose wcwidth is 0 are not remembered, so the
   result is the start of the last non-zero-width character before SEED. */
static int
_rl_find_prev_mbchar_internal (string, seed, find_non_zero)
     char *string;
     int seed, find_non_zero;
{
  mbstate_t state;
  wchar_t wch;
  size_t nbytes;
  int cur, last_start, total;

  memset (&state, 0, sizeof (mbstate_t));
  total = strlen (string);

  if (seed < 0)
    return 0;
  if (seed > total)
    return total;

  last_start = 0;
  for (cur = 0; cur < seed; cur += nbytes)
    {
      nbytes = mbrtowc (&wch, string + cur, total - cur, &state);

      if (MB_INVALIDCH ((size_t)nbytes))
        {
          /* Invalid or truncated sequence: count the first byte as a
             character of its own and remember it, since it is assumed
             to have non-zero width. */
          nbytes = 1;
          /* The conversion state is undefined after an error, so start
             over with a fresh one. */
          memset (&state, 0, sizeof (mbstate_t));
          last_start = cur;
          continue;
        }

      if (MB_NULLWCH (nbytes))
        break;			/* Found a '\0' character. */

      /* wcwidth is consulted only when zero-width characters are to be
         passed over. */
      if (find_non_zero == 0 || wcwidth (wch) != 0)
        last_start = cur;
    }

  return last_start;
}
194
/* Measure the multibyte character that starts at SRC, using mbrlen over
   the strlen (SRC) bytes available and the conversion state PS.

   Returns the number of bytes in the character when mbrlen recognizes
   one, 0 when mbrlen reports a null wide character, -1 when the bytes
   form an invalid sequence, and -2 when they do not form a complete
   character.  In the -1 and -2 cases *PS is cleared, if PS is non-null,
   so that the next conversion starts from the initial state. */
int
_rl_get_char_len (src, ps)
     char *src;
     mbstate_t *ps;
{
  size_t nbytes;
  int failure;

  nbytes = mbrlen ((const char *)src, (size_t)strlen (src), ps);

  if (nbytes == (size_t)(-2))
    failure = -2;		/* too short to compose a multibyte char */
  else if (nbytes == (size_t)(-1))
    failure = -1;		/* invalid multibyte sequence */
  else
    return (int)nbytes;		/* 0 for a null wide char, else the length */

  /* Both failure cases reinitialize the conversion state. */
  if (ps)
    memset (ps, 0, sizeof (mbstate_t));
  return failure;
}
228
/* Compare the multibyte character at BUF1[POS1] with the one at
   BUF2[POS2].  The length of each is obtained from _rl_get_char_len using
   the conversion states PS1 and PS2 respectively.  Return 1 if both
   lengths are positive and equal and all of the bytes match; otherwise
   return 0.  The second character is not measured (and PS2 is not used)
   when the first one does not have a positive length. */
int
_rl_compare_chars (buf1, pos1, ps1, buf2, pos2, ps2)
     char *buf1;
     int pos1;
     mbstate_t *ps1;
     char *buf2;
     int pos2;
     mbstate_t *ps2;
{
  int len1, len2, k;

  len1 = _rl_get_char_len (&buf1[pos1], ps1);
  if (len1 <= 0)
    return 0;

  len2 = _rl_get_char_len (&buf2[pos2], ps2);
  if (len2 <= 0)
    return 0;

  if (len1 != len2)
    return 0;

  /* Same length: the characters match only if every byte agrees. */
  for (k = 0; k < len1; k++)
    {
      if (buf1[pos1 + k] != buf2[pos2 + k])
        return 0;
    }

  return 1;
}
254
/* Walk STRING from its beginning one multibyte character at a time, as
   measured by mbrlen with conversion state PS, until the position reaches
   or passes byte index POINT.  Return the number of bytes by which the
   final position exceeds POINT; this is 0 when POINT already lies on a
   position the walk stops at, and positive otherwise.

   A sequence that mbrlen reports as invalid or incomplete is treated as a
   single one-byte character, and *PS is cleared (if PS is non-null)
   because the conversion state is undefined after such an error.

   Returns -1 if POINT is invalid, that is, negative or greater than the
   length of STRING. */
int
_rl_adjust_point(string, point, ps)
     char *string;
     int point;
     mbstate_t *ps;
{
  size_t nbytes;
  int total, cur;

  total = strlen (string);
  if (point < 0 || point > total)
    return -1;

  for (cur = 0; cur < point; )
    {
      nbytes = mbrlen (string + cur, total - cur, ps);

      if (MB_INVALIDCH ((size_t)nbytes))
        {
          /* Assume the first byte stands for a character by itself. */
          cur++;
          if (ps)
            memset (ps, 0, sizeof (mbstate_t));
        }
      else if (MB_NULLWCH (nbytes))
        cur++;
      else
        cur += nbytes;
    }

  return (cur - point);
}
298
/* Return 1 if the LENGTH bytes of MBCHAR are equal to the LENGTH bytes of
   STRING that start at byte index SEED, and 0 if they differ or if fewer
   than LENGTH bytes lie between SEED and END. */
int
_rl_is_mbchar_matched (string, seed, end, mbchar, length)
     char *string;
     int seed, end;
     char *mbchar;
     int length;
{
  int k;

  /* Not enough room between seed and end to hold mbchar. */
  if (length > (end - seed))
    return 0;

  k = 0;
  while (k < length)
    {
      if (string[seed + k] != mbchar[k])
        return 0;
      k++;
    }

  return 1;
}
316#endif /* HANDLE_MULTIBYTE */
317
/* Return the byte index found by moving forward COUNT characters in
   STRING from byte index SEED.  When built with HANDLE_MULTIBYTE the work
   is done by _rl_find_next_mbchar_internal, which receives FLAGS as its
   find_non_zero argument (pass MB_FIND_NONZERO to look for
   non-zero-width multibyte characters).  Otherwise every character is
   one byte and the result is simply SEED + COUNT. */
#undef _rl_find_next_mbchar
int
_rl_find_next_mbchar (string, seed, count, flags)
     char *string;
     int seed, count, flags;
{
  int next;

#if defined (HANDLE_MULTIBYTE)
  next = _rl_find_next_mbchar_internal (string, seed, count, flags);
#else
  next = seed + count;
#endif

  return next;
}
333
/* Return the byte index at which the character before byte index SEED in
   STRING starts.  When built with HANDLE_MULTIBYTE the work is done by
   _rl_find_prev_mbchar_internal, which receives FLAGS as its
   find_non_zero argument (pass MB_FIND_NONZERO to look for
   non-zero-width multibyte characters).  Otherwise every character is
   one byte: the result is SEED - 1, or SEED itself when SEED is 0. */
#undef _rl_find_prev_mbchar
int
_rl_find_prev_mbchar (string, seed, flags)
     char *string;
     int seed, flags;
{
  int prev;

#if defined (HANDLE_MULTIBYTE)
  prev = _rl_find_prev_mbchar_internal (string, seed, flags);
#else
  prev = seed;
  if (prev != 0)
    prev--;
#endif

  return prev;
}
349