mbutil.c revision 119610
1/* mbutil.c -- readline multibyte character utility functions */
2
3/* Copyright (C) 2001 Free Software Foundation, Inc.
4
5   This file is part of the GNU Readline Library, a library for
6   reading lines of text with interactive input and history editing.
7
8   The GNU Readline Library is free software; you can redistribute it
9   and/or modify it under the terms of the GNU General Public License
10   as published by the Free Software Foundation; either version 2, or
11   (at your option) any later version.
12
13   The GNU Readline Library is distributed in the hope that it will be
14   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
15   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   GNU General Public License for more details.
17
18   The GNU General Public License is often shipped with GNU software, and
19   is generally kept in a file called COPYING or LICENSE.  If you do not
20   have a copy of the license, write to the Free Software Foundation,
21   59 Temple Place, Suite 330, Boston, MA 02111 USA. */
22#define READLINE_LIBRARY
23
24#if defined (HAVE_CONFIG_H)
25#  include <config.h>
26#endif
27
28#include <sys/types.h>
29#include <fcntl.h>
30#include "posixjmp.h"
31
32#if defined (HAVE_UNISTD_H)
33#  include <unistd.h>	   /* for _POSIX_VERSION */
34#endif /* HAVE_UNISTD_H */
35
36#if defined (HAVE_STDLIB_H)
37#  include <stdlib.h>
38#else
39#  include "ansi_stdlib.h"
40#endif /* HAVE_STDLIB_H */
41
42#include <stdio.h>
43#include <ctype.h>
44
45/* System-specific feature definitions and include files. */
46#include "rldefs.h"
47#include "rlmbutil.h"
48
49#if defined (TIOCSTAT_IN_SYS_IOCTL)
50#  include <sys/ioctl.h>
51#endif /* TIOCSTAT_IN_SYS_IOCTL */
52
53/* Some standard library routines. */
54#include "readline.h"
55
56#include "rlprivate.h"
57#include "xmalloc.h"
58
/* Declared here so it can be shared between the readline and history
   libraries.  The initial value is 0 when this file is compiled with
   HANDLE_MULTIBYTE defined and 1 otherwise. */
#if defined (HANDLE_MULTIBYTE)
int rl_byte_oriented = 0;
#else
int rl_byte_oriented = 1;
#endif
66
67/* **************************************************************** */
68/*								    */
69/*		Multibyte Character Utility Functions		    */
70/*								    */
71/* **************************************************************** */
72
73#if defined(HANDLE_MULTIBYTE)
74
/* Return the byte offset in STRING reached by moving forward COUNT
   characters from byte offset SEED.

   A negative SEED is treated as 0.  If COUNT <= 0, SEED is returned
   unchanged.  Otherwise, a SEED beyond the end of STRING yields the
   length of STRING, and the result never exceeds that length.

   If SEED does not fall on the first byte of a character, moving up to
   the start of the next character consumes one step of COUNT.  A byte
   that does not begin a valid multibyte character is counted as one
   character.  When FIND_NON_ZERO is non-zero, zero-width characters are
   not counted, and any zero-width characters immediately following the
   final position are skipped as well. */
static int
_rl_find_next_mbchar_internal (string, seed, count, find_non_zero)
     char *string;
     int seed, count, find_non_zero;
{
  size_t tmp;
  mbstate_t ps;
  int point, length;
  wchar_t wc;

  memset (&ps, 0, sizeof (mbstate_t));
  if (seed < 0)
    seed = 0;
  if (count <= 0)
    return seed;

  /* _rl_adjust_point reports an out-of-range SEED by returning -1, which
     must not be added to SEED.  Clamp here instead so every later access
     stays inside the string. */
  length = strlen (string);
  if (seed > length)
    return length;

  point = seed + _rl_adjust_point (string, seed, &ps);
  /* If this is true, SEED did not point at the first byte of a
     character.  The adjustment already moved forward, so it uses up
     one step of COUNT. */
  if (seed < point)
    count--;

  /* Stop at the end of the string: calling mbrtowc with a byte count of
     zero returns (size_t)-2 on many systems, which would otherwise be
     treated as an invalid byte and push POINT past the terminator. */
  while (count > 0 && point < length)
    {
      tmp = mbrtowc (&wc, string + point, length - point, &ps);
      if (tmp == (size_t)-1 || tmp == (size_t)-2)
	{
	  /* Invalid or incomplete bytes.  Assume one byte represents
	     one character. */
	  point++;
	  count--;
	  /* The conversion state is unspecified after an error. */
	  memset (&ps, 0, sizeof (mbstate_t));
	}
      else if (tmp == 0)
	/* Found a '\0' character. */
	break;
      else
	{
	  /* A valid character of TMP bytes. */
	  point += (int)tmp;
	  /* Zero-width characters do not count when FIND_NON_ZERO. */
	  if (find_non_zero == 0 || wcwidth (wc) != 0)
	    count--;
	}
    }

  if (find_non_zero)
    {
      /* Skip zero-width characters that follow the final position.  WC
	 is only meaningful when mbrtowc decoded a character, so check
	 the return value before consulting wcwidth. */
      while (point < length)
	{
	  tmp = mbrtowc (&wc, string + point, length - point, &ps);
	  if (tmp == (size_t)0 || tmp == (size_t)-1 || tmp == (size_t)-2)
	    break;
	  if (wcwidth (wc) != 0)
	    break;
	  point += (int)tmp;
	}
    }

  return point;
}
140
/* Return the byte offset in STRING of the start of the character that
   precedes byte offset SEED.  Returns 0 if SEED is negative and the
   length of STRING if SEED is beyond the end.

   The string is scanned from the beginning so that character boundaries
   are determined with the correct conversion state.  A byte that does
   not begin a valid multibyte character is treated as a single
   character.  When FIND_NON_ZERO is non-zero, zero-width characters are
   not recorded as candidate positions, so the result is the start of
   the preceding character whose width is not zero. */
static int
_rl_find_prev_mbchar_internal (string, seed, find_non_zero)
     char *string;
     int seed, find_non_zero;
{
  mbstate_t ps;
  int prev, point, length;
  size_t tmp;
  wchar_t wc;

  memset (&ps, 0, sizeof (mbstate_t));
  length = strlen (string);

  if (seed < 0)
    return 0;
  else if (length < seed)
    return length;

  prev = point = 0;
  while (point < seed)
    {
      tmp = mbrtowc (&wc, string + point, length - point, &ps);
      if (tmp == (size_t)-1 || tmp == (size_t)-2)
	{
	  /* The bytes are invalid or too short to form a multibyte
	     character, so assume the first byte represents a single
	     character. */
	  tmp = 1;
	  /* The conversion state is unspecified after an error. */
	  memset (&ps, 0, sizeof (mbstate_t));
	  /* This byte is being counted as a character, so it must also
	     be remembered as a possible previous position.  Otherwise
	     backing up over it would skip an extra character. */
	  prev = point;
	}
      else if (tmp == 0)
	break;			/* Found '\0' char.  Can this happen? */
      else
	{
	  /* Zero-width characters are not candidates when
	     FIND_NON_ZERO. */
	  if (find_non_zero == 0 || wcwidth (wc) != 0)
	    prev = point;
	}

      point += (int)tmp;
    }

  return prev;
}
191
/* Measure the multibyte character at the start of SRC using mbrlen and
   the conversion state PS, examining at most strlen (SRC) bytes.
   Returns the byte count reported by mbrlen as an int, which is 0 when
   mbrlen reports a null wide character.  Returns -1 if mbrlen reports an
   invalid sequence and -2 if it reports an incomplete one; in both of
   those cases *PS is reset to the initial state. */
int
_rl_get_char_len (src, ps)
     char *src;
     mbstate_t *ps;
{
  size_t nbytes;

  nbytes = mbrlen ((const char *)src, (size_t)strlen (src), ps);

  if (nbytes == (size_t)(-1) || nbytes == (size_t)(-2))
    {
      /* Invalid or truncated sequence: start over from the initial
	 conversion state. */
      memset (ps, 0, sizeof (mbstate_t));
      return (nbytes == (size_t)(-1)) ? -1 : -2;
    }

  return (int)nbytes;
}
223
/* Compare the character at BUF1[POS1] with the character at BUF2[POS2],
   using conversion states PS1 and PS2 to measure them.  Return 1 if both
   have the same positive byte length and identical bytes; return 0
   otherwise.  The second character is only measured when the first one
   has a positive length. */
int
_rl_compare_chars (buf1, pos1, ps1, buf2, pos2, ps2)
     char *buf1, *buf2;
     mbstate_t *ps1, *ps2;
     int pos1, pos2;
{
  int len1, len2, k;

  len1 = _rl_get_char_len (&buf1[pos1], ps1);
  if (len1 <= 0)
    return 0;

  len2 = _rl_get_char_len (&buf2[pos2], ps2);
  if (len2 <= 0 || len1 != len2)
    return 0;

  /* Same length: the characters match only if every byte does. */
  for (k = 0; k < len1; k++)
    {
      if (buf1[pos1 + k] != buf2[pos2 + k])
	return 0;
    }

  return 1;
}
246
/* Walk STRING from its beginning, one character at a time, until the
   position is at or beyond byte offset POINT, updating the conversion
   state PS along the way.  Return the distance from POINT to that
   position (adjusted_point - POINT, never negative for a valid POINT).
   A byte that cannot be decoded is stepped over singly and *PS is reset.
   Return -1 if POINT is negative or greater than the length of STRING. */
int
_rl_adjust_point(string, point, ps)
     char *string;
     int point;
     mbstate_t *ps;
{
  size_t nbytes;
  int total, offset;

  total = strlen (string);
  if (point < 0 || point > total)
    return -1;

  for (offset = 0; offset < point; )
    {
      nbytes = mbrlen (string + offset, total - offset, ps);
      if (nbytes != (size_t)-1 && nbytes != (size_t)-2)
	{
	  offset += nbytes;
	  continue;
	}

      /* Invalid or incomplete sequence: treat the first byte as a
	 character by itself, and clear the state because its contents
	 are undefined after an error. */
      offset++;
      memset (ps, 0, sizeof (mbstate_t));
    }

  return (offset - point);
}
287
/* Return 1 if the LENGTH bytes of MBCHAR are identical to the LENGTH
   bytes of STRING starting at byte offset SEED, and 0 otherwise.  END
   bounds the comparison: if fewer than LENGTH bytes lie between SEED
   and END, the result is 0 without looking at any bytes. */
int
_rl_is_mbchar_matched (string, seed, end, mbchar, length)
     char *string;
     int seed, end;
     char *mbchar;
     int length;
{
  int k;

  if (length > (end - seed))
    return 0;

  k = 0;
  while (k < length)
    {
      if (string[seed + k] != mbchar[k])
	return 0;
      k++;
    }

  return 1;
}
305#endif /* HANDLE_MULTIBYTE */
306
/* Return the byte offset reached by moving forward COUNT characters in
   STRING from byte offset SEED.  With HANDLE_MULTIBYTE the work is done
   by _rl_find_next_mbchar_internal, which receives FLAGS as its
   find_non_zero argument (MB_FIND_NONZERO asks for non-zero-width
   characters).  Without it every byte is a character and the result is
   simply SEED + COUNT. */
#undef _rl_find_next_mbchar
int
_rl_find_next_mbchar (string, seed, count, flags)
     char *string;
     int seed, count, flags;
{
  int next;

#if defined (HANDLE_MULTIBYTE)
  next = _rl_find_next_mbchar_internal (string, seed, count, flags);
#else
  next = seed + count;
#endif

  return next;
}
322
/* Return the byte offset of the start of the character preceding byte
   offset SEED in STRING; the result is <= SEED.  With HANDLE_MULTIBYTE
   the work is done by _rl_find_prev_mbchar_internal, which receives
   FLAGS as its find_non_zero argument (MB_FIND_NONZERO asks for
   non-zero-width characters).  Without it the result is SEED - 1, or
   SEED itself when SEED is 0. */
#undef _rl_find_prev_mbchar
int
_rl_find_prev_mbchar (string, seed, flags)
     char *string;
     int seed, flags;
{
  int prev;

#if defined (HANDLE_MULTIBYTE)
  prev = _rl_find_prev_mbchar_internal (string, seed, flags);
#else
  if (seed == 0)
    prev = seed;
  else
    prev = seed - 1;
#endif

  return prev;
}
338