1/* mbutil.c -- readline multibyte character utility functions */
2
3/* Copyright (C) 2001-2005 Free Software Foundation, Inc.
4
5   This file is part of the GNU Readline Library, a library for
6   reading lines of text with interactive input and history editing.
7
8   The GNU Readline Library is free software; you can redistribute it
9   and/or modify it under the terms of the GNU General Public License
10   as published by the Free Software Foundation; either version 2, or
11   (at your option) any later version.
12
13   The GNU Readline Library is distributed in the hope that it will be
14   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
15   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   GNU General Public License for more details.
17
18   The GNU General Public License is often shipped with GNU software, and
19   is generally kept in a file called COPYING or LICENSE.  If you do not
20   have a copy of the license, write to the Free Software Foundation,
21   59 Temple Place, Suite 330, Boston, MA 02111 USA. */
22#define READLINE_LIBRARY
23
24#if defined (HAVE_CONFIG_H)
25#  include <config.h>
26#endif
27
28#include <sys/types.h>
29#include <fcntl.h>
30#include "posixjmp.h"
31
32#if defined (HAVE_UNISTD_H)
33#  include <unistd.h>	   /* for _POSIX_VERSION */
34#endif /* HAVE_UNISTD_H */
35
36#if defined (HAVE_STDLIB_H)
37#  include <stdlib.h>
38#else
39#  include "ansi_stdlib.h"
40#endif /* HAVE_STDLIB_H */
41
42#include <stdio.h>
43#include <ctype.h>
44
45/* System-specific feature definitions and include files. */
46#include "rldefs.h"
47#include "rlmbutil.h"
48
49#if defined (TIOCSTAT_IN_SYS_IOCTL)
50#  include <sys/ioctl.h>
51#endif /* TIOCSTAT_IN_SYS_IOCTL */
52
53/* Some standard library routines. */
54#include "readline.h"
55
56#include "rlprivate.h"
57#include "xmalloc.h"
58
/* Declared here so it can be shared between the readline and history
   libraries.  The initial value is 0 when the library is compiled with
   HANDLE_MULTIBYTE and 1 otherwise.  When it is non-zero, code in this
   file (e.g. _rl_char_value) treats each byte as one character instead
   of decoding multibyte sequences. */
#if defined (HANDLE_MULTIBYTE)
int rl_byte_oriented = 0;
#else
int rl_byte_oriented = 1;
#endif
66
67/* **************************************************************** */
68/*								    */
69/*		Multibyte Character Utility Functions		    */
70/*								    */
71/* **************************************************************** */
72
73#if defined(HANDLE_MULTIBYTE)
74
/* Return the byte offset in STRING reached by moving forward COUNT
   characters from byte offset SEED.

   A negative SEED is treated as 0.  If COUNT is not positive, SEED is
   returned unchanged.  If SEED lies past the end of STRING, the length of
   STRING is returned, as _rl_find_prev_mbchar_internal does.  If SEED does
   not fall on a character boundary, it is first advanced to the next
   boundary, and that adjustment consumes one unit of COUNT.

   Bytes that do not form a valid multibyte character are each counted as
   one character.  If FIND_NON_ZERO is non-zero, characters for which
   wcwidth() returns 0 are not counted, and any such characters directly
   following the final position are skipped as well. */
static int
_rl_find_next_mbchar_internal (string, seed, count, find_non_zero)
     char *string;
     int seed, count, find_non_zero;
{
  size_t tmp, len;
  mbstate_t ps;
  int point, adjust;
  wchar_t wc;

  memset (&ps, 0, sizeof (mbstate_t));
  if (seed < 0)
    seed = 0;
  if (count <= 0)
    return seed;

  /* _rl_adjust_point returns -1 when SEED is beyond the end of STRING.
     Using that value as an offset would make the strlen() calls below
     read past the terminating NUL, so stop at the end of the string. */
  adjust = _rl_adjust_point (string, seed, &ps);
  if (adjust < 0)
    return (int) strlen (string);

  point = seed + adjust;
  /* If this is true, SEED did not point at the first byte of a character.
     The point has been corrected, and the correction consumes a count. */
  if (seed < point)
    count--;

  while (count > 0)
    {
      len = strlen (string + point);
      if (len == 0)
        break;
      tmp = mbrtowc (&wc, string + point, len, &ps);
      if (MB_INVALIDCH (tmp))
        {
          /* Invalid bytes.  Assume a byte represents a character. */
          point++;
          count--;
          /* Reset the conversion state. */
          memset (&ps, 0, sizeof (mbstate_t));
        }
      else if (MB_NULLWCH (tmp))
        break;                  /* found wide '\0' */
      else
        {
          /* Valid bytes.  A zero-width character is stepped over without
             being counted when FIND_NON_ZERO is set. */
          point += tmp;
          if (find_non_zero == 0 || wcwidth (wc) != 0)
            count--;
        }
    }

  if (find_non_zero)
    {
      /* Skip over any zero-width characters that follow the final
         position. */
      tmp = mbrtowc (&wc, string + point, strlen (string + point), &ps);
      while (MB_NULLWCH (tmp) == 0 && MB_INVALIDCH (tmp) == 0 && wcwidth (wc) == 0)
        {
          point += tmp;
          tmp = mbrtowc (&wc, string + point, strlen (string + point), &ps);
        }
    }

  return point;
}
143
/* Return the byte offset in STRING at which the character preceding byte
   offset SEED starts.  A negative SEED yields 0; a SEED beyond the end of
   STRING yields the length of STRING.  STRING is scanned from its
   beginning, remembering the start of the most recent character seen
   before SEED.  If FIND_NON_ZERO is non-zero, characters for which
   wcwidth() returns 0 are not remembered. */
static int
_rl_find_prev_mbchar_internal (string, seed, find_non_zero)
     char *string;
     int seed, find_non_zero;
{
  mbstate_t state;
  wchar_t wch;
  size_t nbytes;
  int slen, pos, last_start;

  memset (&state, 0, sizeof (mbstate_t));
  slen = strlen (string);

  if (seed < 0)
    return 0;
  if (seed > slen)
    return slen;

  last_start = 0;
  for (pos = 0; pos < seed; pos += nbytes)
    {
      nbytes = mbrtowc (&wch, string + pos, slen - pos, &state);

      if (MB_INVALIDCH (nbytes))
        {
          /* Invalid or truncated sequence: treat this byte as a complete
             non-zero-width character, and restart from a clean conversion
             state since the old one is no longer meaningful. */
          nbytes = 1;
          memset (&state, 0, sizeof (mbstate_t));
          last_start = pos;
          continue;
        }

      if (MB_NULLWCH (nbytes))
        break;                  /* Found '\0' char.  Can this happen? */

      if (find_non_zero == 0 || wcwidth (wch) != 0)
        last_start = pos;
    }

  return last_start;
}
198
/* Return the length in bytes of the multibyte character starting at SRC,
   as reported by mbrlen() over strlen (SRC) bytes using conversion state
   PS.  Returns 0 if mbrlen() reports a null wide character, -1 if the
   bytes form an invalid multibyte sequence, and -2 if they are too short
   to form a complete character.  In the -1 and -2 cases *PS is reset to
   the initial state when PS is non-null. */
int
_rl_get_char_len (src, ps)
     char *src;
     mbstate_t *ps;
{
  size_t nbytes;

  nbytes = mbrlen ((const char *)src, (size_t)strlen (src), ps);

  if (nbytes == (size_t)(-1) || nbytes == (size_t)(-2))
    {
      /* Invalid or incomplete sequence: reinitialize the conversion
         state before reporting the failure. */
      if (ps)
        memset (ps, 0, sizeof (mbstate_t));
      return (nbytes == (size_t)(-1)) ? -1 : -2;
    }

  return (int)nbytes;
}
232
/* Compare the character at BUF1[POS1] with the character at BUF2[POS2],
   using conversion states PS1 and PS2 to measure them.  Return 1 if both
   are measured with a positive length by _rl_get_char_len, have the same
   length, and consist of identical bytes.  Otherwise return 0. */
int
_rl_compare_chars (buf1, pos1, ps1, buf2, pos2, ps2)
     char *buf1;
     int pos1;
     mbstate_t *ps1;
     char *buf2;
     int pos2;
     mbstate_t *ps2;
{
  int len1, len2, k;

  len1 = _rl_get_char_len (&buf1[pos1], ps1);
  if (len1 <= 0)
    return 0;

  /* The second character is only measured once the first is known to be
     usable, so PS2 is left untouched otherwise. */
  len2 = _rl_get_char_len (&buf2[pos2], ps2);
  if (len2 <= 0)
    return 0;

  if (len1 != len2)
    return 0;

  for (k = 0; k < len1; k++)
    {
      if (buf1[pos1 + k] != buf2[pos2 + k])
        return 0;
    }

  return 1;
}
258
/* Walk STRING character by character from its beginning, passing PS to
   mbrlen(), until the walk reaches or passes byte offset POINT.  The
   position reached is the adjusted point, with POINT <= adjusted point.
   Return the difference (adjusted point - POINT).  If POINT is invalid
   (POINT < 0 or greater than the length of STRING), return -1. */
int
_rl_adjust_point(string, point, ps)
     char *string;
     int point;
     mbstate_t *ps;
{
  size_t step;
  int slen, cursor;

  slen = strlen (string);
  if (point < 0 || point > slen)
    return -1;

  cursor = 0;
  while (cursor < point)
    {
      step = mbrlen (string + cursor, slen - cursor, ps);

      if (MB_INVALIDCH (step))
        {
          /* Invalid or truncated sequence: reset the conversion state,
             whose contents are undefined at this point, and treat the
             first byte as a single character. */
          if (ps)
            memset (ps, 0, sizeof (mbstate_t));
          step = 1;
        }
      else if (MB_NULLWCH (step))
        step = 1;

      cursor += step;
    }

  return (cursor - point);
}
302
/* Return 1 if the LENGTH bytes of MBCHAR are identical to the LENGTH bytes
   of STRING starting at byte offset SEED, and 0 otherwise.  END bounds the
   comparison: if fewer than LENGTH bytes lie between SEED and END, the
   result is 0 without looking at any bytes. */
int
_rl_is_mbchar_matched (string, seed, end, mbchar, length)
     char *string;
     int seed, end;
     char *mbchar;
     int length;
{
  int offset;

  if (length > (end - seed))
    return 0;

  offset = 0;
  while (offset < length)
    {
      if (string[seed + offset] != mbchar[offset])
        return 0;
      offset++;
    }

  return 1;
}
320
321wchar_t
322_rl_char_value (buf, ind)
323     char *buf;
324     int ind;
325{
326  size_t tmp;
327  wchar_t wc;
328  mbstate_t ps;
329  int l;
330
331  if (MB_LEN_MAX == 1 || rl_byte_oriented)
332    return ((wchar_t) buf[ind]);
333  l = strlen (buf);
334  if (ind >= l - 1)
335    return ((wchar_t) buf[ind]);
336  memset (&ps, 0, sizeof (mbstate_t));
337  tmp = mbrtowc (&wc, buf + ind, l - ind, &ps);
338  if (MB_INVALIDCH (tmp) || MB_NULLWCH (tmp))
339    return ((wchar_t) buf[ind]);
340  return wc;
341}
342#endif /* HANDLE_MULTIBYTE */
343
/* Find the byte offset reached by moving COUNT characters forward in
   STRING from byte offset SEED.  If FLAGS is MB_FIND_NONZERO, we look for
   non-zero-width multibyte characters.  Without HANDLE_MULTIBYTE every
   byte is one character, so the result is simply SEED + COUNT. */
#undef _rl_find_next_mbchar
int
_rl_find_next_mbchar (string, seed, count, flags)
     char *string;
     int seed, count, flags;
{
  int next;

#if defined (HANDLE_MULTIBYTE)
  next = _rl_find_next_mbchar_internal (string, seed, count, flags);
#else
  next = seed + count;
#endif

  return next;
}
359
/* Find the byte offset at which the character before byte offset SEED in
   STRING starts.  The returned point will be <= SEED.  If FLAGS is
   MB_FIND_NONZERO, we look for non-zero-width multibyte characters.
   Without HANDLE_MULTIBYTE every byte is one character, so the result is
   SEED - 1, except that a SEED of 0 is returned unchanged. */
#undef _rl_find_prev_mbchar
int
_rl_find_prev_mbchar (string, seed, flags)
     char *string;
     int seed, flags;
{
  int previous;

#if defined (HANDLE_MULTIBYTE)
  previous = _rl_find_prev_mbchar_internal (string, seed, flags);
#else
  if (seed == 0)
    previous = seed;
  else
    previous = seed - 1;
#endif

  return previous;
}
375