1/* $OpenLDAP$ */
2/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
3 *
4 * Copyright 1998-2011 The OpenLDAP Foundation.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted only as authorized by the OpenLDAP
9 * Public License.
10 *
11 * A copy of this license is available in file LICENSE in the
12 * top-level directory of the distribution or, alternatively, at
13 * <http://www.OpenLDAP.org/license.html>.
14 */
15/* Copyright 2001 Computing Research Labs, New Mexico State University
16 *
17 * Permission is hereby granted, free of charge, to any person obtaining a
18 * copy of this software and associated documentation files (the "Software"),
19 * to deal in the Software without restriction, including without limitation
20 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
21 * and/or sell copies of the Software, and to permit persons to whom the
22 * Software is furnished to do so, subject to the following conditions:
23 *
24 * The above copyright notice and this permission notice shall be included in
25 * all copies or substantial portions of the Software.
26 *
27 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
28 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
29 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
30 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
31 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
32 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
33 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
34 */
35/* $Id: ucpgba.c,v 1.5 2001/01/02 18:46:20 mleisher Exp $ */
36
37#include "portable.h"
38
39#include <stdio.h>
40#include <stdlib.h>
41
42#include "ucdata.h"
43#include "ucpgba.h"
44
45/*
46 * These macros are used while reordering of RTL runs of text for the
47 * special case of non-spacing characters being in runs of weakly
48 * directional text.  They check for weak and non-spacing, and digits and
49 * non-spacing.
50 */
51#define ISWEAKSPECIAL(cc)  ucisprop(cc, UC_EN|UC_ES|UC_MN, UC_ET|UC_AN|UC_CS)
52#define ISDIGITSPECIAL(cc) ucisprop(cc, UC_ND|UC_MN, 0)
53
54/*
55 * These macros are used while breaking a string into runs of text in
56 * different directions.  Descriptions:
57 *
58 * ISLTR_LTR - Test for members of an LTR run in an LTR context.  This looks
59 *             for characters with ltr, non-spacing, weak, and neutral
60 *             properties.
61 *
62 * ISRTL_RTL - Test for members of an RTL run in an RTL context.  This looks
63 *             for characters with rtl, non-spacing, weak, and neutral
64 *             properties.
65 *
66 * ISRTL_NEUTRAL  - Test for RTL or neutral characters.
67 *
68 * ISWEAK_NEUTRAL - Test for weak or neutral characters.
69 */
70#define ISLTR_LTR(cc) ucisprop(cc, UC_L|UC_MN|UC_EN|UC_ES,\
71                               UC_ET|UC_CS|UC_B|UC_S|UC_WS|UC_ON)
72
73#define ISRTL_RTL(cc) ucisprop(cc, UC_R|UC_MN|UC_EN|UC_ES,\
74                               UC_ET|UC_AN|UC_CS|UC_B|UC_S|UC_WS|UC_ON)
75
76#define ISRTL_NEUTRAL(cc) ucisprop(cc, UC_R, UC_B|UC_S|UC_WS|UC_ON)
77#define ISWEAK_NEUTRAL(cc) ucisprop(cc, UC_EN|UC_ES, \
78                                    UC_B|UC_S|UC_WS|UC_ON|UC_ET|UC_AN|UC_CS)
79
80/*
81 * This table is temporarily hard-coded here until it can be constructed
82 * automatically somehow.
83 */
84static unsigned long _symmetric_pairs[] = {
85    0x0028, 0x0029, 0x0029, 0x0028, 0x003C, 0x003E, 0x003E, 0x003C,
86    0x005B, 0x005D, 0x005D, 0x005B, 0x007B, 0x007D, 0x007D, 0x007B,
87    0x2045, 0x2046, 0x2046, 0x2045, 0x207D, 0x207E, 0x207E, 0x207D,
88    0x208D, 0x208E, 0x208E, 0x208D, 0x3008, 0x3009, 0x3009, 0x3008,
89    0x300A, 0x300B, 0x300B, 0x300A, 0x300C, 0x300D, 0x300D, 0x300C,
90    0x300E, 0x300F, 0x300F, 0x300E, 0x3010, 0x3011, 0x3011, 0x3010,
91    0x3014, 0x3015, 0x3015, 0x3014, 0x3016, 0x3017, 0x3017, 0x3016,
92    0x3018, 0x3019, 0x3019, 0x3018, 0x301A, 0x301B, 0x301B, 0x301A,
93    0xFD3E, 0xFD3F, 0xFD3F, 0xFD3E, 0xFE59, 0xFE5A, 0xFE5A, 0xFE59,
94    0xFE5B, 0xFE5C, 0xFE5C, 0xFE5B, 0xFE5D, 0xFE5E, 0xFE5E, 0xFE5D,
95    0xFF08, 0xFF09, 0xFF09, 0xFF08, 0xFF3B, 0xFF3D, 0xFF3D, 0xFF3B,
96    0xFF5B, 0xFF5D, 0xFF5D, 0xFF5B, 0xFF62, 0xFF63, 0xFF63, 0xFF62,
97};
98
99static int _symmetric_pairs_size =
100sizeof(_symmetric_pairs)/sizeof(_symmetric_pairs[0]);
101
102/*
103 * This routine looks up the other form of a symmetric pair.
104 */
105static unsigned long
106_ucsymmetric_pair(unsigned long c)
107{
108    int i;
109
110    for (i = 0; i < _symmetric_pairs_size; i += 2) {
111        if (_symmetric_pairs[i] == c)
112          return _symmetric_pairs[i+1];
113    }
114    return c;
115}
116
117/*
118 * This routine creates a new run, copies the text into it, links it into the
119 * logical text order chain and returns it to the caller to be linked into
120 * the visual text order chain.
121 */
122static ucrun_t *
123_add_run(ucstring_t *str, unsigned long *src,
124         unsigned long start, unsigned long end, int direction)
125{
126    long i, t;
127    ucrun_t *run;
128
129    run = (ucrun_t *) malloc(sizeof(ucrun_t));
130    run->visual_next = run->visual_prev = 0;
131    run->direction = direction;
132
133    run->cursor = ~0;
134
135    run->chars = (unsigned long *)
136        malloc(sizeof(unsigned long) * ((end - start) << 1));
137    run->positions = run->chars + (end - start);
138
139    run->source = src;
140    run->start = start;
141    run->end = end;
142
143    if (direction == UCPGBA_RTL) {
144        /*
145         * Copy the source text into the run in reverse order and select
146         * replacements for the pairwise punctuation and the <> characters.
147         */
148        for (i = 0, t = end - 1; start < end; start++, t--, i++) {
149            run->positions[i] = t;
150            if (ucissymmetric(src[t]) || src[t] == '<' || src[t] == '>')
151              run->chars[i] = _ucsymmetric_pair(src[t]);
152            else
153              run->chars[i] = src[t];
154        }
155    } else {
156        /*
157         * Copy the source text into the run directly.
158         */
159        for (i = start; i < end; i++) {
160            run->positions[i - start] = i;
161            run->chars[i - start] = src[i];
162        }
163    }
164
165    /*
166     * Add the run to the logical list for cursor traversal.
167     */
168    if (str->logical_first == 0)
169      str->logical_first = str->logical_last = run;
170    else {
171        run->logical_prev = str->logical_last;
172        str->logical_last->logical_next = run;
173        str->logical_last = run;
174    }
175
176    return run;
177}
178
179static void
180_ucadd_rtl_segment(ucstring_t *str, unsigned long *source, unsigned long start,
181                   unsigned long end)
182{
183    unsigned long s, e;
184    ucrun_t *run, *lrun;
185
186    /*
187     * This is used to splice runs into strings with overall LTR direction.
188     * The `lrun' variable will never be NULL because at least one LTR run was
189     * added before this RTL run.
190     */
191    lrun = str->visual_last;
192
193    for (e = s = start; s < end;) {
194        for (; e < end && ISRTL_NEUTRAL(source[e]); e++) ;
195
196        if (e > s) {
197            run = _add_run(str, source, s, e, UCPGBA_RTL);
198
199            /*
200             * Add the run to the visual list for cursor traversal.
201             */
202            if (str->visual_first != 0) {
203                if (str->direction == UCPGBA_LTR) {
204                    run->visual_prev = lrun;
205                    run->visual_next = lrun->visual_next;
206                    if (lrun->visual_next != 0)
207                      lrun->visual_next->visual_prev = run;
208                    lrun->visual_next = run;
209                    if (lrun == str->visual_last)
210                      str->visual_last = run;
211                } else {
212                    run->visual_next = str->visual_first;
213                    str->visual_first->visual_prev = run;
214                    str->visual_first = run;
215                }
216            } else
217              str->visual_first = str->visual_last = run;
218        }
219
220        /*
221         * Handle digits in a special way.  This makes sure the weakly
222         * directional characters appear on the expected sides of a number
223         * depending on whether that number is Arabic or not.
224         */
225        for (s = e; e < end && ISWEAKSPECIAL(source[e]); e++) {
226            if (!ISDIGITSPECIAL(source[e]) &&
227                (e + 1 == end || !ISDIGITSPECIAL(source[e + 1])))
228              break;
229        }
230
231        if (e > s) {
232            run = _add_run(str, source, s, e, UCPGBA_LTR);
233
234            /*
235             * Add the run to the visual list for cursor traversal.
236             */
237            if (str->visual_first != 0) {
238                if (str->direction == UCPGBA_LTR) {
239                    run->visual_prev = lrun;
240                    run->visual_next = lrun->visual_next;
241                    if (lrun->visual_next != 0)
242                      lrun->visual_next->visual_prev = run;
243                    lrun->visual_next = run;
244                    if (lrun == str->visual_last)
245                      str->visual_last = run;
246                } else {
247                    run->visual_next = str->visual_first;
248                    str->visual_first->visual_prev = run;
249                    str->visual_first = run;
250                }
251            } else
252              str->visual_first = str->visual_last = run;
253        }
254
255        /*
256         * Collect all weak non-digit sequences for an RTL segment.  These
257         * will appear as part of the next RTL segment or will be added as
258         * an RTL segment by themselves.
259         */
260        for (s = e; e < end && ucisweak(source[e]) && !ucisdigit(source[e]);
261             e++) ;
262    }
263
264    /*
265     * Capture any weak non-digit sequences that occur at the end of the RTL
266     * run.
267     */
268    if (e > s) {
269        run = _add_run(str, source, s, e, UCPGBA_RTL);
270
271        /*
272         * Add the run to the visual list for cursor traversal.
273         */
274        if (str->visual_first != 0) {
275            if (str->direction == UCPGBA_LTR) {
276                run->visual_prev = lrun;
277                run->visual_next = lrun->visual_next;
278                if (lrun->visual_next != 0)
279                  lrun->visual_next->visual_prev = run;
280                lrun->visual_next = run;
281                if (lrun == str->visual_last)
282                  str->visual_last = run;
283            } else {
284                run->visual_next = str->visual_first;
285                str->visual_first->visual_prev = run;
286                str->visual_first = run;
287            }
288        } else
289          str->visual_first = str->visual_last = run;
290    }
291}
292
293static void
294_ucadd_ltr_segment(ucstring_t *str, unsigned long *source, unsigned long start,
295                   unsigned long end)
296{
297    ucrun_t *run;
298
299    run = _add_run(str, source, start, end, UCPGBA_LTR);
300
301    /*
302     * Add the run to the visual list for cursor traversal.
303     */
304    if (str->visual_first != 0) {
305        if (str->direction == UCPGBA_LTR) {
306            run->visual_prev = str->visual_last;
307            str->visual_last->visual_next = run;
308            str->visual_last = run;
309        } else {
310            run->visual_next = str->visual_first;
311            str->visual_first->visual_prev = run;
312            str->visual_first = run;
313        }
314    } else
315      str->visual_first = str->visual_last = run;
316}
317
318ucstring_t *
319ucstring_create(unsigned long *source, unsigned long start, unsigned long end,
320                int default_direction, int cursor_motion)
321{
322    int rtl_first;
323    unsigned long s, e, ld;
324    ucstring_t *str;
325
326    str = (ucstring_t *) malloc(sizeof(ucstring_t));
327
328    /*
329     * Set the initial values.
330     */
331    str->cursor_motion = cursor_motion;
332    str->logical_first = str->logical_last = 0;
333    str->visual_first = str->visual_last = str->cursor = 0;
334    str->source = source;
335    str->start = start;
336    str->end = end;
337
338    /*
339     * If the length of the string is 0, then just return it at this point.
340     */
341    if (start == end)
342      return str;
343
344    /*
345     * This flag indicates whether the collection loop for RTL is called
346     * before the LTR loop the first time.
347     */
348    rtl_first = 0;
349
350    /*
351     * Look for the first character in the string that has strong
352     * directionality.
353     */
354    for (s = start; s < end && !ucisstrong(source[s]); s++) ;
355
356    if (s == end)
357      /*
358       * If the string contains no characters with strong directionality, use
359       * the default direction.
360       */
361      str->direction = default_direction;
362    else
363      str->direction = ucisrtl(source[s]) ? UCPGBA_RTL : UCPGBA_LTR;
364
365    if (str->direction == UCPGBA_RTL)
366      /*
367       * Set the flag that causes the RTL collection loop to run first.
368       */
369      rtl_first = 1;
370
371    /*
372     * This loop now separates the string into runs based on directionality.
373     */
374    for (s = e = 0; s < end; s = e) {
375        if (!rtl_first) {
376            /*
377             * Determine the next run of LTR text.
378             */
379
380            ld = s;
381            while (e < end && ISLTR_LTR(source[e])) {
382                if (ucisdigit(source[e]) &&
383                    !(0x660 <= source[e] && source[e] <= 0x669))
384                  ld = e;
385                e++;
386            }
387            if (str->direction != UCPGBA_LTR) {
388                while (e > ld && ISWEAK_NEUTRAL(source[e - 1]))
389                  e--;
390            }
391
392            /*
393             * Add the LTR segment to the string.
394             */
395            if (e > s)
396              _ucadd_ltr_segment(str, source, s, e);
397        }
398
399        /*
400         * Determine the next run of RTL text.
401         */
402        ld = s = e;
403        while (e < end && ISRTL_RTL(source[e])) {
404            if (ucisdigit(source[e]) &&
405                !(0x660 <= source[e] && source[e] <= 0x669))
406              ld = e;
407            e++;
408        }
409        if (str->direction != UCPGBA_RTL) {
410            while (e > ld && ISWEAK_NEUTRAL(source[e - 1]))
411              e--;
412        }
413
414        /*
415         * Add the RTL segment to the string.
416         */
417        if (e > s)
418          _ucadd_rtl_segment(str, source, s, e);
419
420        /*
421         * Clear the flag that allowed the RTL collection loop to run first
422         * for strings with overall RTL directionality.
423         */
424        rtl_first = 0;
425    }
426
427    /*
428     * Set up the initial cursor run.
429     */
430    str->cursor = str->logical_first;
431    if (str != 0)
432      str->cursor->cursor = (str->cursor->direction == UCPGBA_RTL) ?
433          str->cursor->end - str->cursor->start : 0;
434
435    return str;
436}
437
438void
439ucstring_free(ucstring_t *s)
440{
441    ucrun_t *l, *r;
442
443    if (s == 0)
444      return;
445
446    for (l = 0, r = s->visual_first; r != 0; r = r->visual_next) {
447        if (r->end > r->start)
448          free((char *) r->chars);
449        if (l)
450          free((char *) l);
451        l = r;
452    }
453    if (l)
454      free((char *) l);
455
456    free((char *) s);
457}
458
459int
460ucstring_set_cursor_motion(ucstring_t *str, int cursor_motion)
461{
462    int n;
463
464    if (str == 0)
465      return -1;
466
467    n = str->cursor_motion;
468    str->cursor_motion = cursor_motion;
469    return n;
470}
471
472static int
473_ucstring_visual_cursor_right(ucstring_t *str, int count)
474{
475    int cnt = count;
476    unsigned long size;
477    ucrun_t *cursor;
478
479    if (str == 0)
480      return 0;
481
482    cursor = str->cursor;
483    while (cnt > 0) {
484        size = cursor->end - cursor->start;
485        if ((cursor->direction == UCPGBA_RTL && cursor->cursor + 1 == size) ||
486            cursor->cursor + 1 > size) {
487            /*
488             * If the next run is NULL, then the cursor is already on the
489             * far right end already.
490             */
491            if (cursor->visual_next == 0)
492              /*
493               * If movement occured, then report it.
494               */
495              return (cnt != count);
496
497            /*
498             * Move to the next run.
499             */
500            str->cursor = cursor = cursor->visual_next;
501            cursor->cursor = (cursor->direction == UCPGBA_RTL) ? -1 : 0;
502            size = cursor->end - cursor->start;
503        } else
504          cursor->cursor++;
505        cnt--;
506    }
507    return 1;
508}
509
510static int
511_ucstring_logical_cursor_right(ucstring_t *str, int count)
512{
513    int cnt = count;
514    unsigned long size;
515    ucrun_t *cursor;
516
517    if (str == 0)
518      return 0;
519
520    cursor = str->cursor;
521    while (cnt > 0) {
522        size = cursor->end - cursor->start;
523        if (str->direction == UCPGBA_RTL) {
524            if (cursor->direction == UCPGBA_RTL) {
525                if (cursor->cursor + 1 == size) {
526                    if (cursor == str->logical_first)
527                      /*
528                       * Already at the beginning of the string.
529                       */
530                      return (cnt != count);
531
532                    str->cursor = cursor = cursor->logical_prev;
533                    size = cursor->end - cursor->start;
534                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
535                        size : 0;
536                } else
537                  cursor->cursor++;
538            } else {
539                if (cursor->cursor == 0) {
540                    if (cursor == str->logical_first)
541                      /*
542                       * At the beginning of the string already.
543                       */
544                      return (cnt != count);
545
546                    str->cursor = cursor = cursor->logical_prev;
547                    size = cursor->end - cursor->start;
548                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
549                        size : 0;
550                } else
551                  cursor->cursor--;
552            }
553        } else {
554            if (cursor->direction == UCPGBA_RTL) {
555                if (cursor->cursor == 0) {
556                    if (cursor == str->logical_last)
557                      /*
558                       * Already at the end of the string.
559                       */
560                      return (cnt != count);
561
562                    str->cursor = cursor = cursor->logical_next;
563                    size = cursor->end - cursor->start;
564                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
565                        0 : size - 1;
566                } else
567                  cursor->cursor--;
568            } else {
569                if (cursor->cursor + 1 > size) {
570                    if (cursor == str->logical_last)
571                      /*
572                       * Already at the end of the string.
573                       */
574                      return (cnt != count);
575
576                    str->cursor = cursor = cursor->logical_next;
577                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
578                        0 : size - 1;
579                } else
580                  cursor->cursor++;
581            }
582        }
583        cnt--;
584    }
585    return 1;
586}
587
588int
589ucstring_cursor_right(ucstring_t *str, int count)
590{
591    if (str == 0)
592      return 0;
593    return (str->cursor_motion == UCPGBA_CURSOR_VISUAL) ?
594        _ucstring_visual_cursor_right(str, count) :
595        _ucstring_logical_cursor_right(str, count);
596}
597
598static int
599_ucstring_visual_cursor_left(ucstring_t *str, int count)
600{
601    int cnt = count;
602    unsigned long size;
603    ucrun_t *cursor;
604
605    if (str == 0)
606      return 0;
607
608    cursor = str->cursor;
609    while (cnt > 0) {
610        size = cursor->end - cursor->start;
611        if ((cursor->direction == UCPGBA_LTR && cursor->cursor == 0) ||
612            cursor->cursor - 1 < -1) {
613            /*
614             * If the preceding run is NULL, then the cursor is already on the
615             * far left end already.
616             */
617            if (cursor->visual_prev == 0)
618              /*
619               * If movement occured, then report it.
620               */
621              return (cnt != count);
622
623            /*
624             * Move to the previous run.
625             */
626            str->cursor = cursor = cursor->visual_prev;
627            size = cursor->end - cursor->start;
628            cursor->cursor = (cursor->direction == UCPGBA_RTL) ?
629                size : size - 1;
630        } else
631          cursor->cursor--;
632        cnt--;
633    }
634    return 1;
635}
636
637static int
638_ucstring_logical_cursor_left(ucstring_t *str, int count)
639{
640    int cnt = count;
641    unsigned long size;
642    ucrun_t *cursor;
643
644    if (str == 0)
645      return 0;
646
647    cursor = str->cursor;
648    while (cnt > 0) {
649        size = cursor->end - cursor->start;
650        if (str->direction == UCPGBA_RTL) {
651            if (cursor->direction == UCPGBA_RTL) {
652                if (cursor->cursor == -1) {
653                    if (cursor == str->logical_last)
654                      /*
655                       * Already at the end of the string.
656                       */
657                      return (cnt != count);
658
659                    str->cursor = cursor = cursor->logical_next;
660                    size = cursor->end - cursor->start;
661                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
662                        0 : size - 1;
663                } else
664                  cursor->cursor--;
665            } else {
666                if (cursor->cursor + 1 > size) {
667                    if (cursor == str->logical_last)
668                      /*
669                       * At the end of the string already.
670                       */
671                      return (cnt != count);
672
673                    str->cursor = cursor = cursor->logical_next;
674                    size = cursor->end - cursor->start;
675                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
676                        0 : size - 1;
677                } else
678                  cursor->cursor++;
679            }
680        } else {
681            if (cursor->direction == UCPGBA_RTL) {
682                if (cursor->cursor + 1 == size) {
683                    if (cursor == str->logical_first)
684                      /*
685                       * Already at the beginning of the string.
686                       */
687                      return (cnt != count);
688
689                    str->cursor = cursor = cursor->logical_prev;
690                    size = cursor->end - cursor->start;
691                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
692                        size : 0;
693                } else
694                  cursor->cursor++;
695            } else {
696                if (cursor->cursor == 0) {
697                    if (cursor == str->logical_first)
698                      /*
699                       * Already at the beginning of the string.
700                       */
701                      return (cnt != count);
702
703                    str->cursor = cursor = cursor->logical_prev;
704                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
705                        size : 0;
706                } else
707                  cursor->cursor--;
708            }
709        }
710        cnt--;
711    }
712    return 1;
713}
714
715int
716ucstring_cursor_left(ucstring_t *str, int count)
717{
718    if (str == 0)
719      return 0;
720    return (str->cursor_motion == UCPGBA_CURSOR_VISUAL) ?
721        _ucstring_visual_cursor_left(str, count) :
722        _ucstring_logical_cursor_left(str, count);
723}
724
725void
726ucstring_cursor_info(ucstring_t *str, int *direction, unsigned long *position)
727{
728    long c;
729    unsigned long size;
730    ucrun_t *cursor;
731
732    if (str == 0 || direction == 0 || position == 0)
733      return;
734
735    cursor = str->cursor;
736
737    *direction = cursor->direction;
738
739    c = cursor->cursor;
740    size = cursor->end - cursor->start;
741
742    if (c == size)
743      *position = (cursor->direction == UCPGBA_RTL) ?
744          cursor->start : cursor->positions[c - 1];
745    else if (c == -1)
746      *position = (cursor->direction == UCPGBA_RTL) ?
747          cursor->end : cursor->start;
748    else
749      *position = cursor->positions[c];
750}
751