1(*
2    Title:      Standard Basis Library: String Structure
3    Copyright   David Matthews 1999, 2005, 2016, 2018
4
5    This library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License version 2.1 as published by the Free Software Foundation.
8    
9    This library is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12    Lesser General Public License for more details.
13    
14    You should have received a copy of the GNU Lesser General Public
15    License along with this library; if not, write to the Free Software
16    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17*)
18
19(*
20    This file declares Char, String and CharVector.  String and CharVector
21    are simply different views on the same underlying structure.
22*)
23(* The overloads for char and string for the relational operators have
24   already been set up in the prelude.  *)
25  
26local
27    open LibrarySupport
28
29    (* Redefine these as functions on the abstract type. *)
30    val System_move_bytesA:
31        address*address*word*word*word->unit = RunCall.moveBytes
32
33    val wordSize : word = LibrarySupport.wordSize
34
35    local
36        fun singleCharString(c: word): string =
37        let
38            val v = allocString 0w1
39            val () = RunCall.storeByte(v, wordSize, c)
40            val () = RunCall.clearMutableBit v
41        in
42            v
43        end
44        (* We haven't defined Vector at this stage. *)
45        val charMap = RunCall.allocateWordMemory(0w256, 0wx40, 0w0)
46        val intAsWord: int -> word = RunCall.unsafeCast
47        fun setEntries i =
48            if i < 256
49            then (RunCall.storeWord(charMap, intAsWord i, singleCharString(intAsWord i)); setEntries(i+1))
50            else ();
51        val () = setEntries 0
52        val () = RunCall.clearMutableBit charMap
53    in
54        (* Since we've covered the full range from 0 to 255 we don't need a bounds check. *)
55        fun charAsString (ch: char): string = RunCall.loadWord(charMap, RunCall.unsafeCast ch)
56    end
57
58    val bcopy: string*string*word*word*word -> unit = RunCall.moveBytes
59
60    (* This can be used where we have already checked the range. *)
61    fun unsafeStringSub(s: string, i: word): char =
62        RunCall.loadByteFromImmutable(s, i + wordSize)
63
64    fun unsafeSubstring(s: string, i: word, l: word) : string =
65    let
66        val baseLen = sizeAsWord s (* Length of base string. *)
67    in
68        if i = 0w0 andalso l = baseLen then s
69        else if l = 0w0 then "" (* Empty string. *)
70        else if l = 0w1
71        (* Single character string - use pre-built strings. *)
72        then charAsString(unsafeStringSub(s, i))
73        else
74        let
75            (* Multiple character string. *)
76            val vec = allocString l
77        in
78            RunCall.moveBytes(s, vec, wordSize+i, wordSize, l);
79            RunCall.clearMutableBit vec;
80            vec
81        end
82    end
83
84    (* Casts between int and word. *)
85    val intAsWord: int -> word = RunCall.unsafeCast
86    and wordAsInt: word -> int = RunCall.unsafeCast
87
88    (* String concatenation. *) 
89    fun op ^ (a: string, b: string): string =
90        let
91            val a_length = sizeAsWord a
92            and b_length = sizeAsWord b
93        in
94            (* Handle the special cases where one of the strings is
95               empty.  As well as saving on duplicating storage it
96               also means we don't have to consider the special
97               case when the result string is a single character. *)
98            if a_length = 0w0 then b
99            else if b_length = 0w0 then a
100            else (* Normal case *)
101            let
102                val vec = LibrarySupport.allocString(a_length + b_length)
103            in
104                bcopy(a, vec, wordSize, wordSize, a_length);
105                bcopy(b, vec, wordSize, wordSize+a_length, b_length);
106                RunCall.clearMutableBit vec;
107                vec
108            end
109        end (* op ^ *)
110
111    (* String comparison function used in isPrefix and isSuffix.
112       N.B.  The caller must make sure that neither string is a single character. *)
113    local
114        val byteVecEq: string * string * word * word * word -> bool = RunCall.byteVectorEqual
115    in
116        fun byteMatch s1 s2 i j l =
117            byteVecEq(s1, s2, i+wordSize, j+wordSize, l)
118    end
119
120    (* We use stringExplode in String and Substring. *)
121    fun stringExplode (s: string, i: word, l: word) : char list =
122    let 
123        fun exp_str (num, res) =
124            if num = 0w0
125            then res
126            else exp_str (num - 0w1, RunCall.loadByteFromImmutable(s, num+i-0w1+wordSize) :: res)
127    in
128        exp_str (l, [])
129    end
130
131    (* There's an irritating dependency here. Char uses StringCvt.reader
132       which means that StringCvt depends on Char so String depends on
133       StringCvt.  That means we can't define StringCvt in terms of String
134       which would be easiest. *)
135    structure Char =
136    struct
137        type char = char and string = string
138        val maxOrd = 255 (* Range from 0 to 255 *)
139        
140        (* Single characters are represented by the number so we only need
141           to check the argument and then convert it. *) 
142        fun chr i : char =
143            if i < 0 orelse i > maxOrd
144            then raise General.Chr else RunCall.unsafeCast i
145            
146        val ord: char -> int = RunCall.unsafeCast
147    
148        val minChar = chr 0 and maxChar = chr maxOrd
149        
150        fun succ c = if ord c = maxOrd then raise Chr else chr(ord c + 1)
151        and pred c = if ord c = 0 then raise Chr else chr(ord c - 1)
152
153        fun isUpper c = #"A" <= c andalso c <= #"Z" 
154        fun isLower c = #"a" <= c andalso c <= #"z" 
155        fun isDigit c = #"0" <= c andalso c <= #"9" 
156        fun isAlpha c = isUpper c orelse isLower c 
157        fun isAlphaNum c = isAlpha c orelse isDigit c 
158        fun isHexDigit c = 
159            isDigit c orelse (#"a" <= c andalso c <= #"f")
160                 orelse (#"A" <= c andalso c <= #"F") 
161        fun isGraph c = #"!" <= c andalso c <= #"~" 
162        fun isPrint c = isGraph c orelse c = #" " 
163        fun isPunct c = isGraph c andalso not (isAlphaNum c)
164        (* NOTE: The web page includes 0 <= ord c but all chars satisfy that. *)
165        fun isAscii c = c <= chr 127 
166        (* NOTE: The web page defines isCtrl not isCntrl *)
167        fun isCntrl c = isAscii c andalso not (isPrint c) 
168        (* NOTE: There's a mistake in the web page.  It says c <= #"\ " *)
169        fun isSpace c = (#"\t" <= c andalso c <= #"\r") orelse c = #" "
170        fun toLower c = if isUpper c then chr (ord c + 32) else c 
171        fun toUpper c = if isLower c then chr (ord c - 32) else c 
172
173        (* TODO: More efficient versions.
174           Probably best to use comparison for short strings and table
175           look-up for longer ones.  *)
176        fun contains s =
177            let
178            fun match 0w0 _ = false
179              | match i c = unsafeStringSub(s, i-0w1) = c orelse match (i-0w1) c
180            in
181            match (sizeAsWord s)
182            end
183            
184        fun notContains s c = not (contains s c)
185    end; (* structure Char *)
186
187    structure String =
188    (* This structure is the basis of both String and CharVector. *)
189    struct
190        type string = string
191        and vector = string
192        and elem = char
193        and char = char
194
195        (* We don't have Word.toInt yet so we have to use casts in these next two. *)
196        val size : string -> int = RunCall.unsafeCast o LibrarySupport.sizeAsWord
197        val maxSize: int = RunCall.unsafeCast LibrarySupport.maxString
198   
199        val str: char ->string = charAsString
200
201        (* Concatentate a list of strings. *)
202        fun concat [] = ""
203         |  concat [s] = s (* Handle special case to reduce copying. *)
204            (* Could also handle the case of concat(""::s) = concat s *)
205         |  concat L =
206            let
207                fun total n []     = n
208                 | total n (H::T) = total (n + size H) T
209                (* How many characters do we have to implode?   This could
210                   possibly be long (although we would probably have run out
211                   of memory long before) so we have to add these as integers
212                   and then raise an exception if it's not short. *)
213                val chars : int = total 0 L;
214            in
215                if chars = 0
216                then ""
217                else (* Normal case *)
218                let
219                    val chs = unsignedShortOrRaiseSize chars (* Check it's short. *)
220                    val vec = LibrarySupport.allocString chs
221                  
222                    fun copy (_, []:string list) = ()
223                     | copy (i, H :: T) =
224                        let
225                            val src_len = sizeAsWord H
226                        in
227                            bcopy(H, vec, wordSize, i, src_len);
228                            copy(i+src_len, T)
229                        end
230                in
231                copy (wordSize, L);
232                RunCall.clearMutableBit vec;
233                vec
234                end
235            end (* concat *)
236
237        fun concatWith _ [] = ""
238         |  concatWith _ [one] = one
239         |  concatWith s (hd :: tl) =
240            let
241                fun mk [] = []
242                  | mk (h::t) = s :: h :: mk t
243            in
244                concat(hd :: mk tl)
245            end
246        
247        (* implode is very similar to concat, in fact it could be defined
248           as a cast version of it. *)
249        fun implode [] : string = ""
250        |   implode (L as (H::_)) =
251            let
252                (* How many characters do we have to implode? *)
253                val listLength = length L
254                (* In practice we could never make a list with a
255                   combined length which was a long integer but
256                   we still check it here in unsignedShortOrRaiseSize. *)
257                val chars: word = unsignedShortOrRaiseSize listLength
258            in
259                if chars = 0w1 then str H
260                else
261                let
262                    val dest = LibrarySupport.allocString chars
263
264                    fun copy (_, []:char list) = ()
265                      | copy (i, H :: T) =
266                        (
267                        RunCall.storeByte (dest, i, H);
268                        copy (i + 0w1, T)
269                        )
270                in
271                    copy (wordSize, L);
272                    RunCall.clearMutableBit dest; (* reset mutable flag *)
273                    dest
274                end
275            end
276
277        (* This was previously built-in because of the way it worked in
278           the Poly language.  It could be defined as concat[a,b] but we
279           define it separately for efficiency. *)
280        val op ^ : string * string -> string = op ^
281
282        fun sub (s: string, i: int): char =
283            if i < 0 orelse i >= size s
284            then raise General.Subscript
285            else RunCall.loadByteFromImmutable(s, intAsWord i + wordSize);
286    
287        (* Explode a string into a list of characters. *)
288        fun explode (s : string) : char list = stringExplode(s, 0w0, sizeAsWord s)
289
290        (* TODO: Could be defined more efficiently, perhaps by copying
291           it into an array. *)
292        (* This would be easier if we could process the string twice as we
293           do with toString but we need to be careful to call f only once
294           for each character in case it has a side-effect. *)
295        fun translate f s =
296        let
297            val len = sizeAsWord s
298        in
299            let
300                (* Accumulate the characters into a list. *)
301                fun mapChars i l =
302                    if i = len then l
303                    else mapChars (i+0w1) (f(RunCall.loadByteFromImmutable(s, i+wordSize)) :: l)
304                
305                (* Reverse has not yet been defined. *)
306                fun revAppend([], a) = a
307                |   revAppend(x::y, a) = revAppend(y, x::a)
308            in
309                (* Reverse the list and concatenate it. *)
310                concat(revAppend(mapChars 0w0 [], []))
311            end
312        end
313        
314        fun substring (s, i, j) =
315        let
316            val len = sizeAsWord s
317            (* Check that the index and length are both non-negative. *)
318            val i' = unsignedShortOrRaiseSubscript i
319            and j' = unsignedShortOrRaiseSubscript j
320        in
321            if i'+j' > len
322            then raise Subscript
323            else unsafeSubstring(s, i', j')
324        end
325    
326        fun extract (s, i, NONE) = substring (s, i, size s - i)
327         |  extract (s, i, SOME j) = substring (s, i, j)
328    
329        (* tokens and fields are very similar except that tokens does not return
330           empty strings for adjacent delimiters whereas fields does.  *)
331        fun tokens p s =
332            let
333            val length = size s
334            fun tok' i l = (* i is the character to examine.  l is the start of a token *)
335                if i = length
336                then (* Finished the input.  Return any partially completed string. *)
337                    (
338                    if l = i then [] else [substring (s, l, i-l)]
339                    )
340                else if p (sub(s, i)) (* TODO: We don't need sub to do the range check here *)
341                then (* It's a delimiter.  If we have more than one character in the
342                        string we create a string otherwise we just continue. *)
343                    (
344                    if l = i then tok' (i+1) (i+1)
345                    else substring (s, l, i-l) :: tok' (i+1) (i+1)
346                    )
347                else (* Token: Keep accumulating characters. *) tok' (i+1) l
348            in
349            tok' 0 0
350            end
351    
352        fun fields p s =
353            let
354            val length = size s
355            
356            fun field' i l = (* i is the character to examine.  l is the start of a token *)
357                if i = length
358                then (* Finished the input.  Return any partially completed string. *)
359                    [substring (s, l, i-l)]
360                else if p (unsafeStringSub(s, intAsWord i))
361                then (* It's a delimiter.  Finish the partially completed string and
362                        start another. *)
363                    substring (s, l, i-l) :: field' (i+1) (i+1)
364                else (* Field: Keep accumulating characters. *) field' (i+1) l
365            in
366            field' 0 0
367            end
368        
369        (* True if s1 is a prefix of s2 *)
370        (* G&R now says that a string is a prefix of itself.  *)
371        fun isPrefix s1 s2 =
372        let
373            val size_s1 = size s1 and size_s2 = size s2
374        in
375            if size_s1 <= size_s2
376            then byteMatch s1 s2 0w0 0w0 (intAsWord size_s1)
377            else false
378        end
379
380        (* True if s1 is a suffix of s2 *)
381        fun isSuffix s1 s2 =
382        let
383            val size_s1 = size s1 and size_s2 = size s2
384        in
385            if size_s1 <= size_s2
386            then byteMatch s1 s2 0w0 (intAsWord (size_s2 - size_s1)) (intAsWord size_s1)
387            else false
388        end
389
390        (* True if s1 is a substring of s2 *)
391        fun isSubstring s1 s2 =
392        let
393            val size_s1 = size s1 and size_s2 = size s2
394            (* Start at the beginning and compare until we get a match. *)
395            fun doMatch i s =
396            if s < size_s1 then false (* The remainder of the string is too small to match. *)
397            else if byteMatch s1 s2 0w0 i (intAsWord size_s1)
398            then true
399            else doMatch (i+0w1) (s-1)
400        in
401            doMatch 0w0 size_s2
402        end
403        
404        
405        (* Functions specific to CharVector, apart from map which is common. *)
406        fun tabulate (0, _) : vector = "" (* Must not try to lock it. *)
407         |  tabulate (1, f) : vector = charAsString(f 0)
408         |  tabulate (length: int , f : int->elem): vector =
409        let
410            val len = unsignedShortOrRaiseSize length (* Raises Size if length < 0 *)
411            val vec = LibrarySupport.allocString len
412            (* Initialise it to the function values. *)
413            fun init i = 
414                if len <= i then ()
415                else (RunCall.storeByte(vec, i+wordSize, f(wordAsInt i)); init(i+0w1))
416        in
417            init 0w0;
418            RunCall.clearMutableBit vec;
419            vec
420        end
421
422        (* Create the other functions. *)
423        structure VectorOps =
424            VectorOperations(
425                struct
426                    type vector = vector and elem = elem
427                    val length = sizeAsWord
428                    fun unsafeSub(s, i) = RunCall.loadByteFromImmutable(s, i + wordSize);
429                    fun unsafeSet(_, _, _) = raise Fail "Should not be called"
430                end);
431    
432        open VectorOps;
433
434        fun map f vec =
435        let
436            val len = sizeAsWord vec
437        in
438            if len = 0w0 then ""
439            else (* len > 1 *)
440            let
441                (* Allocate a new vector. *)
442                val new_vec = LibrarySupport.allocString len
443                val byte_limit = len + wordSize
444                    
445                fun domap i =
446                    if i >= byte_limit then ()
447                    else (RunCall.storeByte(new_vec, i, f(RunCall.loadByteFromImmutable(vec, i))); domap(i+0w1))
448            in
449                domap wordSize;
450                RunCall.clearMutableBit new_vec;
451                new_vec
452            end
453        end
454            
455        local
456            (* String comparison. *)
457            fun compareString(s1, s2) =
458            let
459                val s1l = sizeAsWord s1 and s2l = sizeAsWord s2
460                val test = RunCall.byteVectorCompare(s1, s2, wordSize, wordSize, if s1l < s2l then s1l else s2l)
461            in
462                if test = 0 (* If the strings are the same up to the shorter length ... *)
463                then RunCall.unsafeCast(s1l - s2l) (* The result depends on the lengths. *)
464                else test
465            end
466        in
467            fun compare (s1, s2) =
468            let
469                val c = compareString(s1, s2)
470            in
471                if c = 0
472                then General.EQUAL
473                else if c > 0
474                then General.GREATER
475                else General.LESS
476            end
477
478            (* String relational operators.  They could all be defined in terms of "compare" but this
479               generates better code. *)
480            val op >= =
481            fn (s1: string, s2: string) =>
482                let
483                    val s1l = sizeAsWord s1 and s2l = sizeAsWord s2
484                    val test = RunCall.byteVectorCompare(s1, s2, wordSize, wordSize, if s1l < s2l then s1l else s2l)
485                in
486                    if test = 0
487                    then s1l >= s2l
488                    else test >= 0
489                end
490
491            and op <= =
492            fn (s1: string, s2: string) =>
493                let
494                    val s1l = sizeAsWord s1 and s2l = sizeAsWord s2
495                    val test = RunCall.byteVectorCompare(s1, s2, wordSize, wordSize, if s1l < s2l then s1l else s2l)
496                in
497                    if test = 0
498                    then s1l <= s2l
499                    else test <= 0
500                end
501
502            and op > =
503            fn (s1: string, s2: string) =>
504                let
505                    val s1l = sizeAsWord s1 and s2l = sizeAsWord s2
506                    val test = RunCall.byteVectorCompare(s1, s2, wordSize, wordSize, if s1l < s2l then s1l else s2l)
507                in
508                    if test = 0
509                    then s1l > s2l
510                    else test > 0
511                end
512
513            and op < =
514            fn (s1: string, s2: string) =>
515                let
516                    val s1l = sizeAsWord s1 and s2l = sizeAsWord s2
517                    val test = RunCall.byteVectorCompare(s1, s2, wordSize, wordSize, if s1l < s2l then s1l else s2l)
518                in
519                    if test = 0
520                    then s1l < s2l
521                    else test < 0
522                end
523         end
524
525                   
526    end (* String *)
527
528
529    structure StringCvt =
530    struct
531        val mem_move: string*string*word*word*word -> unit = RunCall.moveBytes
532
533        datatype radix = BIN | OCT | DEC | HEX
534
535        datatype realfmt
536          = SCI of int option
537          | FIX of int option
538          | GEN of int option
539          | EXACT
540  
541        type  ('a, 'b) reader = 'b -> ('a * 'b) option
542
543        fun padLeft c i s =
544        if i <= 0 (* unsignedShortOrRaiseSize raises Size if i < 0 which isn't right here. *)
545        then s
546        else
547        let
548            val len: word = sizeAsWord s
549            val iW = unsignedShortOrRaiseSize i (* checks that i is a short. *)
550        in
551            if len >= iW then s
552            else 
553            let
554                val extra = iW - len
555                val str = LibrarySupport.allocString iW
556                fun setCh n =
557                    if n = extra then ()
558                    (* Set the character part of the string. *)
559                    else ( RunCall.storeByte(str, n+wordSize, c); setCh(n+0w1) )
560            in
561                setCh 0w0;
562                (* Copy the character part of the string over. *)
563                mem_move(s, str, wordSize, extra + wordSize, len);
564                RunCall.clearMutableBit str;
565                str
566            end
567        end
568
569        fun padRight c i s =
570        if i <= 0 (* unsignedShortOrRaiseSize raises Size if i < 0 which isn't right here. *)
571        then s
572        else
573        let
574            val len = sizeAsWord s
575            val iW = unsignedShortOrRaiseSize i (* checks that i is a short. *)
576        in
577            if len >= iW then s
578            else 
579            let
580                val str = LibrarySupport.allocString iW
581                fun setCh n =
582                    if n = iW then ()
583                    (* Set the character part of the string. *)
584                    else ( RunCall.storeByte(str, n+wordSize, c); setCh(n+0w1) )
585            in
586                (* Copy the character part of the string over. *)
587                mem_move(s, str, wordSize, wordSize, len);
588                setCh len;
589                RunCall.clearMutableBit str;
590                str
591            end
592        end
593
594        (* p is described as a predicate.  That implies that it is
595           side-effect free.  If it is we could use it e.g. twice, once to work out
596           the length of the string and then to create the string itself. 
597           Assume that it may have side-effects and that we can only execute it
598           once. *)
599
600        local
601            fun split' p f res src =
602                case f src of
603                    NONE => (String.implode(rev res), src) (* Not available. *)
604                  | SOME (ch, src') => (* Char available *)
605                        if p ch
606                        then (* It matches - include in the result *)
607                            split' p f (ch :: res) src'
608                        else (String.implode(rev res), src) (* No match *)
609        in
610            fun splitl p f src = split' p f [] src
611        end
612
613        (* It may be worth defining takel independently but it doesn't add
614           much overhead by contrast with dropl *)
615        fun takel p f s = #1(splitl p f s)
616        (* fun dropl p f s = #2(splitl p f s) *)
617
618        (* This is probably as efficient as it can be. *)
619        fun dropl p f src =
620            case f src of
621                NONE => src (* Not available. *)
622              | SOME (ch, src') => (* Char available *)
623                    if p ch
624                    then dropl p f src'
625                    else src (* No match *)
626
627        (* Copied isSpace from Char structure to avoid circular dependency. *)
628        fun skipWS f src =
629            case f src of
630                NONE => src (* Not available. *)
631              | SOME (ch, src') => (* Char available *)
632                    if (#"\t" <= ch andalso ch <= #"\r") orelse ch = #" "
633                    then skipWS f src'
634                    else src (* No match *)
635
636        datatype cs = Index of word
637
638        (* Index into the string. *)
639        fun scanString cvt s =
640            let
641            val len = sizeAsWord s
642            fun rdr (Index i) =
643                if i = len then NONE
644                (* Since we know the index is between 0 and len-1 we can use
645                   the unsafe subscript function here. *)
646                else SOME(unsafeStringSub(s, i), Index(i+0w1))
647            in
648            case cvt rdr (Index 0w0) of
649                NONE => NONE
650              | SOME(res, _) => SOME res
651            end
652
653    end
654
655    local
656        open Char
657    in
658        (* Convert the first i digits as a hex number.  Check the result is
659           in the range before returning it. *)
660        local
661            fun readHex' _    str 0 res =
662                    if res > maxOrd then NONE else SOME(chr res, str)
663              | readHex' getc str i res = 
664                    case getc str of
665                        NONE => (* No char available.  That's ok if we are converting
666                                   as many chars as we can and have already converted one
667                                   but not if we are converting n chars and haven't got
668                                   them *)
669                            if i >= ~1 orelse res > maxOrd then NONE else SOME(chr res, str)
670                      | SOME(ch, str') =>
671                            if #"0" <= ch andalso ch <= #"9"
672                            then readHex' getc str' (i-1) (res*16 + ord ch - ord #"0")
673                            else if #"a" <= ch andalso ch <= #"f"
674                            then readHex' getc str' (i-1) (res*16 + ord ch - ord #"a" + 10)
675                            else if #"A" <= ch andalso ch <= #"F"
676                            then readHex' getc str' (i-1) (res*16 + ord ch - ord #"A" + 10)
677                            else (* Not a hex char. Ok if we are converting as many as we can. *)
678                                if i >= ~1 orelse res > maxOrd then NONE else SOME(chr res, str)
679        in
680            fun readHexN getc str i = readHex' getc str i 0
681            and readHex getc str = readHex' getc str ~1 0
682        end
683    
684        (* Convert the first i digits as a decimal. There must be exactly i digits. *)  
685        fun readDec _    str 0 res =
686                if res > maxOrd then NONE else SOME(chr res, str)
687          | readDec getc str i res = 
688                case getc str of
689                    NONE =>
690                        if res > maxOrd orelse i > 0 (* not enough chars *) then NONE
691                        else SOME(chr res, str)
692                  | SOME(ch, str') =>
693                        if #"0" <= ch andalso ord #"9" >= ord ch
694                        then readDec getc str' (i-1) (res*10 + ord ch - ord #"0")
695                        else (* Not enough valid digits. *) NONE
696
697        (* Convert up to i digits as an octal number.  There may be fewer than i digits. *)
698        fun readOct _    str 0 res =
699                if res > maxOrd then NONE else SOME(chr res, str)
700          | readOct getc str i res = 
701                case getc str of
702                    NONE =>
703                        if res > maxOrd then NONE
704                        else SOME(chr res, str)
705                  | SOME(ch, str') =>
706                        if #"0" <= ch andalso ord #"7" >= ord ch
707                        then readOct getc str' (i-1) (res*8 + ord ch - ord #"0")
708                        else (* Stop here. *) if res > maxOrd then NONE
709                        else SOME(chr res, str)
710
711        (* This function is used as the basis of Char.scan and String.scan.  There is a
712           crucial difference between Char.scan and String.scan in that Char.scan returns
713           NONE if it cannot read a single character whereas String.scan returns NONE only
714           if it encounters a bad escape before reading any valid input, which includes a
715           format sequence (\<whitespace>\).  This function returns NONE if it encounters
716           a bad escape but SOME("", strm) if it encounters end-of-stream or has read a
717           format sequence. *)
718        fun scanBase (getc: (char, 'a) StringCvt.reader) (str :'a) : (string * 'a) option =
719        case getc str of (* Read the first character. *)
720            NONE => SOME("", str) (* Just end-of-stream. *)
721          | SOME(ch, str') =>
722                if ch < chr 32 orelse chr 126 < ch
723                then NONE (* Non-printable character. *)
724                else if ch = #"\\"
725                then (* escape *)
726                    (
727                    case getc str' of
728                        NONE => NONE
729                      | SOME(#"a", str'') => SOME("\a", str'')
730                      | SOME(#"b", str'') => SOME("\b", str'')
731                      | SOME(#"t", str'') => SOME("\t", str'')
732                      | SOME(#"n", str'') => SOME("\n", str'')
733                      | SOME(#"v", str'') => SOME("\v", str'')
734                      | SOME(#"f", str'') => SOME("\f", str'')
735                      | SOME(#"r", str'') => SOME("\r", str'')
736                      | SOME(#"\\", str'') => SOME("\\", str'')
737                      | SOME(#"\"", str'') => SOME("\"", str'')
738                      | SOME(#"^", str'') => (* Control char *)
739                                (
740                                case getc str'' of
741                                    NONE => NONE
742                                  | SOME(ch'', str''') =>
743                                        if ord ch'' >= 64 andalso 95 >= ord ch''
744                                        then SOME(charAsString(chr(ord ch'' - 64)), str''')
745                                        else NONE
746                                )
747                      | SOME(#"u", str'') =>
748                            (* Hex encoding: Read 4 hex digits *)
749                                (* NOTE: There's a contradiction in the web page:
750                                   It says both 4 hex digits and also "the longest
751                                   sequence of such characters"
752                                 *)
753                                 (case readHexN getc str'' 4 of NONE => NONE | SOME(s, str) => SOME(charAsString s, str))
754                      | SOME(ch', str'') =>
755                            if isSpace ch'
756                            then (* Remove \f...f\ and then recurse. *)
757                                (
758                                case getc (StringCvt.skipWS getc str'') of
759                                    NONE => NONE
760                                  | SOME(ch'', str''') =>
761                                      if ch'' <> #"\\" then NONE (* Bad format *)
762                                      else SOME("", str''') (* Return an empty string. *)
763                                )
764                            else if #"0" <= ch' andalso ch' <= #"2"
765                            then (* Decimal encoding *)
766                                (* NOTE: There's a contradiction in the web page:
767                                   It says both 3 digits and also "the longest
768                                   sequence of such characters".
769                                   The tests insist on 3 digits so we go with
770                                   that. *)
771                                (case readDec getc str' 3 0 of NONE => NONE | SOME(s, str) => SOME(charAsString s, str))
772                            else (* Unknown escape *) NONE
773                    )
774                else SOME(charAsString ch, str') (* Result is the character. *)
775    
776        (* Convert C escapes *)
777        fun scanC (getc: (char, 'a) StringCvt.reader) (str :'a) : (char * 'a) option =
778            case getc str of (* Read the first character. *)
779                NONE => NONE
780              | SOME(ch, str') =>
781                    if ch < chr 32 orelse chr 126 < ch
782                    then NONE (* Non-printable character. *)
783                    else if ch = #"\\"
784                    then (* escape *)
785                        (
786                        case getc str' of
787                            NONE => NONE
788                          | SOME(#"a", str'') => SOME((*#"\a"*) chr 7, str'')
789                          | SOME(#"b", str'') => SOME((*#"\b"*) chr 8, str'')
790                          | SOME(#"t", str'') => SOME(#"\t", str'')
791                          | SOME(#"n", str'') => SOME(#"\n", str'')
792                          | SOME(#"v", str'') => SOME((*#"\v" *) chr 11, str'')
793                          | SOME(#"f", str'') => SOME((*#"\f"*) chr 12, str'')
794                          | SOME(#"r", str'') => SOME((*#"\r"*) chr 13, str'')
795                          | SOME(#"?", str'') => SOME(#"?", str'')
796                          | SOME(#"\\", str'') => SOME(#"\\", str'')
797                          | SOME(#"\"", str'') => SOME(#"\"", str'')
798                          | SOME(#"'", str'') => SOME(#"'", str'')
799                          | SOME(#"^", str'') => (* Control char *)
800                                    (
801                                    case getc str'' of
802                                        NONE => NONE
803                                      | SOME(ch'', str''') =>
804                                            if ord ch'' >= 64 andalso 95 >= ord ch''
805                                            then SOME(chr(ord ch'' - 64), str''')
806                                            else NONE
807                                    )
808                        (* Note: the web page says \u here but it seems it should
809                           be \x.  That's confirmed by the latest version of
810                           the library definition. *)
811                          | SOME(#"x", str'') => (* Hex encoding. *)
812                                     readHex getc str''
813                          | SOME(ch', _) =>
814                                if #"0" <= ch' andalso ch' <= #"7"
815                                then (* Octal encoding *) readOct getc str' 3 0
816                                else (* Unknown escape *) NONE
817                        )
818                    else SOME(ch, str') (* Result is the character. *)
819    end
820
821in
822
823    (* At this point we can start to add conversion functions. *)
824    structure CharVector: MONO_VECTOR =
825    struct
826        fun mapi f vec =
827        let
828            val len = sizeAsWord vec
829        in
830            if len = 0w0 then ""
831            else
832            let
833                (* Allocate a new vector. *)
834                val new_vec = LibrarySupport.allocString len
835                
836                fun domap j =
837                    if j >= len then ()
838                    else (RunCall.storeByte(new_vec, j+wordSize,
839                            f(wordAsInt(j), RunCall.loadByteFromImmutable(vec, j+wordSize)));
840                          domap(j+0w1))
841            in
842                domap 0w0;
843                RunCall.clearMutableBit new_vec;
844                new_vec
845            end
846        end
847
848        (* Return a copy of the string with a particular character replaced *)
849        fun update (v, i, c) =
850            if i < 0 orelse i >= String.size v
851            then raise Subscript
852            else mapi (fn (j, s) => if j = i then c else s) v
853
854        open String
855        (* Name changes needed for CharVector. *)
856        val maxLen = maxSize
857        val fromList = implode
858        val length = size
859    end
860
861    structure Char: CHAR =
862    struct
863        open Char
864
865        fun scan (getc: (char, 'a) StringCvt.reader) (str :'a) : (char * 'a) option =
866            case scanBase getc str of
867                NONE => NONE
868            |   SOME("", strm') => (* May be end-of-string or we may have read a format sequence. *)
869                    (case getc strm' of NONE => (* end-of-string *) NONE | _ => scan getc strm')
870            |   SOME(s, strm') => SOME(unsafeStringSub(s, 0w0), strm') (* Only ever a single character *)
871    
872        (* Convert from a string. *)
873        (* TODO: More efficient conversion using the string directly rather
874           than scanString ? *)
875        val fromString = StringCvt.scanString scan
876        and fromCString = StringCvt.scanString scanC
877        
878        (* Convert to printable string. *)
879        local
880            local
881                (* Conversion to octal has now been defined to generate
882                   three octal digits in the same way as conversion to
883                   integer. *)
884                fun octIntRepr base digs (i: int) =
885                    if digs = 0 then ""
886                    else octIntRepr base (digs-1) (i div base) ^
887                            charAsString(chr(i mod base + ord #"0"))
888            in
889                val intRepr = octIntRepr 10 3
890                val octalRepr = octIntRepr 8 3
891            end
892        in
893    
894            (* Conversion to ML escapes. *)
895            fun toString ch =
896                (* First handle the special cases *)
897                if ch = #"\\" then "\\\\"
898                else if ch = #"\"" then "\\\""
899                else if isPrint ch (* Other printable characters *)
900                then charAsString ch
901                else (* Control chars: Special cases first *)
902                    if ch = chr 7 then "\\a"
903                else if ch = chr 8 then "\\b"
904                else if ch = chr 9 then "\\t"
905                else if ch = chr 10 then "\\n"
906                else if ch = chr 11 then "\\v"
907                else if ch = chr 12 then "\\f"
908                else if ch = chr 13 then "\\r"
909                else if ch < chr 32 (* Other chars must be escaped. *)
910                then "\\^" ^ charAsString(chr(ord ch + 64))
911                else (* Use 3 digit notation. *)
912                (* Note: Web site assumes ASCII, not Unicode. *)
913                    "\\" ^ intRepr(ord ch)
914    
915            (* Conversion to C escapes. *)
916            fun toCString ch =
917                (* First handle the special cases *)
918                if ch = #"\\" then "\\\\"
919                else if ch = #"\"" then "\\\""
920                else if ch = #"?" then "\\?"
921                else if ch = #"'" then "\\'"
922                else if isPrint ch (* Other printable characters *)
923                then charAsString ch
924                else (* Control chars: Special cases first *)
925                    if ch = chr 7 then "\\a"
926                else if ch = chr 8 then "\\b"
927                else if ch = chr 9 then "\\t"
928                else if ch = chr 10 then "\\n"
929                else if ch = chr 11 then "\\v"
930                else if ch = chr 12 then "\\f"
931                else if ch = chr 13 then "\\r"
932                else (* Use octal notation. *)
933                (* Note: Web site assumes ASCII, not Unicode. *)
934                    "\\" ^ octalRepr(ord ch)
935        end;
936            
937        (* Install conversion and print functions. *)
938        local
939            (* It might be worth rewriting scan to raise Conversion with
940               a string argument so we can pass back information about
941               why an escape code was invalid. *)
942            fun convChar s =
943                let
944                val len = sizeAsWord s
945                fun rdr i =
946                    if i = len then NONE
947                    else SOME(unsafeStringSub(s, i), i+0w1)
948                in
949                    case scan rdr 0w0 of
950                        NONE => raise RunCall.Conversion "Invalid character constant"
951                      | SOME(res, index') =>
952                            (* Check that we have converted all the string. *)
953                            if index' <> len
954                            then raise RunCall.Conversion "Not exactly one character"
955                            else res
956                end
957
958            fun print_char _ _ (c: char) =
959                PolyML.PrettyString("#\"" ^ toString c ^ "\"")
960        in
961            val () = RunCall.addOverload convChar "convChar";
962            val () = PolyML.addPrettyPrinter print_char
963        end
964    
965        (* Define the type-specific inequalities. *)
966        val op < : char * char -> bool = op <
967        val op <= : char * char -> bool = op <=
968        val op > : char * char -> bool = op >
969        val op >= : char * char -> bool = op >=
970    
971        fun compare (ch, ch') =
972            if ch < ch' then General.LESS
973            else if ch > ch' then General.GREATER else General.EQUAL
974        end
975
976    structure String: STRING =
977    struct
978        open String
979
980        (* Generate escape characters. *)
981        local
982            fun toStrings convert s =
983            let
984                val len = sizeAsWord s
985                (* First pass - find out the size of the result string. *)
986                fun getSize i n =
987                    if i = len then n
988                    else getSize (i+0w1)
989                            (n + size(convert(RunCall.loadByteFromImmutable(s, i+wordSize))))
990                (* The result could possibly be long so we add the lengths
991                   as integers and convert and check when we've finished. *)
992                val newSize = unsignedShortOrRaiseSize (getSize 0w0 0)
993            in
994                (* If the size is the same we can return the original string.
995                   This relies on the fact that the conversions either return
996                   the character unchanged or return a longer escape sequence. *)
997                if newSize = len
998                then s
999                else
1000                let
1001                    (* Second pass: create the output string and copy to it. *)
1002                    val newVec = LibrarySupport.allocString newSize
1003                    fun copyToOut i j =
1004                    if i = len then ()
1005                    else
1006                    let
1007                        val conv = convert(RunCall.loadByteFromImmutable(s, i+wordSize))
1008                        val convSize = sizeAsWord conv
1009                    in
1010                        bcopy(conv, newVec, wordSize, j, convSize);
1011                        copyToOut (i+0w1) (j+convSize)
1012                    end
1013                in
1014                    copyToOut 0w0 wordSize;
1015                    RunCall.clearMutableBit newVec;
1016                    newVec
1017                end
1018            end
1019        in
1020            val toString = toStrings Char.toString
1021            and toCString = toStrings Char.toCString
1022        end
1023        
1024        (* Convert escapes. *)
1025        fun scan (getc: (char, 'a) StringCvt.reader) (str :'a) : (string * 'a) option =
1026        let
1027            fun scanString str (l: string list) haveRead =
1028                case scanBase getc str of
1029                    NONE => (* Invalid escape sequence *)
1030                        if haveRead then SOME(concat(rev l), str) else NONE
1031                |   SOME("", strm') => (* End of input or read a format sequence. *)
1032                        (case getc strm' of NONE => SOME(concat(rev l), strm') | _ => scanString strm' l true)
1033                |   SOME(s, strm') => scanString strm' (s :: l) true (* More to do. *)
1034        in
1035            scanString str [] false
1036        end
1037                
1038        val fromString = StringCvt.scanString scan          
1039    
1040        (* TODO: More efficient version. *)
1041        fun fromCString "" = SOME "" (* Special case *)
1042        |   fromCString s =
1043            let
1044                val len = sizeAsWord s
1045                fun rdr i =
1046                    if i = len then NONE
1047                    else SOME(unsafeStringSub(s, i), i+0w1)
1048                (* Repeatedly convert escape sequences and accumulate the
1049                   results in a list. *)
1050                fun convChar i =
1051                    case scanC rdr i of
1052                        NONE => []
1053                      | SOME(res, j) => res :: convChar j
1054            in
1055                (* If we couldn't even get a single character we return NONE. *)
1056                case convChar 0w0 of
1057                    [] => NONE
1058                |   res => SOME(implode res)
1059            end
1060    
1061        (* Install conversion and print functions. *)
1062        local
1063            (* It might be worth rewrite scan to raise Conversion with
1064               a string argument so we can pass back information about
1065               why an escape code was invalid. *)
1066            (* Unlike fromString which returns as much of the input string
1067               as could be converted this raises an exception if the
1068               input contains any invalid character. *)
1069            fun convString s =
1070                let
1071                val len = sizeAsWord s
1072                fun rdr i =
1073                    if i = len then NONE
1074                    else SOME(unsafeStringSub(s, i), i+0w1)
1075                (* Repeatedly convert escape sequences and accumulate the
1076                   results in a list. *)
1077                fun convChars i =
1078                    if i = len then [] (* Finished *)
1079                    else case Char.scan rdr i of
1080                        NONE => (* Bad conversion *)
1081                            raise RunCall.Conversion "Invalid string constant"
1082                      | SOME(res, j) => res :: convChars j
1083                in
1084                    implode(convChars 0w0)
1085                end
1086
1087            fun print_string _ _ (s: string) =
1088                PolyML.PrettyString(concat["\"", toString s, "\""])
1089        in
1090            val () = RunCall.addOverload convString "convString";
1091            val () = PolyML.addPrettyPrinter print_string
1092        end
1093    end
1094
1095    (* CharArray is very similar to Word8Array and most of the code is duplicated. *)
1096    structure CharArray : MONO_ARRAY =
1097    struct
1098        (* We can't use the segment length for the length of the vector
1099           as we do for "normal" arrays and vectors.  There are two ways
1100           of handling this.  We could implement arrays in the same
1101           way as strings, with a length word in the first word, or we
1102           could store the length separately.  The former has the advantage
1103           of using less store but the latter allows the byte vector to be
1104           used for other purposes and is probably faster.  *)
1105        type address = LibrarySupport.address
1106        datatype array = datatype LibrarySupport.CharArray.array
1107        (* N.B.  This representation is hard-wired into TextIO.  Don't
1108           change this representation without changing that as well. *)
1109
1110        type vector = string and elem = char
1111
1112        infix 9 sub (* For what it's worth *)
1113                
1114        val maxLen = String.maxSize (* Use the same maximum as string. *)
1115    
1116        fun length(Array(l, _)) = wordAsInt l
1117        
1118        fun array (length, ini) =
1119        let
1120            (* The array is allocated unitialised. *)
1121            val len = unsignedShortOrRaiseSize length
1122            val vec = LibrarySupport.allocBytes len
1123            fun init i = 
1124                if len <= i then ()
1125                else (RunCall.storeByte(vec, i, ini); init(i+0w1))
1126        in
1127            init 0w0;
1128            Array(len, vec)
1129        end
1130    
1131        fun op sub (Array(l, v), i: int): elem =
1132        let
1133            val iW =
1134                if isShortInt i
1135                then intAsWord i
1136                else raise General.Subscript
1137        in
1138            (* Negative values will always be >= l when compared unsigned. *)
1139            if iW >= l then raise General.Subscript
1140            else RunCall.loadByte (v, iW)
1141        end
1142    
1143        fun update (Array (l, v), i: int, new) : unit =
1144        let
1145            val iW =
1146                if isShortInt i andalso i >= 0
1147                then intAsWord i
1148                else raise General.Subscript
1149        in
1150            if iW >= l
1151            then raise General.Subscript
1152            else RunCall.storeByte (v, iW, new)
1153        end;
1154    
1155        (* Create an array from a list. *)
1156        local
1157            fun fromList' (l : char list) : word*address =
1158            let
1159                (* List has not yet been defined.  The length is limited by the
1160                   memory so this won't overflow. *)
1161                fun listLength([], n) = n
1162                |   listLength(_::l, n) = listLength(l, n+0w1)
1163                val length = listLength(l, 0w0)
1164                    
1165                (* Make a array initialised to zero. *)
1166                val vec = LibrarySupport.allocBytes length
1167                
1168                (* Copy the list elements into the array. *)
1169                fun init (v, i, a :: l) = (RunCall.storeByte(v, i, a); init(v, i + 0w1, l))
1170                |  init (_, _, []) = ()
1171                
1172            in
1173                init(vec, 0w0, l);
1174                (length, vec)
1175            end
1176        in
1177            fun fromList (l : elem list) : array = Array(fromList' l)
1178        end
1179            
1180        fun tabulate (length: int , f : int->elem): array =
1181        let
1182            val len = unsignedShortOrRaiseSize length
1183            val vec = LibrarySupport.allocBytes len
1184            (* Initialise it to the function values. *)
1185            fun init i = 
1186                if len <= i then ()
1187                else (RunCall.storeByte(vec, i, f(wordAsInt i)); init(i+0w1))
1188        in
1189            init 0w0;
1190            Array(len, vec)
1191        end
1192        
1193        fun vector (Array(len, vec)) =
1194            if len = 0w0 then ""
1195            else if len = 0w1
1196            then (* Single character string. *)
1197                charAsString (RunCall.loadByte (vec, 0w0))
1198            else
1199            let
1200                (* Make an array initialised to zero. *)
1201                val new_vec = LibrarySupport.allocString len
1202            in
1203                System_move_bytesA(vec, RunCall.unsafeCast new_vec, 0w0, wordSize, len);
1204                RunCall.clearMutableBit new_vec;
1205                new_vec
1206            end
1207    
1208        (* Copy an array into another.  It's possible for the arrays to be
1209           the same but in that case diW must be zero and the copy is a no-op. *)
1210        fun copy {src=Array (len, s), dst=Array (dlen, d), di: int} =
1211            let
1212                val diW = unsignedShortOrRaiseSubscript di
1213            in
1214                if diW+len > dlen
1215                then raise General.Subscript
1216                else System_move_bytesA(s, d, 0w0, diW, len)
1217        end
1218    
1219        (* Copy a vector into an array. *)
1220        (* Since the source is actually a string we have to start the
1221           copy from si+wordSize. *)
1222        fun copyVec {src, dst=Array (dlen, d), di: int} =
1223            let
1224                val len = sizeAsWord src
1225                val diW = unsignedShortOrRaiseSubscript di
1226            in
1227                if diW + len > dlen
1228                then raise General.Subscript
1229                else System_move_bytesA(RunCall.unsafeCast src, d, wordSize, diW, len)
1230            end
1231            
1232        (* Create the other functions. *)
1233        structure ArrayOps =
1234            VectorOperations(
1235                struct
1236                    type vector = array and elem = elem
1237                    fun length(Array(len, _)) = len
1238                    fun unsafeSub(Array(_, v), i) = RunCall.loadByte(v, i)
1239                    and unsafeSet(Array(_, v), i, c) = RunCall.storeByte(v, i, c)
1240                end);
1241    
1242        open ArrayOps;
1243    
1244        local
1245            (* Install the pretty printer for CharArray.array *)
1246            (* We may have to do this outside the structure if we
1247               have opaque signature matching. *)
1248            fun pretty _ _ x =
1249                PolyML.PrettyString(String.concat["\"", String.toString(vector x), "\""])
1250        in
1251            val () = PolyML.addPrettyPrinter pretty
1252        end
1253    end;
1254
1255    structure Substring :>
1256    sig
1257        type  substring
1258        eqtype char
1259        eqtype string
1260        val size : substring -> int
1261        val base : substring -> (string * int * int)
1262        val isEmpty : substring -> bool
1263    
1264        val sub : (substring * int) -> char
1265        val getc : substring -> (char * substring) option
1266        val first : substring -> char option
1267        
1268        val extract : (string * int * int option) -> substring
1269        val substring : (string * int * int) -> substring
1270        (*val slice : (substring * int * int option) -> substring*)
1271        val full: string -> substring
1272        val string : substring -> string
1273        
1274        val concat: substring list ->string
1275        val concatWith: string -> substring list ->string
1276    
1277        val explode : substring -> char list
1278        val translate : (char -> string) -> substring -> string
1279        val app : (char -> unit) -> substring -> unit
1280        val foldl : ((char * 'a) -> 'a) -> 'a -> substring -> 'a
1281        val foldr : ((char * 'a) -> 'a) -> 'a -> substring -> 'a
1282        val tokens : (char -> bool) -> substring -> substring list
1283        val fields : (char -> bool) -> substring -> substring list
1284        val isPrefix: string -> substring -> bool
1285        val isSubstring: string -> substring -> bool
1286        val isSuffix: string -> substring -> bool
1287    
1288        val compare : (substring * substring) -> General.order
1289        val collate : ((char * char) -> General.order) ->
1290                         (substring * substring) -> General.order
1291    
1292        val triml : int -> substring -> substring
1293        val trimr : int -> substring -> substring
1294        val splitl : (char -> bool) -> substring -> (substring * substring)
1295        val splitr : (char -> bool) -> substring -> (substring * substring)
1296        val splitAt : (substring * int) -> (substring * substring)
1297        val dropl : (char -> bool) -> substring -> substring
1298        val dropr : (char -> bool) -> substring -> substring
1299        val takel : (char -> bool) -> substring -> substring
1300        val taker : (char -> bool) -> substring -> substring
1301        val position : string -> substring -> (substring * substring)
1302        val span : (substring * substring) -> substring
1303
1304        type vector
1305        type elem
1306        type slice
1307        
1308        val length : slice -> int
1309        val subslice: slice * int * int option -> slice
1310        val slice: vector * int * int option -> slice
1311        val vector: slice -> vector
1312        val getItem: slice -> (elem * slice) option
1313        val appi : ((int * elem) -> unit) -> slice -> unit
1314        val mapi : ((int * elem) -> elem) -> slice -> vector
1315        val map : (elem -> elem) -> slice -> vector
1316        val foldli : ((int * elem * 'a) -> 'a) -> 'a -> slice -> 'a
1317        val foldri : ((int * elem * 'a) -> 'a) -> 'a -> slice -> 'a
1318        val findi: (int * elem -> bool) -> slice -> (int * elem) option
1319        val find: (elem -> bool) -> slice -> elem option
1320        val exists: (elem -> bool) -> slice -> bool
1321        val all: (elem -> bool) -> slice -> bool
1322        sharing type slice = substring
1323    end
1324        where type elem = char where type vector = string where type char = char where type string = string =
1325    struct
1326        type vector = string and elem = char
1327
1328        structure VectorSliceOps =
1329            VectorSliceOperations(
1330                struct
1331                    type vector = vector and elem = char
1332                    val vecLength = sizeAsWord
1333                    fun unsafeVecSub(s, i: word) = RunCall.loadByteFromImmutable(s, i + wordSize)
1334                    fun unsafeVecUpdate _ = raise Fail "Should not be called" (* Not applicable *)
1335                end);
1336    
1337        open VectorSliceOps;
1338
1339        (* vector: get the slice out.  Since the underlying vector is implemented using the basic
1340           string type we can use substring here. *)
1341        fun vector slice : vector =
1342        let
1343            val (vector, start, length) = base slice
1344        in
1345            unsafeSubstring(vector, intAsWord start, intAsWord length)
1346        end
1347
1348        (* It would be more efficient to do these as single operations but it's probably too complicated. *)
1349        fun concat L = String.concat(List.map vector L)
1350        fun concatWith s L = String.concatWith s (List.map vector L)
1351        fun map f slice = String.map f (vector slice)
1352        fun mapi f slice = CharVector.mapi f (vector slice)
1353        
1354        (* Substring operations. *)
1355        type substring = slice
1356        type char = elem
1357        type string = vector
1358        
1359        val size = length
1360    
1361        (* Since we've already checked the bounds we don't need to do it here. *)
1362        fun string(Slice{vector=s, start=i, length=l}) = unsafeSubstring(s, i, l)
1363    
1364        (* Check that the index and length are valid. *)
1365        fun substring(s, i, j) =
1366            if i < 0 orelse j < 0 orelse String.size s < i+j
1367            then raise General.Subscript
1368            else Slice{vector=s, start=intAsWord i, length=intAsWord j}
1369
1370        fun extract(s, i, NONE) = substring(s, i, String.size s-i)
1371         |  extract(s, i, SOME j) = substring(s, i, j)
1372
1373        fun triml k = 
1374            if k < 0 then raise General.Subscript
1375            else fn (Slice{vector=s, start=i, length=l}) =>
1376                if k > wordAsInt l then Slice{vector=s, start=i+l, length=0w0}
1377                else Slice{vector=s, start=i + intAsWord k, length=l - intAsWord k}
1378            
1379        fun trimr k =
1380            if k < 0 then raise General.Subscript
1381            else fn (Slice{vector=s, start=i, length=l}) =>
1382                if k > wordAsInt l then Slice{vector=s, start=i, length=0w0}
1383                else Slice{vector=s, start=i, length=l - intAsWord k}
1384
1385        fun explode (Slice{vector=s, start=i, length=l}) : char list = stringExplode(s, i, l)
1386    
1387        (* Compare two strings.  We could define compare in terms of collate and it
1388           would be just as efficient provided we set PolyML.Compiler.maxInlineSize
1389           to a large enough value that collate was inlined, and hence Char.compare
1390           would be inlined.  *)
1391        fun compare (Slice{vector=s, start=j, length=l}, Slice{vector=s', start=j', length=l'}) =
1392            let
1393            fun comp' i =
1394                if i = l
1395                then
1396                    (
1397                    if l = l' then General.EQUAL
1398                    else (* l < l' *) General.LESS
1399                    )
1400                else if i = l' (* and not l *) then General.GREATER
1401                else
1402                    case Char.compare(unsafeStringSub(s, i+j), unsafeStringSub(s', i+j')) of
1403                        General.EQUAL => comp' (i+0w1)
1404                      | General.LESS => General.LESS
1405                      | General.GREATER => General.GREATER
1406            in
1407            comp' 0w0
1408            end
1409
1410        fun isPrefix (s1: string) (Slice{vector=s2, start=i, length=l}) =
1411        let
1412            val size_s1 = sizeAsWord s1
1413        in
1414            if size_s1 > l
1415            then false
1416            else byteMatch s1 s2 0w0 i size_s1
1417        end
1418
1419        (* True if s1 is a suffix of s2 *)
1420        fun isSuffix s1 (Slice{vector=s2, start=i, length=l}) =
1421        let
1422            val size_s1 = sizeAsWord s1
1423        in
1424            if size_s1 > l
1425            then false
1426            else byteMatch s1 s2 0w0 (l + i - size_s1) size_s1
1427        end
1428
1429        (* True if s1 is a substring of s2 *)
1430        fun isSubstring s1 (Slice{vector=s2, start, length}) =
1431        let
1432            val size_s1 = sizeAsWord s1
1433            (* Start at the beginning and compare until we get a match. *)
1434            fun doMatch i s =
1435            if s < size_s1 then false (* The remainder of the string is too small to match. *)
1436            else if byteMatch s1 s2 0w0 i size_s1
1437            then true
1438            else doMatch (i+0w1) (s-0w1)
1439        in
1440            doMatch start length
1441        end
1442
1443        (* TODO: This would be quicker with an RTS function to scan for a
1444           character in a string. *)
1445        fun splitl f (Slice{vector=s, start=i, length=l}) =
1446            let
1447            fun find j =
1448                if j = i+l
1449                then (* All chars satisfy f *) (Slice{vector=s, start=i, length=l}, Slice{vector=s, start=j, length=0w0})
1450                else if f(unsafeStringSub(s, j)) then find (j+0w1)
1451                else (* Found a separator *)
1452                    (Slice{vector=s, start=i, length=j-i}, Slice{vector=s, start=j, length=l+i-j})
1453            in
1454            find i
1455            end
1456    
1457        (* TODO: This would be quicker with an RTS function to scan for a
1458           character in a string. *)
1459        fun splitr f (Slice{vector=s, start=i, length=l}) =
1460            let
1461            fun find j =
1462                if j = i
1463                then (* All chars satisfy f *) (Slice{vector=s, start=j, length=0w0}, Slice{vector=s, start=i, length=l})
1464                else if f(unsafeStringSub(s, j-0w1)) then find (j-0w1)
1465                else (* Found a separator *)
1466                    (Slice{vector=s, start=i, length=j-i}, Slice{vector=s, start=j, length=l+i-j})
1467            in
1468            find (i+l)
1469            end
1470            
1471        fun splitAt (Slice{vector=s, start=i, length=l}, j) =
1472        let
1473            val j' = unsignedShortOrRaiseSubscript j
1474        in
1475            if j' > l then raise General.Subscript
1476            else (Slice{vector=s, start=i, length=j'}, Slice{vector=s, start=i+j', length=l-j'})
1477        end
1478        
1479        (* TODO: Define these directly rather than via split.  It's not so expensive
1480           doing it this way for substrings because we don't actually copy the strings. *)
1481        fun takel p s = #1(splitl p s)
1482        and dropl p s = #2(splitl p s)
1483        and taker p s = #2(splitr p s)
1484        and dropr p s = #1(splitr p s)
1485
1486        (* NOTE: There's an error in the web page.  The example function uses "trim"
1487           rather than "triml".
1488           QUESTION: The check i'+n' >= i does not guarantee that ss is to the left of ss',
1489           merely that the end of ss' is to the right of the beginning of ss. 
1490           I can't remember my reasoning about this at the moment.  *)
1491        
1492        fun span (Slice{vector=s, start=i, length=_}, Slice{vector=s', start=i', length=n'}) =
1493            (* First check with pointer equality and only if that fails do we use the
1494               string equality function. *)
1495            if (RunCall.pointerEq(s, s') orelse s = s') andalso i'+n' >= i
1496            then Slice{vector=s, start=i, length=i'+n'-i}
1497            else raise General.Span 
1498           
1499        (* tokens and fields are very similar except that tokens does not return
1500           empty strings for adjacent delimiters whereas fields does.
1501           This definition is almost the same as String.tokens and String.fields. *)
1502        (* QUESTION: Are these defined always to return the results as substrings
1503           of the original base string?  That's important if we want to be able to
1504           use "span" to join them up again.  *)
1505        fun tokens p (Slice{vector=s, start=j, length}) =
1506            let
1507            val ends = j+length
1508            fun tok' i l = (* i is the character to examine.  l is the start of a token *)
1509                if i = ends
1510                then (* Finished the input.  Return any partially completed string. *)
1511                    (
1512                    if l = i then [] else [Slice{vector=s, start=l, length=i-l}]
1513                    )
1514                else if p (unsafeStringSub(s, i))
1515                then (* It's a delimiter.  If we have more than one character in the
1516                        string we create a string otherwise we just continue. *)
1517                    (
1518                    if l = i then tok' (i+0w1) (i+0w1)
1519                    else Slice{vector=s, start=l, length=i-l} :: tok' (i+0w1) (i+0w1)
1520                    )
1521                else (* Token: Keep accumulating characters. *) tok' (i+0w1) l
1522            in
1523            tok' j j
1524            end
1525    
1526        fun fields p (Slice{vector=s, start=j, length}) =
1527            let
1528            val ends = j+length
1529            
1530            fun field' i l = (* i is the character to examine.  l is the start of a token *)
1531                if i = ends
1532                then (* Finished the input.  Return any partially completed string. *)
1533                    [Slice{vector=s, start=l, length=i-l}]
1534                else if p (unsafeStringSub(s, i))
1535                then (* It's a delimiter.  Finish the partially completed string and
1536                        start another. *)
1537                    Slice{vector=s, start=l, length=i-l} :: field' (i+0w1) (i+0w1)
1538                else (* Field: Keep accumulating characters. *) field' (i+0w1) l
1539            in
1540            field' j j
1541            end
1542    
1543        (* TODO: Could be defined more efficiently. *)
1544        (* map and translate are defined to apply f from left to right. *)
1545        fun translate f s = String.concat(List.map f (explode s))
1546        
1547        fun position s (Slice{vector=s', start=i, length=n}) =
1548        let
1549            val m = sizeAsWord s (* Length of string to match. *)
1550            fun pos k =
1551                if k > n-m then (* No match *) (Slice{vector=s', start=i, length=n}, Slice{vector=s', start=i+n, length=0w0})
1552                else if compare(full s, Slice{vector=s', start=i+k, length=m}) = EQUAL
1553                then (* Match *) (Slice{vector=s', start=i, length=k}, Slice{vector=s', start=k+i, length=n-k})
1554                else pos (k+0w1)
1555        in
1556            (* Because m and n are word values n-m is UNSIGNED so we have to check
1557               this before we call "pos". *)
1558            if m > n then (Slice{vector=s', start=i, length=n}, Slice{vector=s', start=i+n, length=0w0})
1559            else pos 0w0
1560        end
1561
1562        (* Return the first character of the string together with the rest of the
1563           string.  *)
1564        fun getc(Slice{length=0w0, ...}) = NONE
1565          | getc(Slice{vector=s, start=i, length=l}) = SOME(unsafeStringSub(s, i), Slice{vector=s, start=i+0w1, length=l-0w1})
1566    
1567        fun first(Slice{length=0w0, ...}) = NONE
1568          | first(Slice{vector=s, start=i, length=_}) = SOME(unsafeStringSub(s, i))
1569        
1570    end;
1571
1572    (* CharVectorSlice. *)
1573    structure CharVectorSlice: MONO_VECTOR_SLICE where type elem = char where type vector = string = Substring;
1574
1575    structure Substring : SUBSTRING =
1576        struct open Substring;
1577        val slice = subslice
1578        end
1579        
1580    local
1581        (* Install the pretty printer for CharVector.slice (and substring) *)
1582        (* We may have to do this outside the structure if we
1583           have opaque signature matching. *)
1584        fun pretty _ _ s =
1585            PolyML.PrettyString(String.concat["\"", String.toString(Substring.string s), "\""])
1586    in
1587        val _ = PolyML.addPrettyPrinter pretty
1588    end;
1589
1590    structure CharArraySlice:> MONO_ARRAY_SLICE where type elem = char where type vector = string
1591                    where type vector_slice = CharVectorSlice.slice where type array = CharArray.array =
1592    struct
1593        type elem = char
1594        type vector = string
1595        datatype array = datatype LibrarySupport.CharArray.array
1596        (* N.B.  This representation is hard-wired into TextIO.  Don't
1597           change this representation without changing that as well. *)
1598        type vector_slice = CharVectorSlice.slice
1599
1600        structure ArraySliceOps =
1601            VectorSliceOperations(
1602                struct
1603                    type vector = array and elem = char
1604                    fun unsafeVecSub(Array(_, s: LibrarySupport.address), i) = RunCall.loadByte(s, i)
1605                    and unsafeVecUpdate(Array(_, s), i, x) = RunCall.storeByte (s, i, x)
1606                    and vecLength(Array(l, _)) = l
1607                end);
1608    
1609        open ArraySliceOps;
1610
1611        (* vector: get the slice out. *)
1612        fun vector slice: vector =
1613            let
1614                val (Array(_, vec), start, length) = base slice
1615            in
1616                if length = 0 then ""
1617                else if length = 1
1618                then (* Optimise single character strings. *)
1619                    charAsString(RunCall.loadByte (vec, intAsWord start))
1620                else
1621                let
1622                    val len = intAsWord length
1623                    (* Make an array initialised to zero. *)
1624                    val new_vec = LibrarySupport.allocString len
1625                in
1626                    System_move_bytesA(vec, RunCall.unsafeCast new_vec, intAsWord start, wordSize, len);
1627                    RunCall.clearMutableBit new_vec;
1628                    new_vec
1629                end
1630            end
1631
1632        (* Copy a slice into an array. N.B. The arrays could be the same. *)
1633        fun copy {src, dst, di: int} =
1634        let
1635            val (src, start, length) = base src
1636        in
1637            if di < 0 orelse di+length > CharArray.length dst
1638            then raise General.Subscript
1639            else (* We can't use System_move_bytes because of the potential overlap problem.
1640                    Instead we use explicit copying choosing to copy up or down depending
1641                    on the index whether the source and destination are the same or not. *)
1642            let
1643                fun copyUp n =
1644                if n = length then ()
1645                else (CharArray.update(dst, n+di, CharArray.sub(src, n+start)); copyUp(n+1))
1646                
1647                and copyDown n =
1648                if n < 0 then ()
1649                else (CharArray.update(dst, n+di, CharArray.sub(src, n+start)); copyDown(n-1))
1650            in
1651                if di > start then copyDown(length-1) else copyUp 0
1652            end
1653        end
1654    
1655        (* Copy a vector slice into an array. *)
1656        fun copyVec {src: CharVectorSlice.slice, dst=Array (dlen, d), di: int} =
1657            let
1658                val (source, i, l) = CharVectorSlice.base src
1659                val len = intAsWord l and offset = intAsWord i
1660                val diW = unsignedShortOrRaiseSubscript di
1661            in
1662                if diW + len > dlen
1663                then raise General.Subscript
1664                    (* The source is represented by a string whose first word is the length. *)
1665                else System_move_bytesA(RunCall.unsafeCast source, d, offset + wordSize, diW, len)
1666            end
1667        
1668    end (* CharArraySlice *);
1669    
1670    local
1671        (* Install the pretty printer for CharArraySlice.slice *)
1672        (* We may have to do this outside the structure if we
1673           have opaque signature matching. *)
1674        fun pretty _ _ x =
1675            PolyML.PrettyString(String.concat["\"", CharArraySlice.vector x, "\""])
1676    in
1677        val _ = PolyML.addPrettyPrinter pretty
1678    end
1679    
1680    structure StringCvt : STRING_CVT = StringCvt
1681end;
1682
1683val () = RunCall.addOverload Char.>= ">="
1684and () = RunCall.addOverload Char.<= "<="
1685and () = RunCall.addOverload Char.>  ">"
1686and () = RunCall.addOverload Char.<  "<";
1687
1688val () = RunCall.addOverload String.>= ">="
1689and () = RunCall.addOverload String.<= "<="
1690and () = RunCall.addOverload String.>  ">"
1691and () = RunCall.addOverload String.<  "<";
1692
1693(* Values available unqualified at the top level. *)
1694val ord : char -> int = Char.ord 
1695val chr : int -> char = Char.chr 
1696val concat : string list -> string =String.concat 
1697val implode : char list -> string = String.implode 
1698val explode : string -> char list = String.explode 
1699val substring : string * int * int -> string = String.substring;
1700val op ^ : string * string -> string = String.^;
1701type substring = Substring.substring;
1702val size: string -> int = String.size;
1703val str: char -> string = String.str;
1704
1705(* These are declared in the prelude. *)
1706(* val size : string -> int = String.size 
1707   val str : char -> string = String.str *)
1708