1(* 2 Title: Standard Basis Library: String Structure 3 Copyright David Matthews 1999, 2005, 2016, 2018 4 5 This library is free software; you can redistribute it and/or 6 modify it under the terms of the GNU Lesser General Public 7 License version 2.1 as published by the Free Software Foundation. 8 9 This library is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 Lesser General Public License for more details. 13 14 You should have received a copy of the GNU Lesser General Public 15 License along with this library; if not, write to the Free Software 16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17*) 18 19(* 20 This file declares Char, String and CharVector. String and CharVector 21 are simply different views on the same underlying structure. 22*) 23(* The overloads for char and string for the relational operators have 24 already been set up in the prelude. *) 25 26local 27 open LibrarySupport 28 29 (* Redefine these as functions on the abstract type. *) 30 val System_move_bytesA: 31 address*address*word*word*word->unit = RunCall.moveBytes 32 33 val wordSize : word = LibrarySupport.wordSize 34 35 local 36 fun singleCharString(c: word): string = 37 let 38 val v = allocString 0w1 39 val () = RunCall.storeByte(v, wordSize, c) 40 val () = RunCall.clearMutableBit v 41 in 42 v 43 end 44 (* We haven't defined Vector at this stage. *) 45 val charMap = RunCall.allocateWordMemory(0w256, 0wx40, 0w0) 46 val intAsWord: int -> word = RunCall.unsafeCast 47 fun setEntries i = 48 if i < 256 49 then (RunCall.storeWord(charMap, intAsWord i, singleCharString(intAsWord i)); setEntries(i+1)) 50 else (); 51 val () = setEntries 0 52 val () = RunCall.clearMutableBit charMap 53 in 54 (* Since we've covered the full range from 0 to 255 we don't need a bounds check. *) 55 fun charAsString (ch: char): string = RunCall.loadWord(charMap, RunCall.unsafeCast ch) 56 end 57 58 val bcopy: string*string*word*word*word -> unit = RunCall.moveBytes 59 60 (* This can be used where we have already checked the range. *) 61 fun unsafeStringSub(s: string, i: word): char = 62 RunCall.loadByteFromImmutable(s, i + wordSize) 63 64 fun unsafeSubstring(s: string, i: word, l: word) : string = 65 let 66 val baseLen = sizeAsWord s (* Length of base string. *) 67 in 68 if i = 0w0 andalso l = baseLen then s 69 else if l = 0w0 then "" (* Empty string. *) 70 else if l = 0w1 71 (* Single character string - use pre-built strings. *) 72 then charAsString(unsafeStringSub(s, i)) 73 else 74 let 75 (* Multiple character string. *) 76 val vec = allocString l 77 in 78 RunCall.moveBytes(s, vec, wordSize+i, wordSize, l); 79 RunCall.clearMutableBit vec; 80 vec 81 end 82 end 83 84 (* Casts between int and word. *) 85 val intAsWord: int -> word = RunCall.unsafeCast 86 and wordAsInt: word -> int = RunCall.unsafeCast 87 88 (* String concatenation. *) 89 fun op ^ (a: string, b: string): string = 90 let 91 val a_length = sizeAsWord a 92 and b_length = sizeAsWord b 93 in 94 (* Handle the special cases where one of the strings is 95 empty. As well as saving on duplicating storage it 96 also means we don't have to consider the special 97 case when the result string is a single character. *) 98 if a_length = 0w0 then b 99 else if b_length = 0w0 then a 100 else (* Normal case *) 101 let 102 val vec = LibrarySupport.allocString(a_length + b_length) 103 in 104 bcopy(a, vec, wordSize, wordSize, a_length); 105 bcopy(b, vec, wordSize, wordSize+a_length, b_length); 106 RunCall.clearMutableBit vec; 107 vec 108 end 109 end (* op ^ *) 110 111 (* String comparison function used in isPrefix and isSuffix. 112 N.B. The caller must make sure that neither string is a single character. *) 113 local 114 val byteVecEq: string * string * word * word * word -> bool = RunCall.byteVectorEqual 115 in 116 fun byteMatch s1 s2 i j l = 117 byteVecEq(s1, s2, i+wordSize, j+wordSize, l) 118 end 119 120 (* We use stringExplode in String and Substring. *) 121 fun stringExplode (s: string, i: word, l: word) : char list = 122 let 123 fun exp_str (num, res) = 124 if num = 0w0 125 then res 126 else exp_str (num - 0w1, RunCall.loadByteFromImmutable(s, num+i-0w1+wordSize) :: res) 127 in 128 exp_str (l, []) 129 end 130 131 (* There's an irritating dependency here. Char uses StringCvt.reader 132 which means that StringCvt depends on Char so String depends on 133 StringCvt. That means we can't define StringCvt in terms of String 134 which would be easiest. *) 135 structure Char = 136 struct 137 type char = char and string = string 138 val maxOrd = 255 (* Range from 0 to 255 *) 139 140 (* Single characters are represented by the number so we only need 141 to check the argument and then convert it. *) 142 fun chr i : char = 143 if i < 0 orelse i > maxOrd 144 then raise General.Chr else RunCall.unsafeCast i 145 146 val ord: char -> int = RunCall.unsafeCast 147 148 val minChar = chr 0 and maxChar = chr maxOrd 149 150 fun succ c = if ord c = maxOrd then raise Chr else chr(ord c + 1) 151 and pred c = if ord c = 0 then raise Chr else chr(ord c - 1) 152 153 fun isUpper c = #"A" <= c andalso c <= #"Z" 154 fun isLower c = #"a" <= c andalso c <= #"z" 155 fun isDigit c = #"0" <= c andalso c <= #"9" 156 fun isAlpha c = isUpper c orelse isLower c 157 fun isAlphaNum c = isAlpha c orelse isDigit c 158 fun isHexDigit c = 159 isDigit c orelse (#"a" <= c andalso c <= #"f") 160 orelse (#"A" <= c andalso c <= #"F") 161 fun isGraph c = #"!" <= c andalso c <= #"~" 162 fun isPrint c = isGraph c orelse c = #" " 163 fun isPunct c = isGraph c andalso not (isAlphaNum c) 164 (* NOTE: The web page includes 0 <= ord c but all chars satisfy that. *) 165 fun isAscii c = c <= chr 127 166 (* NOTE: The web page defines isCtrl not isCntrl *) 167 fun isCntrl c = isAscii c andalso not (isPrint c) 168 (* NOTE: There's a mistake in the web page. It says c <= #"\ " *) 169 fun isSpace c = (#"\t" <= c andalso c <= #"\r") orelse c = #" " 170 fun toLower c = if isUpper c then chr (ord c + 32) else c 171 fun toUpper c = if isLower c then chr (ord c - 32) else c 172 173 (* TODO: More efficient versions. 174 Probably best to use comparison for short strings and table 175 look-up for longer ones. *) 176 fun contains s = 177 let 178 fun match 0w0 _ = false 179 | match i c = unsafeStringSub(s, i-0w1) = c orelse match (i-0w1) c 180 in 181 match (sizeAsWord s) 182 end 183 184 fun notContains s c = not (contains s c) 185 end; (* structure Char *) 186 187 structure String = 188 (* This structure is the basis of both String and CharVector. *) 189 struct 190 type string = string 191 and vector = string 192 and elem = char 193 and char = char 194 195 (* We don't have Word.toInt yet so we have to use casts in these next two. *) 196 val size : string -> int = RunCall.unsafeCast o LibrarySupport.sizeAsWord 197 val maxSize: int = RunCall.unsafeCast LibrarySupport.maxString 198 199 val str: char ->string = charAsString 200 201 (* Concatentate a list of strings. *) 202 fun concat [] = "" 203 | concat [s] = s (* Handle special case to reduce copying. *) 204 (* Could also handle the case of concat(""::s) = concat s *) 205 | concat L = 206 let 207 fun total n [] = n 208 | total n (H::T) = total (n + size H) T 209 (* How many characters do we have to implode? This could 210 possibly be long (although we would probably have run out 211 of memory long before) so we have to add these as integers 212 and then raise an exception if it's not short. *) 213 val chars : int = total 0 L; 214 in 215 if chars = 0 216 then "" 217 else (* Normal case *) 218 let 219 val chs = unsignedShortOrRaiseSize chars (* Check it's short. *) 220 val vec = LibrarySupport.allocString chs 221 222 fun copy (_, []:string list) = () 223 | copy (i, H :: T) = 224 let 225 val src_len = sizeAsWord H 226 in 227 bcopy(H, vec, wordSize, i, src_len); 228 copy(i+src_len, T) 229 end 230 in 231 copy (wordSize, L); 232 RunCall.clearMutableBit vec; 233 vec 234 end 235 end (* concat *) 236 237 fun concatWith _ [] = "" 238 | concatWith _ [one] = one 239 | concatWith s (hd :: tl) = 240 let 241 fun mk [] = [] 242 | mk (h::t) = s :: h :: mk t 243 in 244 concat(hd :: mk tl) 245 end 246 247 (* implode is very similar to concat, in fact it could be defined 248 as a cast version of it. *) 249 fun implode [] : string = "" 250 | implode (L as (H::_)) = 251 let 252 (* How many characters do we have to implode? *) 253 val listLength = length L 254 (* In practice we could never make a list with a 255 combined length which was a long integer but 256 we still check it here in unsignedShortOrRaiseSize. *) 257 val chars: word = unsignedShortOrRaiseSize listLength 258 in 259 if chars = 0w1 then str H 260 else 261 let 262 val dest = LibrarySupport.allocString chars 263 264 fun copy (_, []:char list) = () 265 | copy (i, H :: T) = 266 ( 267 RunCall.storeByte (dest, i, H); 268 copy (i + 0w1, T) 269 ) 270 in 271 copy (wordSize, L); 272 RunCall.clearMutableBit dest; (* reset mutable flag *) 273 dest 274 end 275 end 276 277 (* This was previously built-in because of the way it worked in 278 the Poly language. It could be defined as concat[a,b] but we 279 define it separately for efficiency. *) 280 val op ^ : string * string -> string = op ^ 281 282 fun sub (s: string, i: int): char = 283 if i < 0 orelse i >= size s 284 then raise General.Subscript 285 else RunCall.loadByteFromImmutable(s, intAsWord i + wordSize); 286 287 (* Explode a string into a list of characters. *) 288 fun explode (s : string) : char list = stringExplode(s, 0w0, sizeAsWord s) 289 290 (* TODO: Could be defined more efficiently, perhaps by copying 291 it into an array. *) 292 (* This would be easier if we could process the string twice as we 293 do with toString but we need to be careful to call f only once 294 for each character in case it has a side-effect. *) 295 fun translate f s = 296 let 297 val len = sizeAsWord s 298 in 299 let 300 (* Accumulate the characters into a list. *) 301 fun mapChars i l = 302 if i = len then l 303 else mapChars (i+0w1) (f(RunCall.loadByteFromImmutable(s, i+wordSize)) :: l) 304 305 (* Reverse has not yet been defined. *) 306 fun revAppend([], a) = a 307 | revAppend(x::y, a) = revAppend(y, x::a) 308 in 309 (* Reverse the list and concatenate it. *) 310 concat(revAppend(mapChars 0w0 [], [])) 311 end 312 end 313 314 fun substring (s, i, j) = 315 let 316 val len = sizeAsWord s 317 (* Check that the index and length are both non-negative. *) 318 val i' = unsignedShortOrRaiseSubscript i 319 and j' = unsignedShortOrRaiseSubscript j 320 in 321 if i'+j' > len 322 then raise Subscript 323 else unsafeSubstring(s, i', j') 324 end 325 326 fun extract (s, i, NONE) = substring (s, i, size s - i) 327 | extract (s, i, SOME j) = substring (s, i, j) 328 329 (* tokens and fields are very similar except that tokens does not return 330 empty strings for adjacent delimiters whereas fields does. *) 331 fun tokens p s = 332 let 333 val length = size s 334 fun tok' i l = (* i is the character to examine. l is the start of a token *) 335 if i = length 336 then (* Finished the input. Return any partially completed string. *) 337 ( 338 if l = i then [] else [substring (s, l, i-l)] 339 ) 340 else if p (sub(s, i)) (* TODO: We don't need sub to do the range check here *) 341 then (* It's a delimiter. If we have more than one character in the 342 string we create a string otherwise we just continue. *) 343 ( 344 if l = i then tok' (i+1) (i+1) 345 else substring (s, l, i-l) :: tok' (i+1) (i+1) 346 ) 347 else (* Token: Keep accumulating characters. *) tok' (i+1) l 348 in 349 tok' 0 0 350 end 351 352 fun fields p s = 353 let 354 val length = size s 355 356 fun field' i l = (* i is the character to examine. l is the start of a token *) 357 if i = length 358 then (* Finished the input. Return any partially completed string. *) 359 [substring (s, l, i-l)] 360 else if p (unsafeStringSub(s, intAsWord i)) 361 then (* It's a delimiter. Finish the partially completed string and 362 start another. *) 363 substring (s, l, i-l) :: field' (i+1) (i+1) 364 else (* Field: Keep accumulating characters. *) field' (i+1) l 365 in 366 field' 0 0 367 end 368 369 (* True if s1 is a prefix of s2 *) 370 (* G&R now says that a string is a prefix of itself. *) 371 fun isPrefix s1 s2 = 372 let 373 val size_s1 = size s1 and size_s2 = size s2 374 in 375 if size_s1 <= size_s2 376 then byteMatch s1 s2 0w0 0w0 (intAsWord size_s1) 377 else false 378 end 379 380 (* True if s1 is a suffix of s2 *) 381 fun isSuffix s1 s2 = 382 let 383 val size_s1 = size s1 and size_s2 = size s2 384 in 385 if size_s1 <= size_s2 386 then byteMatch s1 s2 0w0 (intAsWord (size_s2 - size_s1)) (intAsWord size_s1) 387 else false 388 end 389 390 (* True if s1 is a substring of s2 *) 391 fun isSubstring s1 s2 = 392 let 393 val size_s1 = size s1 and size_s2 = size s2 394 (* Start at the beginning and compare until we get a match. *) 395 fun doMatch i s = 396 if s < size_s1 then false (* The remainder of the string is too small to match. *) 397 else if byteMatch s1 s2 0w0 i (intAsWord size_s1) 398 then true 399 else doMatch (i+0w1) (s-1) 400 in 401 doMatch 0w0 size_s2 402 end 403 404 405 (* Functions specific to CharVector, apart from map which is common. *) 406 fun tabulate (0, _) : vector = "" (* Must not try to lock it. *) 407 | tabulate (1, f) : vector = charAsString(f 0) 408 | tabulate (length: int , f : int->elem): vector = 409 let 410 val len = unsignedShortOrRaiseSize length (* Raises Size if length < 0 *) 411 val vec = LibrarySupport.allocString len 412 (* Initialise it to the function values. *) 413 fun init i = 414 if len <= i then () 415 else (RunCall.storeByte(vec, i+wordSize, f(wordAsInt i)); init(i+0w1)) 416 in 417 init 0w0; 418 RunCall.clearMutableBit vec; 419 vec 420 end 421 422 (* Create the other functions. *) 423 structure VectorOps = 424 VectorOperations( 425 struct 426 type vector = vector and elem = elem 427 val length = sizeAsWord 428 fun unsafeSub(s, i) = RunCall.loadByteFromImmutable(s, i + wordSize); 429 fun unsafeSet(_, _, _) = raise Fail "Should not be called" 430 end); 431 432 open VectorOps; 433 434 fun map f vec = 435 let 436 val len = sizeAsWord vec 437 in 438 if len = 0w0 then "" 439 else (* len > 1 *) 440 let 441 (* Allocate a new vector. *) 442 val new_vec = LibrarySupport.allocString len 443 val byte_limit = len + wordSize 444 445 fun domap i = 446 if i >= byte_limit then () 447 else (RunCall.storeByte(new_vec, i, f(RunCall.loadByteFromImmutable(vec, i))); domap(i+0w1)) 448 in 449 domap wordSize; 450 RunCall.clearMutableBit new_vec; 451 new_vec 452 end 453 end 454 455 local 456 (* String comparison. *) 457 fun compareString(s1, s2) = 458 let 459 val s1l = sizeAsWord s1 and s2l = sizeAsWord s2 460 val test = RunCall.byteVectorCompare(s1, s2, wordSize, wordSize, if s1l < s2l then s1l else s2l) 461 in 462 if test = 0 (* If the strings are the same up to the shorter length ... *) 463 then RunCall.unsafeCast(s1l - s2l) (* The result depends on the lengths. *) 464 else test 465 end 466 in 467 fun compare (s1, s2) = 468 let 469 val c = compareString(s1, s2) 470 in 471 if c = 0 472 then General.EQUAL 473 else if c > 0 474 then General.GREATER 475 else General.LESS 476 end 477 478 (* String relational operators. They could all be defined in terms of "compare" but this 479 generates better code. *) 480 val op >= = 481 fn (s1: string, s2: string) => 482 let 483 val s1l = sizeAsWord s1 and s2l = sizeAsWord s2 484 val test = RunCall.byteVectorCompare(s1, s2, wordSize, wordSize, if s1l < s2l then s1l else s2l) 485 in 486 if test = 0 487 then s1l >= s2l 488 else test >= 0 489 end 490 491 and op <= = 492 fn (s1: string, s2: string) => 493 let 494 val s1l = sizeAsWord s1 and s2l = sizeAsWord s2 495 val test = RunCall.byteVectorCompare(s1, s2, wordSize, wordSize, if s1l < s2l then s1l else s2l) 496 in 497 if test = 0 498 then s1l <= s2l 499 else test <= 0 500 end 501 502 and op > = 503 fn (s1: string, s2: string) => 504 let 505 val s1l = sizeAsWord s1 and s2l = sizeAsWord s2 506 val test = RunCall.byteVectorCompare(s1, s2, wordSize, wordSize, if s1l < s2l then s1l else s2l) 507 in 508 if test = 0 509 then s1l > s2l 510 else test > 0 511 end 512 513 and op < = 514 fn (s1: string, s2: string) => 515 let 516 val s1l = sizeAsWord s1 and s2l = sizeAsWord s2 517 val test = RunCall.byteVectorCompare(s1, s2, wordSize, wordSize, if s1l < s2l then s1l else s2l) 518 in 519 if test = 0 520 then s1l < s2l 521 else test < 0 522 end 523 end 524 525 526 end (* String *) 527 528 529 structure StringCvt = 530 struct 531 val mem_move: string*string*word*word*word -> unit = RunCall.moveBytes 532 533 datatype radix = BIN | OCT | DEC | HEX 534 535 datatype realfmt 536 = SCI of int option 537 | FIX of int option 538 | GEN of int option 539 | EXACT 540 541 type ('a, 'b) reader = 'b -> ('a * 'b) option 542 543 fun padLeft c i s = 544 if i <= 0 (* unsignedShortOrRaiseSize raises Size if i < 0 which isn't right here. *) 545 then s 546 else 547 let 548 val len: word = sizeAsWord s 549 val iW = unsignedShortOrRaiseSize i (* checks that i is a short. *) 550 in 551 if len >= iW then s 552 else 553 let 554 val extra = iW - len 555 val str = LibrarySupport.allocString iW 556 fun setCh n = 557 if n = extra then () 558 (* Set the character part of the string. *) 559 else ( RunCall.storeByte(str, n+wordSize, c); setCh(n+0w1) ) 560 in 561 setCh 0w0; 562 (* Copy the character part of the string over. *) 563 mem_move(s, str, wordSize, extra + wordSize, len); 564 RunCall.clearMutableBit str; 565 str 566 end 567 end 568 569 fun padRight c i s = 570 if i <= 0 (* unsignedShortOrRaiseSize raises Size if i < 0 which isn't right here. *) 571 then s 572 else 573 let 574 val len = sizeAsWord s 575 val iW = unsignedShortOrRaiseSize i (* checks that i is a short. *) 576 in 577 if len >= iW then s 578 else 579 let 580 val str = LibrarySupport.allocString iW 581 fun setCh n = 582 if n = iW then () 583 (* Set the character part of the string. *) 584 else ( RunCall.storeByte(str, n+wordSize, c); setCh(n+0w1) ) 585 in 586 (* Copy the character part of the string over. *) 587 mem_move(s, str, wordSize, wordSize, len); 588 setCh len; 589 RunCall.clearMutableBit str; 590 str 591 end 592 end 593 594 (* p is described as a predicate. That implies that it is 595 side-effect free. If it is we could use it e.g. twice, once to work out 596 the length of the string and then to create the string itself. 597 Assume that it may have side-effects and that we can only execute it 598 once. *) 599 600 local 601 fun split' p f res src = 602 case f src of 603 NONE => (String.implode(rev res), src) (* Not available. *) 604 | SOME (ch, src') => (* Char available *) 605 if p ch 606 then (* It matches - include in the result *) 607 split' p f (ch :: res) src' 608 else (String.implode(rev res), src) (* No match *) 609 in 610 fun splitl p f src = split' p f [] src 611 end 612 613 (* It may be worth defining takel independently but it doesn't add 614 much overhead by contrast with dropl *) 615 fun takel p f s = #1(splitl p f s) 616 (* fun dropl p f s = #2(splitl p f s) *) 617 618 (* This is probably as efficient as it can be. *) 619 fun dropl p f src = 620 case f src of 621 NONE => src (* Not available. *) 622 | SOME (ch, src') => (* Char available *) 623 if p ch 624 then dropl p f src' 625 else src (* No match *) 626 627 (* Copied isSpace from Char structure to avoid circular dependency. *) 628 fun skipWS f src = 629 case f src of 630 NONE => src (* Not available. *) 631 | SOME (ch, src') => (* Char available *) 632 if (#"\t" <= ch andalso ch <= #"\r") orelse ch = #" " 633 then skipWS f src' 634 else src (* No match *) 635 636 datatype cs = Index of word 637 638 (* Index into the string. *) 639 fun scanString cvt s = 640 let 641 val len = sizeAsWord s 642 fun rdr (Index i) = 643 if i = len then NONE 644 (* Since we know the index is between 0 and len-1 we can use 645 the unsafe subscript function here. *) 646 else SOME(unsafeStringSub(s, i), Index(i+0w1)) 647 in 648 case cvt rdr (Index 0w0) of 649 NONE => NONE 650 | SOME(res, _) => SOME res 651 end 652 653 end 654 655 local 656 open Char 657 in 658 (* Convert the first i digits as a hex number. Check the result is 659 in the range before returning it. *) 660 local 661 fun readHex' _ str 0 res = 662 if res > maxOrd then NONE else SOME(chr res, str) 663 | readHex' getc str i res = 664 case getc str of 665 NONE => (* No char available. That's ok if we are converting 666 as many chars as we can and have already converted one 667 but not if we are converting n chars and haven't got 668 them *) 669 if i >= ~1 orelse res > maxOrd then NONE else SOME(chr res, str) 670 | SOME(ch, str') => 671 if #"0" <= ch andalso ch <= #"9" 672 then readHex' getc str' (i-1) (res*16 + ord ch - ord #"0") 673 else if #"a" <= ch andalso ch <= #"f" 674 then readHex' getc str' (i-1) (res*16 + ord ch - ord #"a" + 10) 675 else if #"A" <= ch andalso ch <= #"F" 676 then readHex' getc str' (i-1) (res*16 + ord ch - ord #"A" + 10) 677 else (* Not a hex char. Ok if we are converting as many as we can. *) 678 if i >= ~1 orelse res > maxOrd then NONE else SOME(chr res, str) 679 in 680 fun readHexN getc str i = readHex' getc str i 0 681 and readHex getc str = readHex' getc str ~1 0 682 end 683 684 (* Convert the first i digits as a decimal. There must be exactly i digits. *) 685 fun readDec _ str 0 res = 686 if res > maxOrd then NONE else SOME(chr res, str) 687 | readDec getc str i res = 688 case getc str of 689 NONE => 690 if res > maxOrd orelse i > 0 (* not enough chars *) then NONE 691 else SOME(chr res, str) 692 | SOME(ch, str') => 693 if #"0" <= ch andalso ord #"9" >= ord ch 694 then readDec getc str' (i-1) (res*10 + ord ch - ord #"0") 695 else (* Not enough valid digits. *) NONE 696 697 (* Convert up to i digits as an octal number. There may be fewer than i digits. *) 698 fun readOct _ str 0 res = 699 if res > maxOrd then NONE else SOME(chr res, str) 700 | readOct getc str i res = 701 case getc str of 702 NONE => 703 if res > maxOrd then NONE 704 else SOME(chr res, str) 705 | SOME(ch, str') => 706 if #"0" <= ch andalso ord #"7" >= ord ch 707 then readOct getc str' (i-1) (res*8 + ord ch - ord #"0") 708 else (* Stop here. *) if res > maxOrd then NONE 709 else SOME(chr res, str) 710 711 (* This function is used as the basis of Char.scan and String.scan. There is a 712 crucial difference between Char.scan and String.scan in that Char.scan returns 713 NONE if it cannot read a single character whereas String.scan returns NONE only 714 if it encounters a bad escape before reading any valid input, which includes a 715 format sequence (\<whitespace>\). This function returns NONE if it encounters 716 a bad escape but SOME("", strm) if it encounters end-of-stream or has read a 717 format sequence. *) 718 fun scanBase (getc: (char, 'a) StringCvt.reader) (str :'a) : (string * 'a) option = 719 case getc str of (* Read the first character. *) 720 NONE => SOME("", str) (* Just end-of-stream. *) 721 | SOME(ch, str') => 722 if ch < chr 32 orelse chr 126 < ch 723 then NONE (* Non-printable character. *) 724 else if ch = #"\\" 725 then (* escape *) 726 ( 727 case getc str' of 728 NONE => NONE 729 | SOME(#"a", str'') => SOME("\a", str'') 730 | SOME(#"b", str'') => SOME("\b", str'') 731 | SOME(#"t", str'') => SOME("\t", str'') 732 | SOME(#"n", str'') => SOME("\n", str'') 733 | SOME(#"v", str'') => SOME("\v", str'') 734 | SOME(#"f", str'') => SOME("\f", str'') 735 | SOME(#"r", str'') => SOME("\r", str'') 736 | SOME(#"\\", str'') => SOME("\\", str'') 737 | SOME(#"\"", str'') => SOME("\"", str'') 738 | SOME(#"^", str'') => (* Control char *) 739 ( 740 case getc str'' of 741 NONE => NONE 742 | SOME(ch'', str''') => 743 if ord ch'' >= 64 andalso 95 >= ord ch'' 744 then SOME(charAsString(chr(ord ch'' - 64)), str''') 745 else NONE 746 ) 747 | SOME(#"u", str'') => 748 (* Hex encoding: Read 4 hex digits *) 749 (* NOTE: There's a contradiction in the web page: 750 It says both 4 hex digits and also "the longest 751 sequence of such characters" 752 *) 753 (case readHexN getc str'' 4 of NONE => NONE | SOME(s, str) => SOME(charAsString s, str)) 754 | SOME(ch', str'') => 755 if isSpace ch' 756 then (* Remove \f...f\ and then recurse. *) 757 ( 758 case getc (StringCvt.skipWS getc str'') of 759 NONE => NONE 760 | SOME(ch'', str''') => 761 if ch'' <> #"\\" then NONE (* Bad format *) 762 else SOME("", str''') (* Return an empty string. *) 763 ) 764 else if #"0" <= ch' andalso ch' <= #"2" 765 then (* Decimal encoding *) 766 (* NOTE: There's a contradiction in the web page: 767 It says both 3 digits and also "the longest 768 sequence of such characters". 769 The tests insist on 3 digits so we go with 770 that. *) 771 (case readDec getc str' 3 0 of NONE => NONE | SOME(s, str) => SOME(charAsString s, str)) 772 else (* Unknown escape *) NONE 773 ) 774 else SOME(charAsString ch, str') (* Result is the character. *) 775 776 (* Convert C escapes *) 777 fun scanC (getc: (char, 'a) StringCvt.reader) (str :'a) : (char * 'a) option = 778 case getc str of (* Read the first character. *) 779 NONE => NONE 780 | SOME(ch, str') => 781 if ch < chr 32 orelse chr 126 < ch 782 then NONE (* Non-printable character. *) 783 else if ch = #"\\" 784 then (* escape *) 785 ( 786 case getc str' of 787 NONE => NONE 788 | SOME(#"a", str'') => SOME((*#"\a"*) chr 7, str'') 789 | SOME(#"b", str'') => SOME((*#"\b"*) chr 8, str'') 790 | SOME(#"t", str'') => SOME(#"\t", str'') 791 | SOME(#"n", str'') => SOME(#"\n", str'') 792 | SOME(#"v", str'') => SOME((*#"\v" *) chr 11, str'') 793 | SOME(#"f", str'') => SOME((*#"\f"*) chr 12, str'') 794 | SOME(#"r", str'') => SOME((*#"\r"*) chr 13, str'') 795 | SOME(#"?", str'') => SOME(#"?", str'') 796 | SOME(#"\\", str'') => SOME(#"\\", str'') 797 | SOME(#"\"", str'') => SOME(#"\"", str'') 798 | SOME(#"'", str'') => SOME(#"'", str'') 799 | SOME(#"^", str'') => (* Control char *) 800 ( 801 case getc str'' of 802 NONE => NONE 803 | SOME(ch'', str''') => 804 if ord ch'' >= 64 andalso 95 >= ord ch'' 805 then SOME(chr(ord ch'' - 64), str''') 806 else NONE 807 ) 808 (* Note: the web page says \u here but it seems it should 809 be \x. That's confirmed by the latest version of 810 the library definition. *) 811 | SOME(#"x", str'') => (* Hex encoding. *) 812 readHex getc str'' 813 | SOME(ch', _) => 814 if #"0" <= ch' andalso ch' <= #"7" 815 then (* Octal encoding *) readOct getc str' 3 0 816 else (* Unknown escape *) NONE 817 ) 818 else SOME(ch, str') (* Result is the character. *) 819 end 820 821in 822 823 (* At this point we can start to add conversion functions. *) 824 structure CharVector: MONO_VECTOR = 825 struct 826 fun mapi f vec = 827 let 828 val len = sizeAsWord vec 829 in 830 if len = 0w0 then "" 831 else 832 let 833 (* Allocate a new vector. *) 834 val new_vec = LibrarySupport.allocString len 835 836 fun domap j = 837 if j >= len then () 838 else (RunCall.storeByte(new_vec, j+wordSize, 839 f(wordAsInt(j), RunCall.loadByteFromImmutable(vec, j+wordSize))); 840 domap(j+0w1)) 841 in 842 domap 0w0; 843 RunCall.clearMutableBit new_vec; 844 new_vec 845 end 846 end 847 848 (* Return a copy of the string with a particular character replaced *) 849 fun update (v, i, c) = 850 if i < 0 orelse i >= String.size v 851 then raise Subscript 852 else mapi (fn (j, s) => if j = i then c else s) v 853 854 open String 855 (* Name changes needed for CharVector. *) 856 val maxLen = maxSize 857 val fromList = implode 858 val length = size 859 end 860 861 structure Char: CHAR = 862 struct 863 open Char 864 865 fun scan (getc: (char, 'a) StringCvt.reader) (str :'a) : (char * 'a) option = 866 case scanBase getc str of 867 NONE => NONE 868 | SOME("", strm') => (* May be end-of-string or we may have read a format sequence. *) 869 (case getc strm' of NONE => (* end-of-string *) NONE | _ => scan getc strm') 870 | SOME(s, strm') => SOME(unsafeStringSub(s, 0w0), strm') (* Only ever a single character *) 871 872 (* Convert from a string. *) 873 (* TODO: More efficient conversion using the string directly rather 874 than scanString ? *) 875 val fromString = StringCvt.scanString scan 876 and fromCString = StringCvt.scanString scanC 877 878 (* Convert to printable string. *) 879 local 880 local 881 (* Conversion to octal has now been defined to generate 882 three octal digits in the same way as conversion to 883 integer. *) 884 fun octIntRepr base digs (i: int) = 885 if digs = 0 then "" 886 else octIntRepr base (digs-1) (i div base) ^ 887 charAsString(chr(i mod base + ord #"0")) 888 in 889 val intRepr = octIntRepr 10 3 890 val octalRepr = octIntRepr 8 3 891 end 892 in 893 894 (* Conversion to ML escapes. *) 895 fun toString ch = 896 (* First handle the special cases *) 897 if ch = #"\\" then "\\\\" 898 else if ch = #"\"" then "\\\"" 899 else if isPrint ch (* Other printable characters *) 900 then charAsString ch 901 else (* Control chars: Special cases first *) 902 if ch = chr 7 then "\\a" 903 else if ch = chr 8 then "\\b" 904 else if ch = chr 9 then "\\t" 905 else if ch = chr 10 then "\\n" 906 else if ch = chr 11 then "\\v" 907 else if ch = chr 12 then "\\f" 908 else if ch = chr 13 then "\\r" 909 else if ch < chr 32 (* Other chars must be escaped. *) 910 then "\\^" ^ charAsString(chr(ord ch + 64)) 911 else (* Use 3 digit notation. *) 912 (* Note: Web site assumes ASCII, not Unicode. *) 913 "\\" ^ intRepr(ord ch) 914 915 (* Conversion to C escapes. *) 916 fun toCString ch = 917 (* First handle the special cases *) 918 if ch = #"\\" then "\\\\" 919 else if ch = #"\"" then "\\\"" 920 else if ch = #"?" then "\\?" 921 else if ch = #"'" then "\\'" 922 else if isPrint ch (* Other printable characters *) 923 then charAsString ch 924 else (* Control chars: Special cases first *) 925 if ch = chr 7 then "\\a" 926 else if ch = chr 8 then "\\b" 927 else if ch = chr 9 then "\\t" 928 else if ch = chr 10 then "\\n" 929 else if ch = chr 11 then "\\v" 930 else if ch = chr 12 then "\\f" 931 else if ch = chr 13 then "\\r" 932 else (* Use octal notation. *) 933 (* Note: Web site assumes ASCII, not Unicode. *) 934 "\\" ^ octalRepr(ord ch) 935 end; 936 937 (* Install conversion and print functions. *) 938 local 939 (* It might be worth rewriting scan to raise Conversion with 940 a string argument so we can pass back information about 941 why an escape code was invalid. *) 942 fun convChar s = 943 let 944 val len = sizeAsWord s 945 fun rdr i = 946 if i = len then NONE 947 else SOME(unsafeStringSub(s, i), i+0w1) 948 in 949 case scan rdr 0w0 of 950 NONE => raise RunCall.Conversion "Invalid character constant" 951 | SOME(res, index') => 952 (* Check that we have converted all the string. *) 953 if index' <> len 954 then raise RunCall.Conversion "Not exactly one character" 955 else res 956 end 957 958 fun print_char _ _ (c: char) = 959 PolyML.PrettyString("#\"" ^ toString c ^ "\"") 960 in 961 val () = RunCall.addOverload convChar "convChar"; 962 val () = PolyML.addPrettyPrinter print_char 963 end 964 965 (* Define the type-specific inequalities. *) 966 val op < : char * char -> bool = op < 967 val op <= : char * char -> bool = op <= 968 val op > : char * char -> bool = op > 969 val op >= : char * char -> bool = op >= 970 971 fun compare (ch, ch') = 972 if ch < ch' then General.LESS 973 else if ch > ch' then General.GREATER else General.EQUAL 974 end 975 976 structure String: STRING = 977 struct 978 open String 979 980 (* Generate escape characters. *) 981 local 982 fun toStrings convert s = 983 let 984 val len = sizeAsWord s 985 (* First pass - find out the size of the result string. *) 986 fun getSize i n = 987 if i = len then n 988 else getSize (i+0w1) 989 (n + size(convert(RunCall.loadByteFromImmutable(s, i+wordSize)))) 990 (* The result could possibly be long so we add the lengths 991 as integers and convert and check when we've finished. *) 992 val newSize = unsignedShortOrRaiseSize (getSize 0w0 0) 993 in 994 (* If the size is the same we can return the original string. 995 This relies on the fact that the conversions either return 996 the character unchanged or return a longer escape sequence. *) 997 if newSize = len 998 then s 999 else 1000 let 1001 (* Second pass: create the output string and copy to it. *) 1002 val newVec = LibrarySupport.allocString newSize 1003 fun copyToOut i j = 1004 if i = len then () 1005 else 1006 let 1007 val conv = convert(RunCall.loadByteFromImmutable(s, i+wordSize)) 1008 val convSize = sizeAsWord conv 1009 in 1010 bcopy(conv, newVec, wordSize, j, convSize); 1011 copyToOut (i+0w1) (j+convSize) 1012 end 1013 in 1014 copyToOut 0w0 wordSize; 1015 RunCall.clearMutableBit newVec; 1016 newVec 1017 end 1018 end 1019 in 1020 val toString = toStrings Char.toString 1021 and toCString = toStrings Char.toCString 1022 end 1023 1024 (* Convert escapes. *) 1025 fun scan (getc: (char, 'a) StringCvt.reader) (str :'a) : (string * 'a) option = 1026 let 1027 fun scanString str (l: string list) haveRead = 1028 case scanBase getc str of 1029 NONE => (* Invalid escape sequence *) 1030 if haveRead then SOME(concat(rev l), str) else NONE 1031 | SOME("", strm') => (* End of input or read a format sequence. *) 1032 (case getc strm' of NONE => SOME(concat(rev l), strm') | _ => scanString strm' l true) 1033 | SOME(s, strm') => scanString strm' (s :: l) true (* More to do. *) 1034 in 1035 scanString str [] false 1036 end 1037 1038 val fromString = StringCvt.scanString scan 1039 1040 (* TODO: More efficient version. *) 1041 fun fromCString "" = SOME "" (* Special case *) 1042 | fromCString s = 1043 let 1044 val len = sizeAsWord s 1045 fun rdr i = 1046 if i = len then NONE 1047 else SOME(unsafeStringSub(s, i), i+0w1) 1048 (* Repeatedly convert escape sequences and accumulate the 1049 results in a list. *) 1050 fun convChar i = 1051 case scanC rdr i of 1052 NONE => [] 1053 | SOME(res, j) => res :: convChar j 1054 in 1055 (* If we couldn't even get a single character we return NONE. *) 1056 case convChar 0w0 of 1057 [] => NONE 1058 | res => SOME(implode res) 1059 end 1060 1061 (* Install conversion and print functions. *) 1062 local 1063 (* It might be worth rewrite scan to raise Conversion with 1064 a string argument so we can pass back information about 1065 why an escape code was invalid. *) 1066 (* Unlike fromString which returns as much of the input string 1067 as could be converted this raises an exception if the 1068 input contains any invalid character. *) 1069 fun convString s = 1070 let 1071 val len = sizeAsWord s 1072 fun rdr i = 1073 if i = len then NONE 1074 else SOME(unsafeStringSub(s, i), i+0w1) 1075 (* Repeatedly convert escape sequences and accumulate the 1076 results in a list. *) 1077 fun convChars i = 1078 if i = len then [] (* Finished *) 1079 else case Char.scan rdr i of 1080 NONE => (* Bad conversion *) 1081 raise RunCall.Conversion "Invalid string constant" 1082 | SOME(res, j) => res :: convChars j 1083 in 1084 implode(convChars 0w0) 1085 end 1086 1087 fun print_string _ _ (s: string) = 1088 PolyML.PrettyString(concat["\"", toString s, "\""]) 1089 in 1090 val () = RunCall.addOverload convString "convString"; 1091 val () = PolyML.addPrettyPrinter print_string 1092 end 1093 end 1094 1095 (* CharArray is very similar to Word8Array and most of the code is duplicated. *) 1096 structure CharArray : MONO_ARRAY = 1097 struct 1098 (* We can't use the segment length for the length of the vector 1099 as we do for "normal" arrays and vectors. There are two ways 1100 of handling this. We could implement arrays in the same 1101 way as strings, with a length word in the first word, or we 1102 could store the length separately. The former has the advantage 1103 of using less store but the latter allows the byte vector to be 1104 used for other purposes and is probably faster. *) 1105 type address = LibrarySupport.address 1106 datatype array = datatype LibrarySupport.CharArray.array 1107 (* N.B. This representation is hard-wired into TextIO. Don't 1108 change this representation without changing that as well. *) 1109 1110 type vector = string and elem = char 1111 1112 infix 9 sub (* For what it's worth *) 1113 1114 val maxLen = String.maxSize (* Use the same maximum as string. *) 1115 1116 fun length(Array(l, _)) = wordAsInt l 1117 1118 fun array (length, ini) = 1119 let 1120 (* The array is allocated unitialised. *) 1121 val len = unsignedShortOrRaiseSize length 1122 val vec = LibrarySupport.allocBytes len 1123 fun init i = 1124 if len <= i then () 1125 else (RunCall.storeByte(vec, i, ini); init(i+0w1)) 1126 in 1127 init 0w0; 1128 Array(len, vec) 1129 end 1130 1131 fun op sub (Array(l, v), i: int): elem = 1132 let 1133 val iW = 1134 if isShortInt i 1135 then intAsWord i 1136 else raise General.Subscript 1137 in 1138 (* Negative values will always be >= l when compared unsigned. *) 1139 if iW >= l then raise General.Subscript 1140 else RunCall.loadByte (v, iW) 1141 end 1142 1143 fun update (Array (l, v), i: int, new) : unit = 1144 let 1145 val iW = 1146 if isShortInt i andalso i >= 0 1147 then intAsWord i 1148 else raise General.Subscript 1149 in 1150 if iW >= l 1151 then raise General.Subscript 1152 else RunCall.storeByte (v, iW, new) 1153 end; 1154 1155 (* Create an array from a list. *) 1156 local 1157 fun fromList' (l : char list) : word*address = 1158 let 1159 (* List has not yet been defined. The length is limited by the 1160 memory so this won't overflow. *) 1161 fun listLength([], n) = n 1162 | listLength(_::l, n) = listLength(l, n+0w1) 1163 val length = listLength(l, 0w0) 1164 1165 (* Make a array initialised to zero. *) 1166 val vec = LibrarySupport.allocBytes length 1167 1168 (* Copy the list elements into the array. *) 1169 fun init (v, i, a :: l) = (RunCall.storeByte(v, i, a); init(v, i + 0w1, l)) 1170 | init (_, _, []) = () 1171 1172 in 1173 init(vec, 0w0, l); 1174 (length, vec) 1175 end 1176 in 1177 fun fromList (l : elem list) : array = Array(fromList' l) 1178 end 1179 1180 fun tabulate (length: int , f : int->elem): array = 1181 let 1182 val len = unsignedShortOrRaiseSize length 1183 val vec = LibrarySupport.allocBytes len 1184 (* Initialise it to the function values. *) 1185 fun init i = 1186 if len <= i then () 1187 else (RunCall.storeByte(vec, i, f(wordAsInt i)); init(i+0w1)) 1188 in 1189 init 0w0; 1190 Array(len, vec) 1191 end 1192 1193 fun vector (Array(len, vec)) = 1194 if len = 0w0 then "" 1195 else if len = 0w1 1196 then (* Single character string. *) 1197 charAsString (RunCall.loadByte (vec, 0w0)) 1198 else 1199 let 1200 (* Make an array initialised to zero. *) 1201 val new_vec = LibrarySupport.allocString len 1202 in 1203 System_move_bytesA(vec, RunCall.unsafeCast new_vec, 0w0, wordSize, len); 1204 RunCall.clearMutableBit new_vec; 1205 new_vec 1206 end 1207 1208 (* Copy an array into another. It's possible for the arrays to be 1209 the same but in that case diW must be zero and the copy is a no-op. *) 1210 fun copy {src=Array (len, s), dst=Array (dlen, d), di: int} = 1211 let 1212 val diW = unsignedShortOrRaiseSubscript di 1213 in 1214 if diW+len > dlen 1215 then raise General.Subscript 1216 else System_move_bytesA(s, d, 0w0, diW, len) 1217 end 1218 1219 (* Copy a vector into an array. *) 1220 (* Since the source is actually a string we have to start the 1221 copy from si+wordSize. *) 1222 fun copyVec {src, dst=Array (dlen, d), di: int} = 1223 let 1224 val len = sizeAsWord src 1225 val diW = unsignedShortOrRaiseSubscript di 1226 in 1227 if diW + len > dlen 1228 then raise General.Subscript 1229 else System_move_bytesA(RunCall.unsafeCast src, d, wordSize, diW, len) 1230 end 1231 1232 (* Create the other functions. *) 1233 structure ArrayOps = 1234 VectorOperations( 1235 struct 1236 type vector = array and elem = elem 1237 fun length(Array(len, _)) = len 1238 fun unsafeSub(Array(_, v), i) = RunCall.loadByte(v, i) 1239 and unsafeSet(Array(_, v), i, c) = RunCall.storeByte(v, i, c) 1240 end); 1241 1242 open ArrayOps; 1243 1244 local 1245 (* Install the pretty printer for CharArray.array *) 1246 (* We may have to do this outside the structure if we 1247 have opaque signature matching. *) 1248 fun pretty _ _ x = 1249 PolyML.PrettyString(String.concat["\"", String.toString(vector x), "\""]) 1250 in 1251 val () = PolyML.addPrettyPrinter pretty 1252 end 1253 end; 1254 1255 structure Substring :> 1256 sig 1257 type substring 1258 eqtype char 1259 eqtype string 1260 val size : substring -> int 1261 val base : substring -> (string * int * int) 1262 val isEmpty : substring -> bool 1263 1264 val sub : (substring * int) -> char 1265 val getc : substring -> (char * substring) option 1266 val first : substring -> char option 1267 1268 val extract : (string * int * int option) -> substring 1269 val substring : (string * int * int) -> substring 1270 (*val slice : (substring * int * int option) -> substring*) 1271 val full: string -> substring 1272 val string : substring -> string 1273 1274 val concat: substring list ->string 1275 val concatWith: string -> substring list ->string 1276 1277 val explode : substring -> char list 1278 val translate : (char -> string) -> substring -> string 1279 val app : (char -> unit) -> substring -> unit 1280 val foldl : ((char * 'a) -> 'a) -> 'a -> substring -> 'a 1281 val foldr : ((char * 'a) -> 'a) -> 'a -> substring -> 'a 1282 val tokens : (char -> bool) -> substring -> substring list 1283 val fields : (char -> bool) -> substring -> substring list 1284 val isPrefix: string -> substring -> bool 1285 val isSubstring: string -> substring -> bool 1286 val isSuffix: string -> substring -> bool 1287 1288 val compare : (substring * substring) -> General.order 1289 val collate : ((char * char) -> General.order) -> 1290 (substring * substring) -> General.order 1291 1292 val triml : int -> substring -> substring 1293 val trimr : int -> substring -> substring 1294 val splitl : (char -> bool) -> substring -> (substring * substring) 1295 val splitr : (char -> bool) -> substring -> (substring * substring) 1296 val splitAt : (substring * int) -> (substring * substring) 1297 val dropl : (char -> bool) -> substring -> substring 1298 val dropr : (char -> bool) -> substring -> substring 1299 val takel : (char -> bool) -> substring -> substring 1300 val taker : (char -> bool) -> substring -> substring 1301 val position : string -> substring -> (substring * substring) 1302 val span : (substring * substring) -> substring 1303 1304 type vector 1305 type elem 1306 type slice 1307 1308 val length : slice -> int 1309 val subslice: slice * int * int option -> slice 1310 val slice: vector * int * int option -> slice 1311 val vector: slice -> vector 1312 val getItem: slice -> (elem * slice) option 1313 val appi : ((int * elem) -> unit) -> slice -> unit 1314 val mapi : ((int * elem) -> elem) -> slice -> vector 1315 val map : (elem -> elem) -> slice -> vector 1316 val foldli : ((int * elem * 'a) -> 'a) -> 'a -> slice -> 'a 1317 val foldri : ((int * elem * 'a) -> 'a) -> 'a -> slice -> 'a 1318 val findi: (int * elem -> bool) -> slice -> (int * elem) option 1319 val find: (elem -> bool) -> slice -> elem option 1320 val exists: (elem -> bool) -> slice -> bool 1321 val all: (elem -> bool) -> slice -> bool 1322 sharing type slice = substring 1323 end 1324 where type elem = char where type vector = string where type char = char where type string = string = 1325 struct 1326 type vector = string and elem = char 1327 1328 structure VectorSliceOps = 1329 VectorSliceOperations( 1330 struct 1331 type vector = vector and elem = char 1332 val vecLength = sizeAsWord 1333 fun unsafeVecSub(s, i: word) = RunCall.loadByteFromImmutable(s, i + wordSize) 1334 fun unsafeVecUpdate _ = raise Fail "Should not be called" (* Not applicable *) 1335 end); 1336 1337 open VectorSliceOps; 1338 1339 (* vector: get the slice out. Since the underlying vector is implemented using the basic 1340 string type we can use substring here. *) 1341 fun vector slice : vector = 1342 let 1343 val (vector, start, length) = base slice 1344 in 1345 unsafeSubstring(vector, intAsWord start, intAsWord length) 1346 end 1347 1348 (* It would be more efficient to do these as single operations but it's probably too complicated. *) 1349 fun concat L = String.concat(List.map vector L) 1350 fun concatWith s L = String.concatWith s (List.map vector L) 1351 fun map f slice = String.map f (vector slice) 1352 fun mapi f slice = CharVector.mapi f (vector slice) 1353 1354 (* Substring operations. *) 1355 type substring = slice 1356 type char = elem 1357 type string = vector 1358 1359 val size = length 1360 1361 (* Since we've already checked the bounds we don't need to do it here. *) 1362 fun string(Slice{vector=s, start=i, length=l}) = unsafeSubstring(s, i, l) 1363 1364 (* Check that the index and length are valid. *) 1365 fun substring(s, i, j) = 1366 if i < 0 orelse j < 0 orelse String.size s < i+j 1367 then raise General.Subscript 1368 else Slice{vector=s, start=intAsWord i, length=intAsWord j} 1369 1370 fun extract(s, i, NONE) = substring(s, i, String.size s-i) 1371 | extract(s, i, SOME j) = substring(s, i, j) 1372 1373 fun triml k = 1374 if k < 0 then raise General.Subscript 1375 else fn (Slice{vector=s, start=i, length=l}) => 1376 if k > wordAsInt l then Slice{vector=s, start=i+l, length=0w0} 1377 else Slice{vector=s, start=i + intAsWord k, length=l - intAsWord k} 1378 1379 fun trimr k = 1380 if k < 0 then raise General.Subscript 1381 else fn (Slice{vector=s, start=i, length=l}) => 1382 if k > wordAsInt l then Slice{vector=s, start=i, length=0w0} 1383 else Slice{vector=s, start=i, length=l - intAsWord k} 1384 1385 fun explode (Slice{vector=s, start=i, length=l}) : char list = stringExplode(s, i, l) 1386 1387 (* Compare two strings. We could define compare in terms of collate and it 1388 would be just as efficient provided we set PolyML.Compiler.maxInlineSize 1389 to a large enough value that collate was inlined, and hence Char.compare 1390 would be inlined. *) 1391 fun compare (Slice{vector=s, start=j, length=l}, Slice{vector=s', start=j', length=l'}) = 1392 let 1393 fun comp' i = 1394 if i = l 1395 then 1396 ( 1397 if l = l' then General.EQUAL 1398 else (* l < l' *) General.LESS 1399 ) 1400 else if i = l' (* and not l *) then General.GREATER 1401 else 1402 case Char.compare(unsafeStringSub(s, i+j), unsafeStringSub(s', i+j')) of 1403 General.EQUAL => comp' (i+0w1) 1404 | General.LESS => General.LESS 1405 | General.GREATER => General.GREATER 1406 in 1407 comp' 0w0 1408 end 1409 1410 fun isPrefix (s1: string) (Slice{vector=s2, start=i, length=l}) = 1411 let 1412 val size_s1 = sizeAsWord s1 1413 in 1414 if size_s1 > l 1415 then false 1416 else byteMatch s1 s2 0w0 i size_s1 1417 end 1418 1419 (* True if s1 is a suffix of s2 *) 1420 fun isSuffix s1 (Slice{vector=s2, start=i, length=l}) = 1421 let 1422 val size_s1 = sizeAsWord s1 1423 in 1424 if size_s1 > l 1425 then false 1426 else byteMatch s1 s2 0w0 (l + i - size_s1) size_s1 1427 end 1428 1429 (* True if s1 is a substring of s2 *) 1430 fun isSubstring s1 (Slice{vector=s2, start, length}) = 1431 let 1432 val size_s1 = sizeAsWord s1 1433 (* Start at the beginning and compare until we get a match. *) 1434 fun doMatch i s = 1435 if s < size_s1 then false (* The remainder of the string is too small to match. *) 1436 else if byteMatch s1 s2 0w0 i size_s1 1437 then true 1438 else doMatch (i+0w1) (s-0w1) 1439 in 1440 doMatch start length 1441 end 1442 1443 (* TODO: This would be quicker with an RTS function to scan for a 1444 character in a string. *) 1445 fun splitl f (Slice{vector=s, start=i, length=l}) = 1446 let 1447 fun find j = 1448 if j = i+l 1449 then (* All chars satisfy f *) (Slice{vector=s, start=i, length=l}, Slice{vector=s, start=j, length=0w0}) 1450 else if f(unsafeStringSub(s, j)) then find (j+0w1) 1451 else (* Found a separator *) 1452 (Slice{vector=s, start=i, length=j-i}, Slice{vector=s, start=j, length=l+i-j}) 1453 in 1454 find i 1455 end 1456 1457 (* TODO: This would be quicker with an RTS function to scan for a 1458 character in a string. *) 1459 fun splitr f (Slice{vector=s, start=i, length=l}) = 1460 let 1461 fun find j = 1462 if j = i 1463 then (* All chars satisfy f *) (Slice{vector=s, start=j, length=0w0}, Slice{vector=s, start=i, length=l}) 1464 else if f(unsafeStringSub(s, j-0w1)) then find (j-0w1) 1465 else (* Found a separator *) 1466 (Slice{vector=s, start=i, length=j-i}, Slice{vector=s, start=j, length=l+i-j}) 1467 in 1468 find (i+l) 1469 end 1470 1471 fun splitAt (Slice{vector=s, start=i, length=l}, j) = 1472 let 1473 val j' = unsignedShortOrRaiseSubscript j 1474 in 1475 if j' > l then raise General.Subscript 1476 else (Slice{vector=s, start=i, length=j'}, Slice{vector=s, start=i+j', length=l-j'}) 1477 end 1478 1479 (* TODO: Define these directly rather than via split. It's not so expensive 1480 doing it this way for substrings because we don't actually copy the strings. *) 1481 fun takel p s = #1(splitl p s) 1482 and dropl p s = #2(splitl p s) 1483 and taker p s = #2(splitr p s) 1484 and dropr p s = #1(splitr p s) 1485 1486 (* NOTE: There's an error in the web page. The example function uses "trim" 1487 rather than "triml". 1488 QUESTION: The check i'+n' >= i does not guarantee that ss is to the left of ss', 1489 merely that the end of ss' is to the right of the beginning of ss. 1490 I can't remember my reasoning about this at the moment. *) 1491 1492 fun span (Slice{vector=s, start=i, length=_}, Slice{vector=s', start=i', length=n'}) = 1493 (* First check with pointer equality and only if that fails do we use the 1494 string equality function. *) 1495 if (RunCall.pointerEq(s, s') orelse s = s') andalso i'+n' >= i 1496 then Slice{vector=s, start=i, length=i'+n'-i} 1497 else raise General.Span 1498 1499 (* tokens and fields are very similar except that tokens does not return 1500 empty strings for adjacent delimiters whereas fields does. 1501 This definition is almost the same as String.tokens and String.fields. *) 1502 (* QUESTION: Are these defined always to return the results as substrings 1503 of the original base string? That's important if we want to be able to 1504 use "span" to join them up again. *) 1505 fun tokens p (Slice{vector=s, start=j, length}) = 1506 let 1507 val ends = j+length 1508 fun tok' i l = (* i is the character to examine. l is the start of a token *) 1509 if i = ends 1510 then (* Finished the input. Return any partially completed string. *) 1511 ( 1512 if l = i then [] else [Slice{vector=s, start=l, length=i-l}] 1513 ) 1514 else if p (unsafeStringSub(s, i)) 1515 then (* It's a delimiter. If we have more than one character in the 1516 string we create a string otherwise we just continue. *) 1517 ( 1518 if l = i then tok' (i+0w1) (i+0w1) 1519 else Slice{vector=s, start=l, length=i-l} :: tok' (i+0w1) (i+0w1) 1520 ) 1521 else (* Token: Keep accumulating characters. *) tok' (i+0w1) l 1522 in 1523 tok' j j 1524 end 1525 1526 fun fields p (Slice{vector=s, start=j, length}) = 1527 let 1528 val ends = j+length 1529 1530 fun field' i l = (* i is the character to examine. l is the start of a token *) 1531 if i = ends 1532 then (* Finished the input. Return any partially completed string. *) 1533 [Slice{vector=s, start=l, length=i-l}] 1534 else if p (unsafeStringSub(s, i)) 1535 then (* It's a delimiter. Finish the partially completed string and 1536 start another. *) 1537 Slice{vector=s, start=l, length=i-l} :: field' (i+0w1) (i+0w1) 1538 else (* Field: Keep accumulating characters. *) field' (i+0w1) l 1539 in 1540 field' j j 1541 end 1542 1543 (* TODO: Could be defined more efficiently. *) 1544 (* map and translate are defined to apply f from left to right. *) 1545 fun translate f s = String.concat(List.map f (explode s)) 1546 1547 fun position s (Slice{vector=s', start=i, length=n}) = 1548 let 1549 val m = sizeAsWord s (* Length of string to match. *) 1550 fun pos k = 1551 if k > n-m then (* No match *) (Slice{vector=s', start=i, length=n}, Slice{vector=s', start=i+n, length=0w0}) 1552 else if compare(full s, Slice{vector=s', start=i+k, length=m}) = EQUAL 1553 then (* Match *) (Slice{vector=s', start=i, length=k}, Slice{vector=s', start=k+i, length=n-k}) 1554 else pos (k+0w1) 1555 in 1556 (* Because m and n are word values n-m is UNSIGNED so we have to check 1557 this before we call "pos". *) 1558 if m > n then (Slice{vector=s', start=i, length=n}, Slice{vector=s', start=i+n, length=0w0}) 1559 else pos 0w0 1560 end 1561 1562 (* Return the first character of the string together with the rest of the 1563 string. *) 1564 fun getc(Slice{length=0w0, ...}) = NONE 1565 | getc(Slice{vector=s, start=i, length=l}) = SOME(unsafeStringSub(s, i), Slice{vector=s, start=i+0w1, length=l-0w1}) 1566 1567 fun first(Slice{length=0w0, ...}) = NONE 1568 | first(Slice{vector=s, start=i, length=_}) = SOME(unsafeStringSub(s, i)) 1569 1570 end; 1571 1572 (* CharVectorSlice. *) 1573 structure CharVectorSlice: MONO_VECTOR_SLICE where type elem = char where type vector = string = Substring; 1574 1575 structure Substring : SUBSTRING = 1576 struct open Substring; 1577 val slice = subslice 1578 end 1579 1580 local 1581 (* Install the pretty printer for CharVector.slice (and substring) *) 1582 (* We may have to do this outside the structure if we 1583 have opaque signature matching. *) 1584 fun pretty _ _ s = 1585 PolyML.PrettyString(String.concat["\"", String.toString(Substring.string s), "\""]) 1586 in 1587 val _ = PolyML.addPrettyPrinter pretty 1588 end; 1589 1590 structure CharArraySlice:> MONO_ARRAY_SLICE where type elem = char where type vector = string 1591 where type vector_slice = CharVectorSlice.slice where type array = CharArray.array = 1592 struct 1593 type elem = char 1594 type vector = string 1595 datatype array = datatype LibrarySupport.CharArray.array 1596 (* N.B. This representation is hard-wired into TextIO. Don't 1597 change this representation without changing that as well. *) 1598 type vector_slice = CharVectorSlice.slice 1599 1600 structure ArraySliceOps = 1601 VectorSliceOperations( 1602 struct 1603 type vector = array and elem = char 1604 fun unsafeVecSub(Array(_, s: LibrarySupport.address), i) = RunCall.loadByte(s, i) 1605 and unsafeVecUpdate(Array(_, s), i, x) = RunCall.storeByte (s, i, x) 1606 and vecLength(Array(l, _)) = l 1607 end); 1608 1609 open ArraySliceOps; 1610 1611 (* vector: get the slice out. *) 1612 fun vector slice: vector = 1613 let 1614 val (Array(_, vec), start, length) = base slice 1615 in 1616 if length = 0 then "" 1617 else if length = 1 1618 then (* Optimise single character strings. *) 1619 charAsString(RunCall.loadByte (vec, intAsWord start)) 1620 else 1621 let 1622 val len = intAsWord length 1623 (* Make an array initialised to zero. *) 1624 val new_vec = LibrarySupport.allocString len 1625 in 1626 System_move_bytesA(vec, RunCall.unsafeCast new_vec, intAsWord start, wordSize, len); 1627 RunCall.clearMutableBit new_vec; 1628 new_vec 1629 end 1630 end 1631 1632 (* Copy a slice into an array. N.B. The arrays could be the same. *) 1633 fun copy {src, dst, di: int} = 1634 let 1635 val (src, start, length) = base src 1636 in 1637 if di < 0 orelse di+length > CharArray.length dst 1638 then raise General.Subscript 1639 else (* We can't use System_move_bytes because of the potential overlap problem. 1640 Instead we use explicit copying choosing to copy up or down depending 1641 on the index whether the source and destination are the same or not. *) 1642 let 1643 fun copyUp n = 1644 if n = length then () 1645 else (CharArray.update(dst, n+di, CharArray.sub(src, n+start)); copyUp(n+1)) 1646 1647 and copyDown n = 1648 if n < 0 then () 1649 else (CharArray.update(dst, n+di, CharArray.sub(src, n+start)); copyDown(n-1)) 1650 in 1651 if di > start then copyDown(length-1) else copyUp 0 1652 end 1653 end 1654 1655 (* Copy a vector slice into an array. *) 1656 fun copyVec {src: CharVectorSlice.slice, dst=Array (dlen, d), di: int} = 1657 let 1658 val (source, i, l) = CharVectorSlice.base src 1659 val len = intAsWord l and offset = intAsWord i 1660 val diW = unsignedShortOrRaiseSubscript di 1661 in 1662 if diW + len > dlen 1663 then raise General.Subscript 1664 (* The source is represented by a string whose first word is the length. *) 1665 else System_move_bytesA(RunCall.unsafeCast source, d, offset + wordSize, diW, len) 1666 end 1667 1668 end (* CharArraySlice *); 1669 1670 local 1671 (* Install the pretty printer for CharArraySlice.slice *) 1672 (* We may have to do this outside the structure if we 1673 have opaque signature matching. *) 1674 fun pretty _ _ x = 1675 PolyML.PrettyString(String.concat["\"", CharArraySlice.vector x, "\""]) 1676 in 1677 val _ = PolyML.addPrettyPrinter pretty 1678 end 1679 1680 structure StringCvt : STRING_CVT = StringCvt 1681end; 1682 1683val () = RunCall.addOverload Char.>= ">=" 1684and () = RunCall.addOverload Char.<= "<=" 1685and () = RunCall.addOverload Char.> ">" 1686and () = RunCall.addOverload Char.< "<"; 1687 1688val () = RunCall.addOverload String.>= ">=" 1689and () = RunCall.addOverload String.<= "<=" 1690and () = RunCall.addOverload String.> ">" 1691and () = RunCall.addOverload String.< "<"; 1692 1693(* Values available unqualified at the top level. *) 1694val ord : char -> int = Char.ord 1695val chr : int -> char = Char.chr 1696val concat : string list -> string =String.concat 1697val implode : char list -> string = String.implode 1698val explode : string -> char list = String.explode 1699val substring : string * int * int -> string = String.substring; 1700val op ^ : string * string -> string = String.^; 1701type substring = Substring.substring; 1702val size: string -> int = String.size; 1703val str: char -> string = String.str; 1704 1705(* These are declared in the prelude. *) 1706(* val size : string -> int = String.size 1707 val str : char -> string = String.str *) 1708