1// Written in the D programming language. 2 3/++ 4 $(P The `std.uni` module provides an implementation 5 of fundamental Unicode algorithms and data structures. 6 This doesn't include UTF encoding and decoding primitives, 7 see $(REF decode, std,_utf) and $(REF encode, std,_utf) in $(MREF std, utf) 8 for this functionality. ) 9 10$(SCRIPT inhibitQuickIndex = 1;) 11$(DIVC quickindex, 12$(BOOKTABLE, 13$(TR $(TH Category) $(TH Functions)) 14$(TR $(TD Decode) $(TD 15 $(LREF byCodePoint) 16 $(LREF byGrapheme) 17 $(LREF decodeGrapheme) 18 $(LREF graphemeStride) 19)) 20$(TR $(TD Comparison) $(TD 21 $(LREF icmp) 22 $(LREF sicmp) 23)) 24$(TR $(TD Classification) $(TD 25 $(LREF isAlpha) 26 $(LREF isAlphaNum) 27 $(LREF isCodepointSet) 28 $(LREF isControl) 29 $(LREF isFormat) 30 $(LREF isGraphical) 31 $(LREF isIntegralPair) 32 $(LREF isMark) 33 $(LREF isNonCharacter) 34 $(LREF isNumber) 35 $(LREF isPrivateUse) 36 $(LREF isPunctuation) 37 $(LREF isSpace) 38 $(LREF isSurrogate) 39 $(LREF isSurrogateHi) 40 $(LREF isSurrogateLo) 41 $(LREF isSymbol) 42 $(LREF isWhite) 43)) 44$(TR $(TD Normalization) $(TD 45 $(LREF NFC) 46 $(LREF NFD) 47 $(LREF NFKD) 48 $(LREF NormalizationForm) 49 $(LREF normalize) 50)) 51$(TR $(TD Decompose) $(TD 52 $(LREF decompose) 53 $(LREF decomposeHangul) 54 $(LREF UnicodeDecomposition) 55)) 56$(TR $(TD Compose) $(TD 57 $(LREF compose) 58 $(LREF composeJamo) 59)) 60$(TR $(TD Sets) $(TD 61 $(LREF CodepointInterval) 62 $(LREF CodepointSet) 63 $(LREF InversionList) 64 $(LREF unicode) 65)) 66$(TR $(TD Trie) $(TD 67 $(LREF codepointSetTrie) 68 $(LREF CodepointSetTrie) 69 $(LREF codepointTrie) 70 $(LREF CodepointTrie) 71 $(LREF toTrie) 72 $(LREF toDelegate) 73)) 74$(TR $(TD Casing) $(TD 75 $(LREF asCapitalized) 76 $(LREF asLowerCase) 77 $(LREF asUpperCase) 78 $(LREF isLower) 79 $(LREF isUpper) 80 $(LREF toLower) 81 $(LREF toLowerInPlace) 82 $(LREF toUpper) 83 $(LREF toUpperInPlace) 84)) 85$(TR $(TD Utf8Matcher) $(TD 86 $(LREF isUtfMatcher) 87 $(LREF MatcherConcept) 
88 $(LREF utfMatcher) 89)) 90$(TR $(TD Separators) $(TD 91 $(LREF lineSep) 92 $(LREF nelSep) 93 $(LREF paraSep) 94)) 95$(TR $(TD Building blocks) $(TD 96 $(LREF allowedIn) 97 $(LREF combiningClass) 98 $(LREF Grapheme) 99)) 100)) 101 102 $(P All primitives listed operate on Unicode characters and 103 sets of characters. For functions which operate on ASCII characters 104 and ignore Unicode $(CHARACTERS), see $(MREF std, ascii). 105 For definitions of Unicode $(CHARACTER), $(CODEPOINT) and other terms 106 used throughout this module see the $(S_LINK Terminology, terminology) section 107 below. 108 ) 109 $(P The focus of this module is the core needs of developing Unicode-aware 110 applications. To that effect it provides the following optimized primitives: 111 ) 112 $(UL 113 $(LI Character classification by category and common properties: 114 $(LREF isAlpha), $(LREF isWhite) and others. 115 ) 116 $(LI 117 Case-insensitive string comparison ($(LREF sicmp), $(LREF icmp)). 118 ) 119 $(LI 120 Converting text to any of the four normalization forms via $(LREF normalize). 121 ) 122 $(LI 123 Decoding ($(LREF decodeGrapheme)) and iteration ($(LREF byGrapheme), $(LREF graphemeStride)) 124 by user-perceived characters, that is by $(LREF Grapheme) clusters. 125 ) 126 $(LI 127 Decomposing and composing of individual character(s) according to canonical 128 or compatibility rules, see $(LREF compose) and $(LREF decompose), 129 including the specific version for Hangul syllables $(LREF composeJamo) 130 and $(LREF decomposeHangul). 131 ) 132 ) 133 $(P It's recognized that an application may need further enhancements 134 and extensions, such as less commonly known algorithms, 135 or tailoring existing ones for region specific needs. To help users 136 with building any extra functionality beyond the core primitives, 137 the module provides: 138 ) 139 $(UL 140 $(LI 141 $(LREF CodepointSet), a type for easy manipulation of sets of characters. 
142 Besides the typical set algebra it provides an unusual feature: 143 a D source code generator for detection of $(CODEPOINTS) in this set. 144 This is a boon for meta-programming parser frameworks, 145 and is used internally to power classification in small 146 sets like $(LREF isWhite). 147 ) 148 $(LI 149 A way to construct optimal packed multi-stage tables also known as a 150 special case of $(LINK2 https://en.wikipedia.org/wiki/Trie, Trie). 151 The functions $(LREF codepointTrie), $(LREF codepointSetTrie) 152 construct custom tries that map dchar to value. 153 The end result is a fast and predictable $(BIGOH 1) lookup that powers 154 functions like $(LREF isAlpha) and $(LREF combiningClass), 155 but for user-defined data sets. 156 ) 157 $(LI 158 A useful technique for Unicode-aware parsers that perform 159 character classification of encoded $(CODEPOINTS) 160 is to avoid unnecessary decoding at all costs. 161 $(LREF utfMatcher) provides an improvement over the usual workflow 162 of decode-classify-process, combining the decoding and classification 163 steps. By extracting necessary bits directly from encoded 164 $(S_LINK Code unit, code units) matchers achieve 165 significant performance improvements. See $(LREF MatcherConcept) for 166 the common interface of UTF matchers. 167 ) 168 $(LI 169 Generally useful building blocks for customized normalization: 170 $(LREF combiningClass) for querying combining class 171 and $(LREF allowedIn) for testing the Quick_Check 172 property of a given normalization form. 173 ) 174 $(LI 175 Access to a large selection of commonly used sets of $(CODEPOINTS). 176 $(S_LINK Unicode properties, Supported sets) include Script, 177 Block and General Category. The exact contents of a set can be 178 observed in the CLDR utility, on the 179 $(HTTP www.unicode.org/cldr/utility/properties.jsp, property index) page 180 of the Unicode website. 181 See $(LREF unicode) for easy and (optionally) compile-time checked set 182 queries. 
183 ) 184 ) 185 $(SECTION Synopsis) 186 --- 187 import std.uni; 188 void main() 189 { 190 // initialize code point sets using script/block or property name 191 // now 'set' contains code points from both scripts. 192 auto set = unicode("Cyrillic") | unicode("Armenian"); 193 // same thing but simpler and checked at compile-time 194 auto ascii = unicode.ASCII; 195 auto currency = unicode.Currency_Symbol; 196 197 // easy set ops 198 auto a = set & ascii; 199 assert(a.empty); // as it has no intersection with ascii 200 a = set | ascii; 201 auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian 202 203 // some properties of code point sets 204 assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2 205 // testing presence of a code point in a set 206 // is just fine, it is O(logN) 207 assert(!b['$']); 208 assert(!b['\u058F']); // Armenian dram sign 209 assert(b['¥']); 210 211 // building fast lookup tables, these guarantee O(1) complexity 212 // 1-level Trie lookup table essentially a huge bit-set ~262Kb 213 auto oneTrie = toTrie!1(b); 214 // 2-level far more compact but typically slightly slower 215 auto twoTrie = toTrie!2(b); 216 // 3-level even smaller, and a bit slower yet 217 auto threeTrie = toTrie!3(b); 218 assert(oneTrie['£']); 219 assert(twoTrie['£']); 220 assert(threeTrie['£']); 221 222 // build the trie with the most sensible trie level 223 // and bind it as a functor 224 auto cyrillicOrArmenian = toDelegate(set); 225 auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!"); 226 assert(balance == "ընկեր!"); 227 // compatible with bool delegate(dchar) 228 bool delegate(dchar) bindIt = cyrillicOrArmenian; 229 230 // Normalization 231 string s = "Plain ascii (and not only), is always normalized!"; 232 assert(s is normalize(s));// is the same string 233 234 string nonS = "A\u0308ffin"; // A ligature 235 auto nS = normalize(nonS); // to NFC, the W3C endorsed standard 236 assert(nS == "Äffin"); 237 assert(nS != nonS); 238 
string composed = "Äffin"; 239 240 assert(normalize!NFD(composed) == "A\u0308ffin"); 241 // to NFKD, compatibility decomposition useful for fuzzy matching/searching 242 assert(normalize!NFKD("2¹⁰") == "210"); 243 } 244 --- 245 $(SECTION Terminology) 246 $(P The following is a list of important Unicode notions 247 and definitions. Any conventions used specifically in this 248 module alone are marked as such. The descriptions are based on the formal 249 definition as found in $(HTTP www.unicode.org/versions/Unicode6.2.0/ch03.pdf, 250 chapter three of The Unicode Standard Core Specification.) 251 ) 252 $(P $(DEF Abstract character) A unit of information used for the organization, 253 control, or representation of textual data. 254 Note that: 255 $(UL 256 $(LI When representing data, the nature of that data 257 is generally symbolic as opposed to some other 258 kind of data (for example, visual). 259 ) 260 $(LI An abstract character has no concrete form 261 and should not be confused with a $(S_LINK Glyph, glyph). 262 ) 263 $(LI An abstract character does not necessarily 264 correspond to what a user thinks of as a “character” 265 and should not be confused with a $(LREF Grapheme). 266 ) 267 $(LI The abstract characters encoded (see Encoded character) 268 are known as Unicode abstract characters. 269 ) 270 $(LI Abstract characters not directly 271 encoded by the Unicode Standard can often be 272 represented by the use of combining character sequences. 273 ) 274 ) 275 ) 276 $(P $(DEF Canonical decomposition) 277 The decomposition of a character or character sequence 278 that results from recursively applying the canonical 279 mappings found in the Unicode Character Database 280 and those described in Conjoining Jamo Behavior 281 (section 12 of 282 $(HTTP www.unicode.org/uni2book/ch03.pdf, Unicode Conformance)). 
283 ) 284 $(P $(DEF Canonical composition) 285 The precise definition of the Canonical composition 286 is the algorithm as specified in $(HTTP www.unicode.org/uni2book/ch03.pdf, 287 Unicode Conformance) section 11. 288 Informally it's the process that does the reverse of the canonical 289 decomposition with the addition of certain rules 290 that e.g. prevent legacy characters from appearing in the composed result. 291 ) 292 $(P $(DEF Canonical equivalent) 293 Two character sequences are said to be canonical equivalents if 294 their full canonical decompositions are identical. 295 ) 296 $(P $(DEF Character) Typically differs by context. 297 For the purpose of this documentation the term $(I character) 298 implies $(I encoded character), that is, a code point having 299 an assigned abstract character (a symbolic meaning). 300 ) 301 $(P $(DEF Code point) Any value in the Unicode codespace; 302 that is, the range of integers from 0 to 10FFFF (hex). 303 Not all code points are assigned to encoded characters. 304 ) 305 $(P $(DEF Code unit) The minimal bit combination that can represent 306 a unit of encoded text for processing or interchange. 307 Depending on the encoding this could be: 308 8-bit code units in the UTF-8 (`char`), 309 16-bit code units in the UTF-16 (`wchar`), 310 and 32-bit code units in the UTF-32 (`dchar`). 311 $(I Note that in UTF-32, a code unit is a code point 312 and is represented by the D `dchar` type.) 313 ) 314 $(P $(DEF Combining character) A character with the General Category 315 of Combining Mark(M). 316 $(UL 317 $(LI All characters with non-zero canonical combining class 318 are combining characters, but the reverse is not the case: 319 there are combining characters with a zero combining class. 320 ) 321 $(LI These characters are not normally used in isolation 322 unless they are being described. They include such characters 323 as accents, diacritics, Hebrew points, Arabic vowel signs, 324 and Indic matras. 
325 ) 326 ) 327 ) 328 $(P $(DEF Combining class) 329 A numerical value used by the Unicode Canonical Ordering Algorithm 330 to determine which sequences of combining marks are to be 331 considered canonically equivalent and which are not. 332 ) 333 $(P $(DEF Compatibility decomposition) 334 The decomposition of a character or character sequence that results 335 from recursively applying both the compatibility mappings and 336 the canonical mappings found in the Unicode Character Database, and those 337 described in Conjoining Jamo Behavior, until no characters 338 can be further decomposed. 339 ) 340 $(P $(DEF Compatibility equivalent) 341 Two character sequences are said to be compatibility 342 equivalents if their full compatibility decompositions are identical. 343 ) 344 $(P $(DEF Encoded character) An association (or mapping) 345 between an abstract character and a code point. 346 ) 347 $(P $(DEF Glyph) The actual, concrete image of a glyph representation 348 having been rasterized or otherwise imaged onto some display surface. 349 ) 350 $(P $(DEF Grapheme base) A character with the property 351 Grapheme_Base, or any standard Korean syllable block. 352 ) 353 $(P $(DEF Grapheme cluster) Defined as the text between 354 grapheme boundaries as specified by Unicode Standard Annex #29, 355 $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation). 356 Important general properties of a grapheme: 357 $(UL 358 $(LI The grapheme cluster represents a horizontally segmentable 359 unit of text, consisting of some grapheme base (which may 360 consist of a Korean syllable) together with any number of 361 nonspacing marks applied to it. 362 ) 363 $(LI A grapheme cluster typically starts with a grapheme base 364 and then extends across any subsequent sequence of nonspacing marks. 365 A grapheme cluster is most directly relevant to text rendering and 366 processes such as cursor placement and text selection in editing, 367 but may also be relevant to comparison and searching. 
368 ) 369 $(LI For many processes, a grapheme cluster behaves as if it was a 370 single character with the same properties as its grapheme base. 371 Effectively, nonspacing marks apply $(I graphically) to the base, 372 but do not change its properties. 373 ) 374 ) 375 $(P This module defines a number of primitives that work with graphemes: 376 $(LREF Grapheme), $(LREF decodeGrapheme) and $(LREF graphemeStride). 377 All of them are using $(I extended grapheme) boundaries 378 as defined in the aforementioned standard annex. 379 ) 380 ) 381 $(P $(DEF Nonspacing mark) A combining character with the 382 General Category of Nonspacing Mark (Mn) or Enclosing Mark (Me). 383 ) 384 $(P $(DEF Spacing mark) A combining character that is not a nonspacing mark. 385 ) 386 $(SECTION Normalization) 387 $(P The concepts of $(S_LINK Canonical equivalent, canonical equivalent) 388 or $(S_LINK Compatibility equivalent, compatibility equivalent) 389 characters in the Unicode Standard make it necessary to have a full, formal 390 definition of equivalence for Unicode strings. 391 String equivalence is determined by a process called normalization, 392 whereby strings are converted into forms which are compared 393 directly for identity. This is the primary goal of the normalization process, 394 see the function $(LREF normalize) to convert into any of 395 the four defined forms. 396 ) 397 $(P A very important attribute of the Unicode Normalization Forms 398 is that they must remain stable between versions of the Unicode Standard. 399 A Unicode string normalized to a particular Unicode Normalization Form 400 in one version of the standard is guaranteed to remain in that Normalization 401 Form for implementations of future versions of the standard. 402 ) 403 $(P The Unicode Standard specifies four normalization forms. 
404 Informally, two of these forms are defined by maximal decomposition 405 of equivalent sequences, and two of these forms are defined 406 by maximal $(I composition) of equivalent sequences. 407 $(UL 408 $(LI Normalization Form D (NFD): The $(S_LINK Canonical decomposition, 409 canonical decomposition) of a character sequence.) 410 $(LI Normalization Form KD (NFKD): The $(S_LINK Compatibility decomposition, 411 compatibility decomposition) of a character sequence.) 412 $(LI Normalization Form C (NFC): The canonical composition of the 413 $(S_LINK Canonical decomposition, canonical decomposition) 414 of a coded character sequence.) 415 $(LI Normalization Form KC (NFKC): The canonical composition 416 of the $(S_LINK Compatibility decomposition, 417 compatibility decomposition) of a character sequence.) 418 ) 419 ) 420 $(P The choice of the normalization form depends on the particular use case. 421 NFC is the best form for general text, since it's more compatible with 422 strings converted from legacy encodings. NFKC is the preferred form for 423 identifiers, especially where there are security concerns. NFD and NFKD 424 are the most useful for internal processing. 425 ) 426 $(SECTION Construction of lookup tables) 427 $(P The Unicode standard describes a set of algorithms that 428 depend on having the ability to quickly look up various properties 429 of a code point. Given the codespace of about 1 million $(CODEPOINTS), 430 it is not a trivial task to provide a space-efficient solution for 431 the multitude of properties. 432 ) 433 $(P Common approaches such as hash-tables or binary search over 434 sorted code point intervals (as in $(LREF InversionList)) are insufficient. 435 Hash-tables have enormous memory footprint and binary search 436 over intervals is not fast enough for some heavy-duty algorithms. 
437 ) 438 $(P The recommended solution (see Unicode Implementation Guidelines) 439 is using multi-stage tables that are an implementation of the 440 $(HTTP en.wikipedia.org/wiki/Trie, Trie) data structure with integer 441 keys and a fixed number of stages. For the remainder of the section 442 this will be called a fixed trie. The following describes a particular 443 implementation that is aimed for the speed of access at the expense 444 of ideal size savings. 445 ) 446 $(P Taking a 2-level Trie as an example the principle of operation is as follows. 447 Split the number of bits in a key (code point, 21 bits) into 2 components 448 (e.g. 13 and 8). The first is the number of bits in the index of the trie 449 and the other is number of bits in each page of the trie. 450 The layout of the trie is then an array of size 2^^bits-of-index followed by 451 an array of memory chunks of size 2^^bits-of-page/bits-per-element. 452 ) 453 $(P The number of pages is variable (but not less than 1) 454 unlike the number of entries in the index. The slots of the index 455 all have to contain a number of a page that is present. The lookup is then 456 just a couple of operations - slice the upper bits, 457 lookup an index for these, take a page at this index and use 458 the lower bits as an offset within this page. 459 460 Assuming that pages are laid out consecutively 461 in one array at `pages`, the pseudo-code is: 462 ) 463 --- 464 auto elemsPerPage = (2 ^^ bits_per_page) / Value.sizeOfInBits; 465 pages[index[n >> bits_per_page]][n & (elemsPerPage - 1)]; 466 --- 467 $(P Where if `elemsPerPage` is a power of 2 the whole process is 468 a handful of simple instructions and 2 array reads. Subsequent levels 469 of the trie are introduced by recursing on this notion - the index array 470 is treated as values. The number of bits in index is then again 471 split into 2 parts, with pages over 'current-index' and the new 'upper-index'. 
472 ) 473 474 $(P For completeness a level 1 trie is simply an array. 475 The current implementation takes advantage of bit-packing values 476 when the range is known to be limited in advance (such as `bool`). 477 See also $(LREF BitPacked) for enforcing it manually. 478 The major size advantage however comes from the fact 479 that multiple $(B identical pages on every level are merged) by construction. 480 ) 481 $(P The process of constructing a trie is more involved and is hidden from 482 the user in a form of the convenience functions $(LREF codepointTrie), 483 $(LREF codepointSetTrie) and the even more convenient $(LREF toTrie). 484 In general a set or built-in AA with `dchar` type 485 can be turned into a trie. The trie object in this module 486 is read-only (immutable); it's effectively frozen after construction. 487 ) 488 $(SECTION Unicode properties) 489 $(P This is a full list of Unicode properties accessible through $(LREF unicode) 490 with specific helpers per category nested within. Consult the 491 $(HTTP www.unicode.org/cldr/utility/properties.jsp, CLDR utility) 492 when in doubt about the contents of a particular set. 493 ) 494 $(P General category sets listed below are only accessible with the 495 $(LREF unicode) shorthand accessor.) 496 $(BOOKTABLE $(B General category ), 497 $(TR $(TH Abb.) $(TH Long form) 498 $(TH Abb.) $(TH Long form)$(TH Abb.) 
$(TH Long form)) 499 $(TR $(TD L) $(TD Letter) 500 $(TD Cn) $(TD Unassigned) $(TD Po) $(TD Other_Punctuation)) 501 $(TR $(TD Ll) $(TD Lowercase_Letter) 502 $(TD Co) $(TD Private_Use) $(TD Ps) $(TD Open_Punctuation)) 503 $(TR $(TD Lm) $(TD Modifier_Letter) 504 $(TD Cs) $(TD Surrogate) $(TD S) $(TD Symbol)) 505 $(TR $(TD Lo) $(TD Other_Letter) 506 $(TD N) $(TD Number) $(TD Sc) $(TD Currency_Symbol)) 507 $(TR $(TD Lt) $(TD Titlecase_Letter) 508 $(TD Nd) $(TD Decimal_Number) $(TD Sk) $(TD Modifier_Symbol)) 509 $(TR $(TD Lu) $(TD Uppercase_Letter) 510 $(TD Nl) $(TD Letter_Number) $(TD Sm) $(TD Math_Symbol)) 511 $(TR $(TD M) $(TD Mark) 512 $(TD No) $(TD Other_Number) $(TD So) $(TD Other_Symbol)) 513 $(TR $(TD Mc) $(TD Spacing_Mark) 514 $(TD P) $(TD Punctuation) $(TD Z) $(TD Separator)) 515 $(TR $(TD Me) $(TD Enclosing_Mark) 516 $(TD Pc) $(TD Connector_Punctuation) $(TD Zl) $(TD Line_Separator)) 517 $(TR $(TD Mn) $(TD Nonspacing_Mark) 518 $(TD Pd) $(TD Dash_Punctuation) $(TD Zp) $(TD Paragraph_Separator)) 519 $(TR $(TD C) $(TD Other) 520 $(TD Pe) $(TD Close_Punctuation) $(TD Zs) $(TD Space_Separator)) 521 $(TR $(TD Cc) $(TD Control) $(TD Pf) 522 $(TD Final_Punctuation) $(TD -) $(TD Any)) 523 $(TR $(TD Cf) $(TD Format) 524 $(TD Pi) $(TD Initial_Punctuation) $(TD -) $(TD ASCII)) 525 ) 526 $(P Sets for other commonly useful properties that are 527 accessible with $(LREF unicode):) 528 $(BOOKTABLE $(B Common binary properties), 529 $(TR $(TH Name) $(TH Name) $(TH Name)) 530 $(TR $(TD Alphabetic) $(TD Ideographic) $(TD Other_Uppercase)) 531 $(TR $(TD ASCII_Hex_Digit) $(TD IDS_Binary_Operator) $(TD Pattern_Syntax)) 532 $(TR $(TD Bidi_Control) $(TD ID_Start) $(TD Pattern_White_Space)) 533 $(TR $(TD Cased) $(TD IDS_Trinary_Operator) $(TD Quotation_Mark)) 534 $(TR $(TD Case_Ignorable) $(TD Join_Control) $(TD Radical)) 535 $(TR $(TD Dash) $(TD Logical_Order_Exception) $(TD Soft_Dotted)) 536 $(TR $(TD Default_Ignorable_Code_Point) $(TD Lowercase) $(TD STerm)) 537 $(TR $(TD 
Deprecated) $(TD Math) $(TD Terminal_Punctuation)) 538 $(TR $(TD Diacritic) $(TD Noncharacter_Code_Point) $(TD Unified_Ideograph)) 539 $(TR $(TD Extender) $(TD Other_Alphabetic) $(TD Uppercase)) 540 $(TR $(TD Grapheme_Base) $(TD Other_Default_Ignorable_Code_Point) $(TD Variation_Selector)) 541 $(TR $(TD Grapheme_Extend) $(TD Other_Grapheme_Extend) $(TD White_Space)) 542 $(TR $(TD Grapheme_Link) $(TD Other_ID_Continue) $(TD XID_Continue)) 543 $(TR $(TD Hex_Digit) $(TD Other_ID_Start) $(TD XID_Start)) 544 $(TR $(TD Hyphen) $(TD Other_Lowercase) ) 545 $(TR $(TD ID_Continue) $(TD Other_Math) ) 546 ) 547 $(P Below is the table with block names accepted by $(LREF unicode.block). 548 Note that the shorthand version $(LREF unicode) requires "In" 549 to be prepended to the names of blocks so as to disambiguate 550 scripts and blocks. 551 ) 552 $(BOOKTABLE $(B Blocks), 553 $(TR $(TD Aegean Numbers) $(TD Ethiopic Extended) $(TD Mongolian)) 554 $(TR $(TD Alchemical Symbols) $(TD Ethiopic Extended-A) $(TD Musical Symbols)) 555 $(TR $(TD Alphabetic Presentation Forms) $(TD Ethiopic Supplement) $(TD Myanmar)) 556 $(TR $(TD Ancient Greek Musical Notation) $(TD General Punctuation) $(TD Myanmar Extended-A)) 557 $(TR $(TD Ancient Greek Numbers) $(TD Geometric Shapes) $(TD New Tai Lue)) 558 $(TR $(TD Ancient Symbols) $(TD Georgian) $(TD NKo)) 559 $(TR $(TD Arabic) $(TD Georgian Supplement) $(TD Number Forms)) 560 $(TR $(TD Arabic Extended-A) $(TD Glagolitic) $(TD Ogham)) 561 $(TR $(TD Arabic Mathematical Alphabetic Symbols) $(TD Gothic) $(TD Ol Chiki)) 562 $(TR $(TD Arabic Presentation Forms-A) $(TD Greek and Coptic) $(TD Old Italic)) 563 $(TR $(TD Arabic Presentation Forms-B) $(TD Greek Extended) $(TD Old Persian)) 564 $(TR $(TD Arabic Supplement) $(TD Gujarati) $(TD Old South Arabian)) 565 $(TR $(TD Armenian) $(TD Gurmukhi) $(TD Old Turkic)) 566 $(TR $(TD Arrows) $(TD Halfwidth and Fullwidth Forms) $(TD Optical Character Recognition)) 567 $(TR $(TD Avestan) $(TD Hangul 
Compatibility Jamo) $(TD Oriya)) 568 $(TR $(TD Balinese) $(TD Hangul Jamo) $(TD Osmanya)) 569 $(TR $(TD Bamum) $(TD Hangul Jamo Extended-A) $(TD Phags-pa)) 570 $(TR $(TD Bamum Supplement) $(TD Hangul Jamo Extended-B) $(TD Phaistos Disc)) 571 $(TR $(TD Basic Latin) $(TD Hangul Syllables) $(TD Phoenician)) 572 $(TR $(TD Batak) $(TD Hanunoo) $(TD Phonetic Extensions)) 573 $(TR $(TD Bengali) $(TD Hebrew) $(TD Phonetic Extensions Supplement)) 574 $(TR $(TD Block Elements) $(TD High Private Use Surrogates) $(TD Playing Cards)) 575 $(TR $(TD Bopomofo) $(TD High Surrogates) $(TD Private Use Area)) 576 $(TR $(TD Bopomofo Extended) $(TD Hiragana) $(TD Rejang)) 577 $(TR $(TD Box Drawing) $(TD Ideographic Description Characters) $(TD Rumi Numeral Symbols)) 578 $(TR $(TD Brahmi) $(TD Imperial Aramaic) $(TD Runic)) 579 $(TR $(TD Braille Patterns) $(TD Inscriptional Pahlavi) $(TD Samaritan)) 580 $(TR $(TD Buginese) $(TD Inscriptional Parthian) $(TD Saurashtra)) 581 $(TR $(TD Buhid) $(TD IPA Extensions) $(TD Sharada)) 582 $(TR $(TD Byzantine Musical Symbols) $(TD Javanese) $(TD Shavian)) 583 $(TR $(TD Carian) $(TD Kaithi) $(TD Sinhala)) 584 $(TR $(TD Chakma) $(TD Kana Supplement) $(TD Small Form Variants)) 585 $(TR $(TD Cham) $(TD Kanbun) $(TD Sora Sompeng)) 586 $(TR $(TD Cherokee) $(TD Kangxi Radicals) $(TD Spacing Modifier Letters)) 587 $(TR $(TD CJK Compatibility) $(TD Kannada) $(TD Specials)) 588 $(TR $(TD CJK Compatibility Forms) $(TD Katakana) $(TD Sundanese)) 589 $(TR $(TD CJK Compatibility Ideographs) $(TD Katakana Phonetic Extensions) $(TD Sundanese Supplement)) 590 $(TR $(TD CJK Compatibility Ideographs Supplement) $(TD Kayah Li) $(TD Superscripts and Subscripts)) 591 $(TR $(TD CJK Radicals Supplement) $(TD Kharoshthi) $(TD Supplemental Arrows-A)) 592 $(TR $(TD CJK Strokes) $(TD Khmer) $(TD Supplemental Arrows-B)) 593 $(TR $(TD CJK Symbols and Punctuation) $(TD Khmer Symbols) $(TD Supplemental Mathematical Operators)) 594 $(TR $(TD CJK Unified Ideographs) $(TD Lao) $(TD 
Supplemental Punctuation)) 595 $(TR $(TD CJK Unified Ideographs Extension A) $(TD Latin-1 Supplement) $(TD Supplementary Private Use Area-A)) 596 $(TR $(TD CJK Unified Ideographs Extension B) $(TD Latin Extended-A) $(TD Supplementary Private Use Area-B)) 597 $(TR $(TD CJK Unified Ideographs Extension C) $(TD Latin Extended Additional) $(TD Syloti Nagri)) 598 $(TR $(TD CJK Unified Ideographs Extension D) $(TD Latin Extended-B) $(TD Syriac)) 599 $(TR $(TD Combining Diacritical Marks) $(TD Latin Extended-C) $(TD Tagalog)) 600 $(TR $(TD Combining Diacritical Marks for Symbols) $(TD Latin Extended-D) $(TD Tagbanwa)) 601 $(TR $(TD Combining Diacritical Marks Supplement) $(TD Lepcha) $(TD Tags)) 602 $(TR $(TD Combining Half Marks) $(TD Letterlike Symbols) $(TD Tai Le)) 603 $(TR $(TD Common Indic Number Forms) $(TD Limbu) $(TD Tai Tham)) 604 $(TR $(TD Control Pictures) $(TD Linear B Ideograms) $(TD Tai Viet)) 605 $(TR $(TD Coptic) $(TD Linear B Syllabary) $(TD Tai Xuan Jing Symbols)) 606 $(TR $(TD Counting Rod Numerals) $(TD Lisu) $(TD Takri)) 607 $(TR $(TD Cuneiform) $(TD Low Surrogates) $(TD Tamil)) 608 $(TR $(TD Cuneiform Numbers and Punctuation) $(TD Lycian) $(TD Telugu)) 609 $(TR $(TD Currency Symbols) $(TD Lydian) $(TD Thaana)) 610 $(TR $(TD Cypriot Syllabary) $(TD Mahjong Tiles) $(TD Thai)) 611 $(TR $(TD Cyrillic) $(TD Malayalam) $(TD Tibetan)) 612 $(TR $(TD Cyrillic Extended-A) $(TD Mandaic) $(TD Tifinagh)) 613 $(TR $(TD Cyrillic Extended-B) $(TD Mathematical Alphanumeric Symbols) $(TD Transport And Map Symbols)) 614 $(TR $(TD Cyrillic Supplement) $(TD Mathematical Operators) $(TD Ugaritic)) 615 $(TR $(TD Deseret) $(TD Meetei Mayek) $(TD Unified Canadian Aboriginal Syllabics)) 616 $(TR $(TD Devanagari) $(TD Meetei Mayek Extensions) $(TD Unified Canadian Aboriginal Syllabics Extended)) 617 $(TR $(TD Devanagari Extended) $(TD Meroitic Cursive) $(TD Vai)) 618 $(TR $(TD Dingbats) $(TD Meroitic Hieroglyphs) $(TD Variation Selectors)) 619 $(TR $(TD Domino Tiles) $(TD 
Miao) $(TD Variation Selectors Supplement)) 620 $(TR $(TD Egyptian Hieroglyphs) $(TD Miscellaneous Mathematical Symbols-A) $(TD Vedic Extensions)) 621 $(TR $(TD Emoticons) $(TD Miscellaneous Mathematical Symbols-B) $(TD Vertical Forms)) 622 $(TR $(TD Enclosed Alphanumerics) $(TD Miscellaneous Symbols) $(TD Yijing Hexagram Symbols)) 623 $(TR $(TD Enclosed Alphanumeric Supplement) $(TD Miscellaneous Symbols and Arrows) $(TD Yi Radicals)) 624 $(TR $(TD Enclosed CJK Letters and Months) $(TD Miscellaneous Symbols And Pictographs) $(TD Yi Syllables)) 625 $(TR $(TD Enclosed Ideographic Supplement) $(TD Miscellaneous Technical) ) 626 $(TR $(TD Ethiopic) $(TD Modifier Tone Letters) ) 627 ) 628 $(P Below is the table with script names accepted by $(LREF unicode.script) 629 and by the shorthand version $(LREF unicode):) 630 $(BOOKTABLE $(B Scripts), 631 $(TR $(TD Arabic) $(TD Hanunoo) $(TD Old_Italic)) 632 $(TR $(TD Armenian) $(TD Hebrew) $(TD Old_Persian)) 633 $(TR $(TD Avestan) $(TD Hiragana) $(TD Old_South_Arabian)) 634 $(TR $(TD Balinese) $(TD Imperial_Aramaic) $(TD Old_Turkic)) 635 $(TR $(TD Bamum) $(TD Inherited) $(TD Oriya)) 636 $(TR $(TD Batak) $(TD Inscriptional_Pahlavi) $(TD Osmanya)) 637 $(TR $(TD Bengali) $(TD Inscriptional_Parthian) $(TD Phags_Pa)) 638 $(TR $(TD Bopomofo) $(TD Javanese) $(TD Phoenician)) 639 $(TR $(TD Brahmi) $(TD Kaithi) $(TD Rejang)) 640 $(TR $(TD Braille) $(TD Kannada) $(TD Runic)) 641 $(TR $(TD Buginese) $(TD Katakana) $(TD Samaritan)) 642 $(TR $(TD Buhid) $(TD Kayah_Li) $(TD Saurashtra)) 643 $(TR $(TD Canadian_Aboriginal) $(TD Kharoshthi) $(TD Sharada)) 644 $(TR $(TD Carian) $(TD Khmer) $(TD Shavian)) 645 $(TR $(TD Chakma) $(TD Lao) $(TD Sinhala)) 646 $(TR $(TD Cham) $(TD Latin) $(TD Sora_Sompeng)) 647 $(TR $(TD Cherokee) $(TD Lepcha) $(TD Sundanese)) 648 $(TR $(TD Common) $(TD Limbu) $(TD Syloti_Nagri)) 649 $(TR $(TD Coptic) $(TD Linear_B) $(TD Syriac)) 650 $(TR $(TD Cuneiform) $(TD Lisu) $(TD Tagalog)) 651 $(TR $(TD Cypriot) $(TD Lycian) 
$(TD Tagbanwa)) 652 $(TR $(TD Cyrillic) $(TD Lydian) $(TD Tai_Le)) 653 $(TR $(TD Deseret) $(TD Malayalam) $(TD Tai_Tham)) 654 $(TR $(TD Devanagari) $(TD Mandaic) $(TD Tai_Viet)) 655 $(TR $(TD Egyptian_Hieroglyphs) $(TD Meetei_Mayek) $(TD Takri)) 656 $(TR $(TD Ethiopic) $(TD Meroitic_Cursive) $(TD Tamil)) 657 $(TR $(TD Georgian) $(TD Meroitic_Hieroglyphs) $(TD Telugu)) 658 $(TR $(TD Glagolitic) $(TD Miao) $(TD Thaana)) 659 $(TR $(TD Gothic) $(TD Mongolian) $(TD Thai)) 660 $(TR $(TD Greek) $(TD Myanmar) $(TD Tibetan)) 661 $(TR $(TD Gujarati) $(TD New_Tai_Lue) $(TD Tifinagh)) 662 $(TR $(TD Gurmukhi) $(TD Nko) $(TD Ugaritic)) 663 $(TR $(TD Han) $(TD Ogham) $(TD Vai)) 664 $(TR $(TD Hangul) $(TD Ol_Chiki) $(TD Yi)) 665 ) 666 $(P Below is the table of names accepted by $(LREF unicode.hangulSyllableType).) 667 $(BOOKTABLE $(B Hangul syllable type), 668 $(TR $(TH Abb.) $(TH Long form)) 669 $(TR $(TD L) $(TD Leading_Jamo)) 670 $(TR $(TD LV) $(TD LV_Syllable)) 671 $(TR $(TD LVT) $(TD LVT_Syllable) ) 672 $(TR $(TD T) $(TD Trailing_Jamo)) 673 $(TR $(TD V) $(TD Vowel_Jamo)) 674 ) 675 References: 676 $(HTTP www.digitalmars.com/d/ascii-table.html, ASCII Table), 677 $(HTTP en.wikipedia.org/wiki/Unicode, Wikipedia), 678 $(HTTP www.unicode.org, The Unicode Consortium), 679 $(HTTP www.unicode.org/reports/tr15/, Unicode normalization forms), 680 $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation) 681 $(HTTP www.unicode.org/uni2book/ch05.pdf, 682 Unicode Implementation Guidelines) 683 $(HTTP www.unicode.org/uni2book/ch03.pdf, 684 Unicode Conformance) 685 Trademarks: 686 Unicode(tm) is a trademark of Unicode, Inc. 687 688 Copyright: Copyright 2013 - 689 License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0). 
690 Authors: Dmitry Olshansky 691 Source: $(PHOBOSSRC std/uni/package.d) 692 Standards: $(HTTP www.unicode.org/versions/Unicode6.2.0/, Unicode v6.2) 693 694Macros: 695 696SECTION = <h3><a id="$1">$0</a></h3> 697DEF = <div><a id="$1"><i>$0</i></a></div> 698S_LINK = <a href="#$1">$+</a> 699CODEPOINT = $(S_LINK Code point, code point) 700CODEPOINTS = $(S_LINK Code point, code points) 701CHARACTER = $(S_LINK Character, character) 702CHARACTERS = $(S_LINK Character, characters) 703CLUSTER = $(S_LINK Grapheme cluster, grapheme cluster) 704+/ 705module std.uni; 706 707import std.meta : AliasSeq; 708import std.range.primitives : back, ElementEncodingType, ElementType, empty, 709 front, hasLength, hasSlicing, isForwardRange, isInputRange, 710 isRandomAccessRange, popFront, put, save; 711import std.traits : isConvertibleToString, isIntegral, isSomeChar, 712 isSomeString, Unqual, isDynamicArray; 713// debug = std_uni; 714 715debug(std_uni) import std.stdio; // writefln, writeln 716 717private: 718 719 720void copyBackwards(T,U)(T[] src, U[] dest) 721{ 722 assert(src.length == dest.length); 723 for (size_t i=src.length; i-- > 0; ) 724 dest[i] = src[i]; 725} 726 727void copyForward(T,U)(T[] src, U[] dest) 728{ 729 assert(src.length == dest.length); 730 for (size_t i=0; i<src.length; i++) 731 dest[i] = src[i]; 732} 733 734// TODO: update to reflect all major CPUs supporting unaligned reads 735version (X86) 736 enum hasUnalignedReads = true; 737else version (X86_64) 738 enum hasUnalignedReads = true; 739else version (SystemZ) 740 enum hasUnalignedReads = true; 741else 742 enum hasUnalignedReads = false; // better be safe then sorry 743 744public enum dchar lineSep = '\u2028'; /// Constant $(CODEPOINT) (0x2028) - line separator. 745public enum dchar paraSep = '\u2029'; /// Constant $(CODEPOINT) (0x2029) - paragraph separator. 746public enum dchar nelSep = '\u0085'; /// Constant $(CODEPOINT) (0x0085) - next line. 

// test the intro example
// Non-ASCII literals are spelled with \u escapes so the test does not depend
// on how the source file's bytes are decoded.
@safe unittest
{
    import std.algorithm.searching : find;
    // initialize code point sets using script/block or property name
    // set contains code points from both scripts.
    auto set = unicode("Cyrillic") | unicode("Armenian");
    // or simpler and statically-checked look
    auto ascii = unicode.ASCII;
    auto currency = unicode.Currency_Symbol;

    // easy set ops
    auto a = set & ascii;
    assert(a.empty); // as it has no intersection with ascii
    a = set | ascii;
    auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian

    // some properties of code point sets
    assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
    // testing presence of a code point in a set
    // is just fine, it is O(logN)
    assert(!b['$']);
    assert(!b['\u058F']); // Armenian dram sign
    assert(b['\u00A5']); // yen sign

    // building fast lookup tables, these guarantee O(1) complexity
    // 1-level Trie lookup table essentially a huge bit-set ~262Kb
    auto oneTrie = toTrie!1(b);
    // 2-level far more compact but typically slightly slower
    auto twoTrie = toTrie!2(b);
    // 3-level even smaller, and a bit slower yet
    auto threeTrie = toTrie!3(b);
    assert(oneTrie['\u00A3']); // pound sign
    assert(twoTrie['\u00A3']);
    assert(threeTrie['\u00A3']);

    // build the trie with the most sensible trie level
    // and bind it as a functor
    auto cyrillicOrArmenian = toDelegate(set);
    // the word after "Hello " consists of five Armenian letters
    auto balance = find!(cyrillicOrArmenian)("Hello \u0568\u0576\u056F\u0565\u0580!");
    assert(balance == "\u0568\u0576\u056F\u0565\u0580!");
    // compatible with bool delegate(dchar)
    bool delegate(dchar) bindIt = cyrillicOrArmenian;

    // Normalization
    string s = "Plain ascii (and not only), is always normalized!";
    assert(s is normalize(s));// is the same string

    string nonS = "A\u0308ffin"; // A ligature
    auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
    assert(nS == "\u00C4ffin"); // precomposed A with diaeresis
    assert(nS != nonS);
    string composed = "\u00C4ffin";

    assert(normalize!NFD(composed) == "A\u0308ffin");
    // to NFKD, compatibility decomposition useful for fuzzy matching/searching
    // (superscript one and superscript zero fold to plain digits)
    assert(normalize!NFKD("2\u00B9\u2070") == "210");
}

// The largest valid code point value.
enum lastDchar = 0x10FFFF;

// Checked narrowing conversion to the integral type `T`:
// asserts that `from` fits into T's range, then casts.
auto force(T, F)(F from)
if (isIntegral!T && !is(T == F))
{
    assert(from <= T.max && from >= T.min);
    return cast(T) from;
}

// Checked conversion to a BitPacked type `T`:
// asserts that `from` fits into the packed bit width, then wraps it.
auto force(T, F)(F from)
if (isBitPacked!T && !is(T == F))
{
    assert(from <= 2^^bitSizeOf!T-1);
    return T(cast(TypeOfBitPacked!T) from);
}

// Identity overload: nothing to convert when the types already match.
auto force(T, F)(F from)
if (is(T == F))
{
    return from;
}

// repeat X times the bit-pattern in val assuming its length is 'bits'
//
// Params:
//  times = (template) how many copies of the pattern to produce
//  bits  = (template) width of the pattern in bits
//  val   = the pattern itself, stored in the low `bits` bits
// Returns: a word with `times` adjacent copies of `val`, lowest copy first.
size_t replicateBits(size_t times, size_t bits)(size_t val) @safe pure nothrow @nogc
{
    static if (times == 1)
        return val;
    else static if (bits == 1)
    {
        // a single bit replicated `times` times is either all-ones or zero
        static if (times == size_t.sizeof*8)
            return val ? size_t.max : 0;
        else
            // shift a size_t, not an int: an int shift is limited to 31 bits
            // and is rejected by the compiler for 32 .. 63 replications
            return val ? (size_t(1) << times)-1 : 0;
    }
    else static if (times % 2)
        // odd count: peel one copy off and recurse on the even remainder
        return (replicateBits!(times-1, bits)(val)<<bits) | val;
    else
        // even count: double the pattern width and halve the count
        return replicateBits!(times/2, bits*2)((val << bits) | val);
}

@safe pure nothrow @nogc unittest // for replicate
{
    import std.algorithm.iteration : sum, map;
    import std.range : iota;
    size_t m = 0b111;
    size_t m2 = 0b01;
    static foreach (i; AliasSeq!(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
    {
        assert(replicateBits!(i, 3)(m)+1 == (1<<(3*i)));
        assert(replicateBits!(i, 2)(m2) == iota(0, i).map!"2^^(2*a)"().sum());
    }
}

// multiple arrays squashed into one memory block
struct MultiArray(Types...)
{
    import std.range.primitives : isOutputRange;
    this(size_t[] sizes...)
    @safe pure nothrow
    {
        // Allocating constructor: one item count per packed level.
        assert(dim == sizes.length);
        size_t full_size;
        foreach (i, v; Types)
        {
            // words needed to hold sizes[i] items of type v
            full_size += spaceFor!(bitSizeOf!v)(sizes[i]);
            sz[i] = sizes[i];
            // every level starts right after the words of the previous one
            static if (i >= 1)
                offsets[i] = offsets[i-1] +
                    spaceFor!(bitSizeOf!(Types[i-1]))(sizes[i-1]);
        }

        storage = new size_t[full_size];
    }

    // Wrapping constructor: builds a const instance from previously computed
    // offsets, sizes and storage words (the same three arrays `store` prints).
    // Offsets and sizes are copied; `data` is referenced, not duplicated.
    this(const(size_t)[] raw_offsets,
        const(size_t)[] raw_sizes,
        return scope const(size_t)[] data) return scope const @safe pure nothrow @nogc
    {
        offsets[] = raw_offsets[];
        sz[] = raw_sizes[];
        storage = data;
    }

    // Packed array view over level `n`, limited to that level's item count.
    @property auto slice(size_t n)()inout pure nothrow @nogc
    {
        auto ptr = raw_ptr!n;
        return packedArrayView!(Types[n])(ptr, sz[n]);
    }

    // Unchecked packed pointer to the start of level `n`.
    @property auto ptr(size_t n)()inout pure nothrow @nogc
    {
        auto ptr = raw_ptr!n;
        return inout(PackedPtr!(Types[n]))(ptr);
    }

    // Getter/setter pair for the number of items stored in level `n`.
    template length(size_t n)
    {
        @property size_t length()const @safe pure nothrow @nogc{ return sz[n]; }

        // Resizes level `n`; the words of all following levels are moved
        // and their offsets adjusted.
        // NOTE(review): the word delta is computed as spaceFor(item delta),
        // not as spaceFor(new size) - spaceFor(old size); confirm these agree
        // for sizes that are not multiples of the items-per-word factor.
        @property void length(size_t new_size)
        {
            if (new_size > sz[n])
            {// extend
                size_t delta = (new_size - sz[n]);
                sz[n] += delta;
                // from here on delta is measured in storage words
                delta = spaceFor!(bitSizeOf!(Types[n]))(delta);
                storage.length += delta;// extend space at end
                // raw_slice!x must follow resize as it could be moved!
                // next stmts move all data past this array, last-one-goes-first
                static if (n != dim-1)
                {
                    auto start = raw_ptr!(n+1);
                    // len includes delta
                    size_t len = (storage.ptr+storage.length-start);

                    copyBackwards(start[0 .. len-delta], start[delta .. len]);

                    // zero the gap that now belongs to level n
                    start[0 .. delta] = 0;
                    // offsets are used for raw_slice, ptr etc.
                    foreach (i; n+1 .. dim)
                        offsets[i] += delta;
                }
            }
            else if (new_size < sz[n])
            {// shrink
                size_t delta = (sz[n] - new_size);
                sz[n] -= delta;
                // from here on delta is measured in storage words
                delta = spaceFor!(bitSizeOf!(Types[n]))(delta);
                // move all data past this array, forward direction
                // NOTE(review): the copy below targets start[delta .. len],
                // i.e. higher indices, while the offsets are decremented
                // afterwards - looks inverted for a shrink; confirm.
                static if (n != dim-1)
                {
                    auto start = raw_ptr!(n+1);
                    size_t len = (storage.ptr+storage.length-start);
                    copyForward(start[0 .. len-delta], start[delta .. len]);

                    // adjust offsets last, they affect raw_slice
                    foreach (i; n+1 .. dim)
                        offsets[i] -= delta;
                }
                storage.length -= delta;
            }
            // else - NOP
        }
    }

    // Size in bytes of level `n`, or of the whole storage when `n` is left
    // at its default of size_t.max.
    @property size_t bytes(size_t n=size_t.max)() const @safe
    {
        static if (n == size_t.max)
            return storage.length*size_t.sizeof;
        else static if (n != Types.length-1)
            return (raw_ptr!(n+1)-raw_ptr!n)*size_t.sizeof;
        else
            return (storage.ptr+storage.length - raw_ptr!n)*size_t.sizeof;
    }

    // Writes offsets, sizes and storage to `sink` as three comma-separated
    // bracketed lists of hexadecimal numbers.
    void store(OutRange)(scope OutRange sink) const
        if (isOutputRange!(OutRange, char))
    {
        import std.format.write : formattedWrite;
        formattedWrite(sink, "[%( 0x%x, %)]", offsets[]);
        formattedWrite(sink, ", [%( 0x%x, %)]", sz[]);
        formattedWrite(sink, ", [%( 0x%x, %)]", storage);
    }

private:
    import std.meta : staticMap;
    // Pointer to the first storage word of level `n`.
    @property auto raw_ptr(size_t n)()inout pure nothrow @nogc
    {
        static if (n == 0)
            return storage.ptr;
        else
        {
            return storage.ptr+offsets[n];
        }
    }
    enum dim = Types.length; // number of levels
    size_t[dim] offsets;// offset for level x
    size_t[dim] sz;// size of level x
    alias bitWidth = staticMap!(bitSizeOf, Types);
    size_t[] storage; // all levels back to back, in machine words
}

// Resizes and refills the levels of a MultiArray in various orders and checks
// that the other levels keep their contents.
@system unittest
{
    import std.conv : text;
    enum dg = (){
        // sizes are:
        // lvl0: 3, lvl1 : 2, lvl2: 1
        auto m = MultiArray!(int, ubyte, int)(3,2,1);

        // expects level k to hold 1, 2, ..., n
        static void check(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                assert(m.slice!(k)[i] == i+1, text("level:",i," : ",m.slice!(k)[0 ..
 n]));
        }

        // expects level k to hold n, n-1, ..., 1
        static void checkB(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                assert(m.slice!(k)[i] == n-i, text("level:",i," : ",m.slice!(k)[0 .. n]));
        }

        // fills level k with 1, 2, ..., n
        static void fill(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                m.slice!(k)[i] = force!ubyte(i+1);
        }

        // fills level k with n, n-1, ..., 1
        static void fillB(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                m.slice!(k)[i] = force!ubyte(n-i);
        }

        // grow the middle level
        m.length!1 = 100;
        fill!1(m, 100);
        check!1(m, 100);

        // grow the first level, the middle one must be intact
        m.length!0 = 220;
        fill!0(m, 220);
        check!1(m, 100);
        check!0(m, 220);

        // grow the last level
        m.length!2 = 17;
        fillB!2(m, 17);
        checkB!2(m, 17);
        check!0(m, 220);
        check!1(m, 100);

        // grow the last level again, old items must survive
        m.length!2 = 33;
        checkB!2(m, 17);
        fillB!2(m, 33);
        checkB!2(m, 33);
        check!0(m, 220);
        check!1(m, 100);

        // grow the middle level while both neighbours are populated
        m.length!1 = 195;
        fillB!1(m, 195);
        checkB!1(m, 195);
        checkB!2(m, 33);
        check!0(m, 220);

        // bit-packed levels: shrink the first, grow the second
        auto marr = MultiArray!(BitPacked!(uint, 4), BitPacked!(uint, 6))(20, 10);
        marr.length!0 = 15;
        marr.length!1 = 30;
        fill!1(marr, 30);
        fill!0(marr, 15);
        check!1(marr, 30);
        check!0(marr, 15);
        return 0;
    };
    // run the same scenario during compilation and at run time
    enum ct = dg();
    auto rt = dg();
}

@system unittest
{// more bitpacking tests
    import std.conv : text;

    alias Bitty =
      MultiArray!(BitPacked!(size_t, 3)
                , BitPacked!(size_t, 4)
                , BitPacked!(size_t, 3)
                , BitPacked!(size_t, 6)
                , bool);
    // bit fields of a 16-bit index, one per level 0 .. 3
    alias fn1 = sliceBits!(13, 16);
    alias fn2 = sliceBits!( 9, 13);
    alias fn3 = sliceBits!( 6,  9);
    alias fn4 = sliceBits!( 0,  6);
    // expects every item of level lvl to equal its own index
    static void check(size_t lvl, MA)(ref MA arr){
        for (size_t i = 0; i< arr.length!lvl; i++)
            assert(arr.slice!(lvl)[i] == i, text("Mismatch on lvl ", lvl, " idx ", i, " value: ", arr.slice!(lvl)[i]));
    }

    // stores the index of every item of level lvl into that item
    static void fillIdx(size_t lvl, MA)(ref MA arr){
        for (size_t i = 0; i< arr.length!lvl; i++)
            arr.slice!(lvl)[i] = i;
    }
    Bitty m1;

    // size the levels from the last one to the first
    m1.length!4 = 10;
    m1.length!3 = 2^^6;
    m1.length!2 = 2^^3;
    m1.length!1 = 2^^4;
    m1.length!0 = 2^^3;

    m1.length!4 = 2^^16;

    for (size_t i = 0; i< m1.length!4; i++)
        m1.slice!(4)[i] = i % 2;

    // writes to one level must not disturb the others
    fillIdx!1(m1);
    check!1(m1);
    fillIdx!2(m1);
    check!2(m1);
    fillIdx!3(m1);
    check!3(m1);
    fillIdx!0(m1);
    check!0(m1);
    check!3(m1);
    check!2(m1);
    check!1(m1);
    for (size_t i=0; i < 2^^16; i++)
    {
        m1.slice!(4)[i] = i % 2;
        m1.slice!(0)[fn1(i)] = fn1(i);
        m1.slice!(1)[fn2(i)] = fn2(i);
        m1.slice!(2)[fn3(i)] = fn3(i);
        m1.slice!(3)[fn4(i)] = fn4(i);
    }
    for (size_t i=0; i < 2^^16; i++)
    {
        assert(m1.slice!(4)[i] == i % 2);
        assert(m1.slice!(0)[fn1(i)] == fn1(i));
        assert(m1.slice!(1)[fn2(i)] == fn2(i));
        assert(m1.slice!(2)[fn3(i)] == fn3(i));
        assert(m1.slice!(3)[fn4(i)] == fn4(i));
    }
}

// Number of size_t words needed to store `new_len` items of `_bits` bits
// each, using the same per-item width that PackedArrayView uses.
size_t spaceFor(size_t _bits)(size_t new_len) @safe pure nothrow @nogc
{
    import std.math.algebraic : nextPow2;
    enum bits = _bits == 1 ?
 1 : nextPow2(_bits - 1);// see PackedArrayView
    static if (bits > 8*size_t.sizeof)
    {
        // items wider than a word occupy a whole number of words each
        static assert(bits % (size_t.sizeof*8) == 0);
        return new_len * bits/(8*size_t.sizeof);
    }
    else
    {
        // factor - number of items that share one word
        enum factor = size_t.sizeof*8/bits;
        return (new_len+factor-1)/factor; // rounded up
    }
}

// True for the types the packed containers below accept as items:
// BitPacked types, integrals, bool and character types.
template isBitPackableType(T)
{
    enum isBitPackableType = isBitPacked!T
        || isIntegral!T || is(T == bool) || isSomeChar!T;
}

//============================================================================
// Selects the PackedArrayViewImpl instantiation for item type T; the second
// argument is derived from bitSizeOf!T via nextPow2 (1 stays 1).
template PackedArrayView(T)
if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
    && isBitPackableType!U) || isBitPackableType!T)
{
    import std.math.algebraic : nextPow2;
    private enum bits = bitSizeOf!T;
    alias PackedArrayView = PackedArrayViewImpl!(T, bits > 1 ? nextPow2(bits - 1) : 1);
}

//unsafe and fast access to a chunk of RAM as if it contains packed values
// Selects the PackedPtrImpl instantiation for item type T, using the same
// width rule as PackedArrayView.
template PackedPtr(T)
if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
    && isBitPackableType!U) || isBitPackableType!T)
{
    import std.math.algebraic : nextPow2;
    private enum bits = bitSizeOf!T;
    alias PackedPtr = PackedPtrImpl!(T, bits > 1 ? nextPow2(bits - 1) : 1);
}

// Unchecked indexed access to items of `bits` bits each, packed into the
// size_t words starting at `origin`. No bounds are tracked here.
struct PackedPtrImpl(T, size_t bits)
{
pure nothrow:
    static assert(isPow2OrZero(bits));

    this(inout(size_t)* ptr)inout @safe @nogc
    {
        origin = ptr;
    }

    // Portable read: picks word n / factor and extracts field n % factor
    // with a shift and a mask.
    private T simpleIndex(size_t n) inout
    {
        immutable q = n / factor;
        immutable r = n % factor;
        return cast(T)((origin[q] >> bits*r) & mask);
    }

    // Portable write: clears field n % factor of word n / factor and ors the
    // new value in. For integral T the value must fit into the mask.
    private void simpleWrite(TypeOfBitPacked!T val, size_t n)
    in
    {
        static if (isIntegral!T)
            assert(val <= mask);
    }
    do
    {
        immutable q = n / factor;
        immutable r = n % factor;
        immutable tgt_shift = bits*r;
        immutable word = origin[q];
        origin[q] = (word & ~(mask << tgt_shift))
            | (cast(size_t) val << tgt_shift);
    }

    // Fast path: the item width matches a native integer type U, so items
    // can be accessed through a U* instead of shifting and masking.
    static if (factor == bytesPerWord// can safely pack by byte
         || factor == 1 // a whole word at a time
         || ((factor == bytesPerWord/2 || factor == bytesPerWord/4)
                && hasUnalignedReads)) // this needs unaligned reads
    {
        static if (factor == bytesPerWord)
            alias U = ubyte;
        else static if (factor == bytesPerWord/2)
            alias U = ushort;
        else static if (factor == bytesPerWord/4)
            alias U = uint;
        else static if (size_t.sizeof == 8 && factor == bytesPerWord/8)
            alias U = ulong;

        // Reads item idx; the pointer cast is used only on little-endian
        // targets at run time, everything else takes the portable path.
        T opIndex(size_t idx) inout
        {
            T ret;
            version (LittleEndian)
                ret = __ctfe ?
simpleIndex(idx) : 1220 cast(inout(T))(cast(U*) origin)[idx]; 1221 else 1222 ret = simpleIndex(idx); 1223 return ret; 1224 } 1225 1226 static if (isBitPacked!T) // lack of user-defined implicit conversion 1227 { 1228 void opIndexAssign(T val, size_t idx) 1229 { 1230 return opIndexAssign(cast(TypeOfBitPacked!T) val, idx); 1231 } 1232 } 1233 1234 void opIndexAssign(TypeOfBitPacked!T val, size_t idx) 1235 { 1236 version (LittleEndian) 1237 { 1238 if (__ctfe) 1239 simpleWrite(val, idx); 1240 else 1241 (cast(U*) origin)[idx] = cast(U) val; 1242 } 1243 else 1244 simpleWrite(val, idx); 1245 } 1246 } 1247 else 1248 { 1249 T opIndex(size_t n) inout 1250 { 1251 return simpleIndex(n); 1252 } 1253 1254 static if (isBitPacked!T) // lack of user-defined implicit conversion 1255 { 1256 void opIndexAssign(T val, size_t idx) 1257 { 1258 return opIndexAssign(cast(TypeOfBitPacked!T) val, idx); 1259 } 1260 } 1261 1262 void opIndexAssign(TypeOfBitPacked!T val, size_t n) 1263 { 1264 return simpleWrite(val, n); 1265 } 1266 } 1267 1268private: 1269 // factor - number of elements in one machine word 1270 enum factor = size_t.sizeof*8/bits, mask = 2^^bits-1; 1271 enum bytesPerWord = size_t.sizeof; 1272 size_t* origin; 1273} 1274 1275// data is packed only by power of two sized packs per word, 1276// thus avoiding mul/div overhead at the cost of ultimate packing 1277// this construct doesn't own memory, only provides access, see MultiArray for usage 1278struct PackedArrayViewImpl(T, size_t bits) 1279{ 1280pure nothrow: 1281 1282 this(inout(size_t)* origin, size_t offset, size_t items) inout @safe 1283 { 1284 ptr = inout(PackedPtr!(T))(origin); 1285 ofs = offset; 1286 limit = items; 1287 } 1288 1289 bool zeros(size_t s, size_t e) 1290 in 1291 { 1292 assert(s <= e); 1293 } 1294 do 1295 { 1296 s += ofs; 1297 e += ofs; 1298 immutable pad_s = roundUp(s); 1299 if ( s >= e) 1300 { 1301 foreach (i; s .. 
e) 1302 if (ptr[i]) 1303 return false; 1304 return true; 1305 } 1306 immutable pad_e = roundDown(e); 1307 size_t i; 1308 for (i=s; i<pad_s; i++) 1309 if (ptr[i]) 1310 return false; 1311 // all in between is x*factor elements 1312 for (size_t j=i/factor; i<pad_e; i+=factor, j++) 1313 if (ptr.origin[j]) 1314 return false; 1315 for (; i<e; i++) 1316 if (ptr[i]) 1317 return false; 1318 return true; 1319 } 1320 1321 T opIndex(size_t idx) inout 1322 in 1323 { 1324 assert(idx < limit); 1325 } 1326 do 1327 { 1328 return ptr[ofs + idx]; 1329 } 1330 1331 static if (isBitPacked!T) // lack of user-defined implicit conversion 1332 { 1333 void opIndexAssign(T val, size_t idx) 1334 { 1335 return opIndexAssign(cast(TypeOfBitPacked!T) val, idx); 1336 } 1337 } 1338 1339 void opIndexAssign(TypeOfBitPacked!T val, size_t idx) 1340 in 1341 { 1342 assert(idx < limit); 1343 } 1344 do 1345 { 1346 ptr[ofs + idx] = val; 1347 } 1348 1349 static if (isBitPacked!T) // lack of user-defined implicit conversions 1350 { 1351 void opSliceAssign(T val, size_t start, size_t end) 1352 { 1353 opSliceAssign(cast(TypeOfBitPacked!T) val, start, end); 1354 } 1355 } 1356 1357 void opSliceAssign(TypeOfBitPacked!T val, size_t start, size_t end) 1358 in 1359 { 1360 assert(start <= end); 1361 assert(end <= limit); 1362 } 1363 do 1364 { 1365 // account for ofsetted view 1366 start += ofs; 1367 end += ofs; 1368 // rounded to factor granularity 1369 immutable pad_start = roundUp(start);// rounded up 1370 if (pad_start >= end) //rounded up >= then end of slice 1371 { 1372 //nothing to gain, use per element assignment 1373 foreach (i; start .. 
end) 1374 ptr[i] = val; 1375 return; 1376 } 1377 immutable pad_end = roundDown(end); // rounded down 1378 size_t i; 1379 for (i=start; i<pad_start; i++) 1380 ptr[i] = val; 1381 // all in between is x*factor elements 1382 if (pad_start != pad_end) 1383 { 1384 immutable repval = replicateBits!(factor, bits)(val); 1385 for (size_t j=i/factor; i<pad_end; i+=factor, j++) 1386 ptr.origin[j] = repval;// so speed it up by factor 1387 } 1388 for (; i<end; i++) 1389 ptr[i] = val; 1390 } 1391 1392 auto opSlice(size_t from, size_t to)inout 1393 in 1394 { 1395 assert(from <= to); 1396 assert(ofs + to <= limit); 1397 } 1398 do 1399 { 1400 return typeof(this)(ptr.origin, ofs + from, to - from); 1401 } 1402 1403 auto opSlice(){ return opSlice(0, length); } 1404 1405 bool opEquals(T)(auto ref T arr) const 1406 { 1407 if (limit != arr.limit) 1408 return false; 1409 size_t s1 = ofs, s2 = arr.ofs; 1410 size_t e1 = s1 + limit, e2 = s2 + limit; 1411 if (s1 % factor == 0 && s2 % factor == 0 && length % factor == 0) 1412 { 1413 return ptr.origin[s1/factor .. e1/factor] 1414 == arr.ptr.origin[s2/factor .. e2/factor]; 1415 } 1416 for (size_t i=0;i<limit; i++) 1417 if (this[i] != arr[i]) 1418 return false; 1419 return true; 1420 } 1421 1422 @property size_t length()const{ return limit; } 1423 1424private: 1425 auto roundUp()(size_t val){ return (val+factor-1)/factor*factor; } 1426 auto roundDown()(size_t val){ return val/factor*factor; } 1427 // factor - number of elements in one machine word 1428 enum factor = size_t.sizeof*8/bits; 1429 PackedPtr!(T) ptr; 1430 size_t ofs, limit; 1431} 1432 1433 1434private struct SliceOverIndexed(T) 1435{ 1436 enum assignableIndex = is(typeof((){ T.init[0] = Item.init; })); 1437 enum assignableSlice = is(typeof((){ T.init[0 .. 
0] = Item.init; })); 1438 auto opIndex(size_t idx)const 1439 in 1440 { 1441 assert(idx < to - from); 1442 } 1443 do 1444 { 1445 return (*arr)[from+idx]; 1446 } 1447 1448 static if (assignableIndex) 1449 void opIndexAssign(Item val, size_t idx) 1450 in 1451 { 1452 assert(idx < to - from); 1453 } 1454 do 1455 { 1456 (*arr)[from+idx] = val; 1457 } 1458 1459 auto opSlice(size_t a, size_t b) 1460 { 1461 return typeof(this)(from+a, from+b, arr); 1462 } 1463 1464 // static if (assignableSlice) 1465 void opSliceAssign(T)(T val, size_t start, size_t end) 1466 { 1467 (*arr)[start+from .. end+from] = val; 1468 } 1469 1470 auto opSlice() 1471 { 1472 return typeof(this)(from, to, arr); 1473 } 1474 1475 @property size_t length()const { return to-from;} 1476 1477 alias opDollar = length; 1478 1479 @property bool empty()const { return from == to; } 1480 1481 @property auto front()const { return (*arr)[from]; } 1482 1483 static if (assignableIndex) 1484 @property void front(Item val) { (*arr)[from] = val; } 1485 1486 @property auto back()const { return (*arr)[to-1]; } 1487 1488 static if (assignableIndex) 1489 @property void back(Item val) { (*arr)[to-1] = val; } 1490 1491 @property auto save() inout { return this; } 1492 1493 void popFront() { from++; } 1494 1495 void popBack() { to--; } 1496 1497 bool opEquals(T)(auto ref T arr) const 1498 { 1499 if (arr.length != length) 1500 return false; 1501 for (size_t i=0; i <length; i++) 1502 if (this[i] != arr[i]) 1503 return false; 1504 return true; 1505 } 1506private: 1507 alias Item = typeof(T.init[0]); 1508 size_t from, to; 1509 T* arr; 1510} 1511 1512@safe pure nothrow @nogc unittest 1513{ 1514 static assert(isRandomAccessRange!(SliceOverIndexed!(int[]))); 1515} 1516 1517SliceOverIndexed!(const(T)) sliceOverIndexed(T)(size_t a, size_t b, const(T)* x) 1518if (is(Unqual!T == T)) 1519{ 1520 return SliceOverIndexed!(const(T))(a, b, x); 1521} 1522 1523// BUG? 
// inout is out of reach
//...SliceOverIndexed.arr only parameters or stack based variables can be inout
// Builds a mutable slice [a, b) over the container pointed to by x.
SliceOverIndexed!T sliceOverIndexed(T)(size_t a, size_t b, T* x)
if (is(Unqual!T == T))
{
    return SliceOverIndexed!T(a, b, x);
}

@safe unittest
{
    int[] idxArray = [2, 3, 5, 8, 13];
    auto sliced = sliceOverIndexed(0, idxArray.length, &idxArray);

    // range primitives, including writes through front and back
    assert(!sliced.empty);
    assert(sliced.front == 2);
    sliced.front = 1;
    assert(sliced.front == 1);
    assert(sliced.back == 13);
    sliced.popFront();
    assert(sliced.front == 3);
    assert(sliced.back == 13);
    sliced.back = 11;
    assert(sliced.back == 11);
    sliced.popBack();

    // indexing and re-slicing
    assert(sliced.front == 3);
    assert(sliced[$-1] == 8);
    sliced = sliced[];
    assert(sliced[0] == 3);
    assert(sliced.back == 8);
    sliced = sliced[1..$];
    assert(sliced.front == 5);
    sliced = sliced[0..$-1];
    assert(sliced[$-1] == 5);

    // comparison and slice assignment
    int[] other = [2, 5];
    assert(sliced[] == sliceOverIndexed(1, 2, &other));
    sliceOverIndexed(0, 2, &idxArray)[0 .. 2] = -1;
    assert(idxArray[0 .. 2] == [-1, -1]);
    // NOTE(review): nullArr is declared but the empty slice is taken over
    // idxArray - presumably &nullArr was intended; confirm.
    uint[] nullArr = null;
    auto nullSlice = sliceOverIndexed(0, 0, &idxArray);
    assert(nullSlice.empty);
}

// Convenience wrapper: a packed view over `items` values starting at ptr,
// with no item offset.
private inout(PackedArrayView!T) packedArrayView(T)(inout(size_t)* ptr, size_t items)
{
    return inout(PackedArrayView!T)(ptr, 0, items);
}


//============================================================================
// Partially unrolled binary search using Shar's method
//============================================================================

// Generates the source of a `switch` whose cases fall through from the
// largest step down to 1, each performing one probe of the search.
// The text is meant to be mixed into a scope that provides `pred`, `range`,
// `needle`, `idx` and `m`; `size` must be a power of two.
string genUnrolledSwitchSearch(size_t size) @safe pure nothrow
{
    import core.bitop : bsr;
    import std.array : replace;
    import std.conv : to;
    assert(isPow2OrZero(size));
    string code = `
    import core.bitop : bsr;
    auto power = bsr(m)+1;
    switch (power){`;
    foreach_reverse (exponent; 0 .. bsr(size))
    {
        // the case labelled exponent+1 probes with a step of 2^^exponent
        immutable step = 2^^exponent;
        immutable label = exponent + 1;
        code ~= `
        case pow:
            if (pred(range[idx+m], needle))
                idx += m;
            goto case;
        `.replace("m", to!string(step))
        .replace("pow", to!string(label));
    }
    code ~= `
        case 0:
            if (pred(range[idx], needle))
                idx += 1;
            goto default;
        `;
    code ~= `
        default:
    }`;
    return code;
}

// True when sz has at most one bit set, i.e. is zero or a power of two.
bool isPow2OrZero(size_t sz) @safe pure nothrow @nogc
{
    // See also: std.math.isPowerOf2()
    immutable lowerBits = sz - 1;
    return (sz & lowerBits) == 0;
}

// Lower-bound search by halving steps over a range whose length is a power
// of two (or zero, per the assert). Returns the number of leading items for
// which pred(item, needle) holds, assuming the range is partitioned by pred.
size_t uniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
    assert(isPow2OrZero(range.length));
    size_t idx = 0;
    for (size_t m = range.length/2; m != 0; m /= 2)
    {
        if (pred(range[idx+m], needle))
            idx += m;
    }
    // final probe decides whether the item at idx itself is included
    if (pred(range[idx], needle))
        idx += 1;
    return idx;
}

// Same contract as uniformLowerBound, but the last steps of the search are
// performed by the unrolled switch from genUnrolledSwitchSearch.
size_t switchUniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
    assert(isPow2OrZero(range.length));
    size_t idx = 0, m
 = range.length/2;
    enum max = 1 << 10;
    // plain loop while the step is at least max ...
    while (m >= max)
    {
        if (pred(range[idx+m], needle))
            idx += m;
        m /= 2;
    }
    // ... then the unrolled switch finishes the remaining steps
    // NOTE(review): for a range of length 1, m is 0 here and the generated
    // code evaluates bsr(0) - confirm that this is well-defined on the
    // supported targets.
    mixin(genUnrolledSwitchSearch(max));
    return idx;
}

// Lifts a power-of-two-only lower-bound search (`uniLowerBound`) to ranges
// of arbitrary length by searching one of two power-of-two sized windows.
template sharMethod(alias uniLowerBound)
{
    // Params:
    //  _pred  = ordering predicate, "a<b" by default
    //  range  = random-access range partitioned by the predicate
    //  needle = value to locate
    // Returns: the index produced by the lower-bound search, 0 for an
    // empty range.
    size_t sharMethod(alias _pred="a<b", Range, T)(Range range, T needle)
        if (is(T : ElementType!Range))
    {
        import std.functional : binaryFun;
        import std.math.algebraic : nextPow2, truncPow2;
        alias pred = binaryFun!_pred;
        if (range.length == 0)
            return 0;
        if (isPow2OrZero(range.length))
            return uniLowerBound!pred(range, needle);
        // n - the largest power of two that fits into the range
        size_t n = truncPow2(range.length);
        if (pred(range[n-1], needle))
        {// search in another 2^^k area that fully covers the tail of range
            size_t k = nextPow2(range.length - n + 1);
            return range.length - k + uniLowerBound!pred(range[$-k..$], needle);
        }
        else
            return uniLowerBound!pred(range[0 .. n], needle);
    }
}

alias sharLowerBound = sharMethod!uniformLowerBound;
alias sharSwitchLowerBound = sharMethod!switchUniformLowerBound;

// Both variants must agree with std.range's lowerBound for every needle.
@safe unittest
{
    import std.array : array;
    import std.range : assumeSorted, iota;

    // reference implementation
    auto stdLowerBound(T)(T[] range, T needle)
    {
        return assumeSorted(range).lowerBound(needle).length;
    }
    immutable MAX = 5*1173;
    auto arr = array(iota(5, MAX, 5));
    assert(arr.length == MAX/5-1);
    foreach (i; 0 .. MAX+5)
    {
        auto st = stdLowerBound(arr, i);
        assert(st == sharLowerBound(arr, i));
        assert(st == sharSwitchLowerBound(arr, i));
    }
    // empty range
    arr = [];
    auto st = stdLowerBound(arr, 33);
    assert(st == sharLowerBound(arr, 33));
    assert(st == sharSwitchLowerBound(arr, 33));
}
//============================================================================

@safe
{
// hope to see similar stuff in public interface... once Allocators are out
//@@@BUG moveFront and friends? dunno, for now it's POD-only

// Replaces dest[from .. to] with the items of `stuff`, growing or shrinking
// `dest` as needed.
// Params:
//  Policy = storage policy providing `realloc`; with `void` the built-in
//           `length` property of dest is used instead
//  dest   = array to modify in place
//  from   = start of the replaced span
//  to     = end of the replaced span (exclusive)
//  stuff  = replacement items, must have a length
// Returns: the index just past the inserted items.
@trusted size_t genericReplace(Policy=void, T, Range)
    (ref T dest, size_t from, size_t to, Range stuff)
{
    import std.algorithm.mutation : copy;
    size_t delta = to - from;
    size_t stuff_end = from+stuff.length;
    if (stuff.length > delta)
    {// replace increases length
        delta = stuff.length - delta;// now, new is > old by delta
        static if (is(Policy == void))
            dest.length = dest.length+delta;//@@@BUG lame @property
        else
            dest = Policy.realloc(dest, dest.length+delta);
        // shift the tail up to make room, last item first
        copyBackwards(dest[to .. dest.length-delta],
            dest[to+delta .. dest.length]);
        copyForward(stuff, dest[from .. stuff_end]);
    }
    else if (stuff.length == delta)
    {
        // same size: overwrite in place
        copy(stuff, dest[from .. to]);
    }
    else
    {// replace decreases length by delta
        delta = delta - stuff.length;
        copy(stuff, dest[from .. stuff_end]);
        // pull the tail down over the gap
        copyForward(dest[to .. dest.length],
            dest[stuff_end ..
 dest.length-delta]);
        static if (is(Policy == void))
            dest.length = dest.length - delta;//@@@BUG lame @property
        else
            dest = Policy.realloc(dest, dest.length-delta);
    }
    return stuff_end;
}


// Simple storage manipulation policy
// All storage comes from built-in (garbage collected) arrays.
@safe private struct GcPolicy
{
    import std.traits : isDynamicArray;

    // Returns a mutable copy of arr.
    static T[] dup(T)(const T[] arr)
    {
        return arr.dup;
    }

    // Returns a new array of `size` items.
    static T[] alloc(T)(size_t size)
    {
        return new T[size];
    }

    // Returns arr resized to `sz` items.
    static T[] realloc(T)(T[] arr, size_t sz)
    {
        arr.length = sz;
        return arr;
    }

    // Replaces dest[from .. to] with the items of stuff.
    static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
    {
        replaceInPlace(dest, from, to, stuff);
    }

    // Appends a single value, range-checked via force.
    static void append(T, V)(ref T[] arr, V value)
        if (!isInputRange!V)
    {
        arr ~= force!T(value);
    }

    // Appends all items of a range.
    static void append(T, V)(ref T[] arr, V value)
        if (isInputRange!V)
    {
        insertInPlace(arr, arr.length, value);
    }

    // Drops the reference to the array; in debug builds the items are
    // overwritten with a marker value first.
    static void destroy(T)(ref T arr) pure // pure required for -dip25, inferred for -dip1000
        if (isDynamicArray!T && is(Unqual!T == T))
    {
        debug
        {
            arr[] = cast(typeof(T.init[0]))(0xdead_beef);
        }
        arr = null;
    }

    // Overload for qualified arrays, whose items cannot be overwritten.
    static void destroy(T)(ref T arr) pure // pure required for -dip25, inferred for -dip1000
        if (isDynamicArray!T && !is(Unqual!T == T))
    {
        arr = null;
    }
}

// ditto
// Storage is obtained through enforceMalloc / enforceRealloc and released
// with pureFree, so every array handed out here has to be passed to
// `destroy` (or `realloc` with size 0) explicitly.
@safe struct ReallocPolicy
{
    import std.range.primitives : hasLength;

    // Returns a freshly allocated copy of arr.
    static T[] dup(T)(const T[] arr)
    {
        auto result = alloc!T(arr.length);
        result[] = arr[];
        return result;
    }

    // Allocates room for `size` items; halts if the byte count overflows.
    static T[] alloc(T)(size_t size) @trusted
    {
        import std.internal.memory : enforceMalloc;

        import core.checkedint : mulu;
        bool overflow;
        size_t nbytes = mulu(size, T.sizeof, overflow);
        if (overflow) assert(0);

        auto ptr = cast(T*) enforceMalloc(nbytes);
        return ptr[0 .. size];
    }

    // Resizes arr to `size` items; a size of 0 frees it and yields null.
    // Halts if the byte count overflows.
    static T[] realloc(T)(return scope T[] arr, size_t size) @trusted
    {
        import std.internal.memory : enforceRealloc;
        if (!size)
        {
            destroy(arr);
            return null;
        }

        import core.checkedint : mulu;
        bool overflow;
        size_t nbytes = mulu(size, T.sizeof, overflow);
        if (overflow) assert(0);

        auto ptr = cast(T*) enforceRealloc(arr.ptr, nbytes);
        return ptr[0 .. size];
    }

    // Replaces dest[from .. to] with the items of stuff.
    static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
    {
        genericReplace!(ReallocPolicy)(dest, from, to, stuff);
    }

    // Appends a single value, range-checked via force.
    static void append(T, V)(ref T[] arr, V value)
        if (!isInputRange!V)
    {
        // arr.length+1 must not wrap around
        if (arr.length == size_t.max) assert(0);
        arr = realloc(arr, arr.length+1);
        arr[$-1] = force!T(value);
    }

    pure @safe unittest
    {
        int[] arr;
        ReallocPolicy.append(arr, 3);

        import std.algorithm.comparison : equal;
        assert(equal(arr, [3]));
    }

    // Appends all items of a range with known length.
    static void append(T, V)(ref T[] arr, V value)
        if (isInputRange!V && hasLength!V)
    {
        import core.checkedint : addu;
        bool overflow;
        size_t nelems = addu(arr.length, value.length, overflow);
        if (overflow) assert(0);

        arr = realloc(arr, nelems);

        import std.algorithm.mutation : copy;
        copy(value, arr[$-value.length..$]);
    }

    pure @safe unittest
    {
        int[] arr;
        ReallocPolicy.append(arr, [1,2,3]);

        import std.algorithm.comparison : equal;
        assert(equal(arr, [1,2,3]));
    }

    // Frees the memory behind arr (if any) and nulls the slice.
    static void destroy(T)(scope ref T[] arr) @trusted
    {
        import core.memory : pureFree;
        if (arr.ptr)
            pureFree(arr.ptr);
        arr = null;
    }
}

//build hack
alias _RealArray = CowArray!ReallocPolicy;

// Exercises ReallocPolicy.replaceImpl for growing, shrinking and same-size
// replacements.
pure @safe unittest
{
    import std.algorithm.comparison : equal;

    with(ReallocPolicy)
    {
/**
    Checks that `T` acts as a pair of integers, each of which implicitly
    converts to `V`.

    The trait is `true` only if this compiles for `T`:
    ---
    (T x){ V a = x[0]; V b = x[1];}
    ---
    and this does not:
    ---
    (T x){ V c = x[2];}
    ---
*/
public template isIntegralPair(T, V=uint)
{
    // x[0] and x[1] exist and convert to V
    private enum bool hasBothSlots = is(typeof((T x){ V a = x[0]; V b = x[1];}));
    // a third slot disqualifies T from being a pair
    private enum bool hasThirdSlot = is(typeof((T x){ V c = x[2]; }));

    enum isIntegralPair = hasBothSlots && !hasThirdSlot;
}
/**
    The recommended type of $(REF Tuple, std,_typecons)
    to represent [a, b$(RPAREN) intervals of $(CODEPOINTS). As used in $(LREF InversionList).
    Any interval type should pass $(LREF isIntegralPair) trait.
*/
public struct CodepointInterval
{
pure:
    // storage: [0] is the lower bound, [1] the upper bound
    uint[2] _tuple;
    alias _tuple this;

@safe pure nothrow @nogc:

    // Builds the interval from its two bounds
    this(uint low, uint high)
    {
        a = low;
        b = high;
    }

    // Element-wise comparison against anything indexable with [0] and [1]
    bool opEquals(T)(T val) const
    {
        if (!(_tuple[0] == val[0]))
            return false;
        return _tuple[1] == val[1];
    }

    // Named access to the lower bound
    @property ref inout(uint) a() return inout { return _tuple[0]; }
    // Named access to the upper bound
    @property ref inout(uint) b() return inout { return _tuple[1]; }
}
/**
    Construct from another code point set of any type.

    Params:
        set = the source set; its intervals are read through `byInterval`
*/
this(Set)(Set set) pure
if (isCodepointSet!Set)
{
    uint[] arr;
    foreach (v; set.byInterval)
    {
        // Grow the buffer through the storage policy, the same way the
        // range constructor does, instead of the built-in `~=`.
        // NOTE(review): assumes CowArray!(SP).reuse keeps managing this
        // buffer through SP afterwards - confirm against CowArray.
        SP.append(arr, v.a);
        SP.append(arr, v.b);
    }
    data = CowArray!(SP).reuse(arr);
}
this(Range)(Range intervals) pure
if (isForwardRange!Range && isIntegralPair!(ElementType!Range))
{
    uint[] arr;
    foreach (v; intervals)
    {
        // The constraint (isIntegralPair) only guarantees that v[0] and v[1]
        // exist and convert to uint, so use exactly that rather than .a/.b
        uint lower = v[0];
        uint upper = v[1];
        SP.append(arr, lower);
        SP.append(arr, upper);
    }
    data = CowArray!(SP).reuse(arr);
    sanitize(); //enforce invariant: sort intervals etc.
}
///
pure @safe unittest
{
    import std.algorithm.comparison : equal;

    // '\u0430' and '\u044F' are the Cyrillic small letters a and ya
    auto set = CodepointSet('a', 'z'+1, '\u0430', '\u044F'+1);
    foreach (v; 'a'..'z'+1)
        assert(set[v]);
    // Cyrillic lowercase interval
    foreach (v; '\u0430'..'\u044F'+1)
        assert(set[v]);
    //specific order is not required, intervals may intersect
    auto set2 = CodepointSet('\u0430', '\u044F'+1, 'a', 'd', 'b', 'z'+1);
    //the same end result
    assert(set2.byInterval.equal(set.byInterval));
    // test constructor this(Range)(Range intervals)
    auto chessPiecesWhite = CodepointInterval(9812, 9818);
    auto chessPiecesBlack = CodepointInterval(9818, 9824);
    auto set3 = CodepointSet([chessPiecesWhite, chessPiecesBlack]);
    // '\u2654' (9812, white chess king) .. '\u265F' (9823, black chess pawn)
    foreach (v; '\u2654'..'\u265F'+1)
        assert(set3[v]);
}
/**
    $(P Sets support natural syntax for set algebra, namely: )
    $(BOOKTABLE ,
        $(TR $(TH Operator) $(TH Math notation) $(TH Description) )
        $(TR $(TD &) $(TD a ∩ b) $(TD intersection) )
        $(TR $(TD |) $(TD a ∪ b) $(TD union) )
        $(TR $(TD -) $(TD a ∖ b) $(TD subtraction) )
        $(TR $(TD ~) $(TD a ~ b) $(TD symmetric set difference i.e. (a ∪ b) \ (a ∩ b)) )
    )

    Params:
        rhs = another code point set, or a single code point

    Returns: a new set; `this` is left unmodified
*/
This opBinary(string op, U)(U rhs)
if (isCodepointSet!U || is(U:dchar))
{
    static if (op == "&" || op == "|" || op == "~")
    {// symmetric ops thus can swap arguments to reuse r-value
        static if (is(U:dchar))
        {
            auto tmp = this;
            mixin("tmp "~op~"= rhs; ");
            return tmp;
        }
        else
        {
            static if (is(Unqual!U == U))
            {
                // try hard to reuse r-value
                mixin("rhs "~op~"= this;");
                return rhs;
            }
            else
            {
                // rhs is qualified and cannot be mutated: work on a copy of this
                auto tmp = this;
                mixin("tmp "~op~"= rhs;");
                return tmp;
            }
        }
    }
    else static if (op == "-") // anti-symmetric
    {
        auto tmp = this;
        tmp -= rhs;
        return tmp;
    }
    else
        static assert(0, "no operator "~op~" defined for Set");
}

///
pure @safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range : iota;

    auto lower = unicode.LowerCase;
    auto upper = unicode.UpperCase;
    auto ascii = unicode.ASCII;

    assert((lower & upper).empty); // no intersection
    auto lowerASCII = lower & ascii;
    assert(lowerASCII.byCodepoint.equal(iota('a', 'z'+1)));
    // throw away all of the lowercase ASCII
    assert((ascii - lower).length == 128 - 26);

    auto onlyOneOf = lower ~ ascii;
    // '\u0394' is Greek capital delta, '\u044F' is Cyrillic small ya
    assert(!onlyOneOf['\u0394']); // not ASCII and not lowercase
    assert(onlyOneOf['$']); // ASCII and not lowercase
    assert(!onlyOneOf['a']); // ASCII and lowercase
    assert(onlyOneOf['\u044F']); // not ASCII but lowercase

    // throw away all cased letters from ASCII
    auto noLetters = ascii - (lower | upper);
    assert(noLetters.length == 128 - 26*2);
}
/**
    Tests the presence of codepoint `ch` in this set,
    the same as $(LREF opIndex).

    Params:
        ch = the code point to look up

    Returns: `true` if `ch` belongs to this set
*/
bool opBinaryRight(string op: "in", U)(U ch) const
if (is(U : dchar))
{
    return this[ch];
}

///
pure @safe unittest
{
    // '\u044F' is the Cyrillic small letter ya
    assert('\u044F' in unicode.Cyrillic);
    assert(!('z' in unicode.Cyrillic));
}
@property auto byCodepoint()
{
    // Input range that walks the intervals and yields every code point in them
    static struct CodepointRange
    {
        this(This set)
        {
            r = set.byInterval;
            if (!r.empty)
                cur = r.front.a;
        }

        @property dchar front() const
        {
            return cast(dchar) cur;
        }

        @property bool empty() const
        {
            return r.empty;
        }

        void popFront()
        {
            cur++;
            // past the end of the current interval: move on to the next one
            while (cur >= r.front.b)
            {
                r.popFront();
                if (r.empty)
                    break;
                cur = r.front.a;
            }
        }
    private:
        uint cur;                         // code point currently at the front
        typeof(This.init.byInterval) r;   // intervals not yet exhausted
    }

    return CodepointRange(this);
}

///
pure @safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range : iota;

    auto set = unicode.ASCII;
    // the result of equal() has to be asserted, otherwise nothing is checked
    assert(set.byCodepoint.equal(iota(0, 0x80)));
}
void toString(Writer)(scope Writer sink, scope const ref FormatSpec!char fmt) /* const */
{
    import std.format.write : formatValue;

    // One "[a..b)" group per interval; groups are separated by a single
    // space and nothing at all is written for an empty set.
    bool needSeparator = false;
    foreach (ival; byInterval)
    {
        if (needSeparator)
            put(sink, " ");
        needSeparator = true;

        put(sink, "[");
        formatValue(sink, ival.a, fmt);
        put(sink, "..");
        formatValue(sink, ival.b, fmt);
        put(sink, ")");
    }
}
// Intersection with a single code point: the result is either the
// one-element set [ch, ch+1) or the empty set.
ref intersect()(dchar ch)
{
    // linear scan over the stored intervals
    bool present = false;
    foreach (ival; byInterval)
    {
        if (ch >= ival.a && ch < ival.b)
        {
            present = true;
            break;
        }
    }

    if (present)
        return this = This.init.add(ch, ch+1);

    this = This.init;
    return this;
}
@property auto inverted()
{
    enum upperBound = lastDchar+1;
    InversionList flipped = this;

    // an empty set inverts to the single interval [0, lastDchar+1)
    if (flipped.data.length == 0)
    {
        flipped.addInterval(0, upperBound);
        return flipped;
    }

    // toggle the boundary at 0: drop it if present, insert it otherwise
    if (flipped.data[0] == 0)
        genericReplace(flipped.data, 0, 1, cast(uint[]) null);
    else
        genericReplace(flipped.data, 0, 0, [0]);

    // toggle the boundary at lastDchar+1; the last bound is read from the
    // untouched `data` of this set
    immutable tail = flipped.data.length;
    if (data[data.length-1] == upperBound)
        genericReplace(flipped.data, tail-1, tail, cast(uint[]) null);
    else
        genericReplace(flipped.data, tail, tail, [upperBound]);

    return flipped;
}
return result; 2651 } 2652 2653 static string binaryScope(R)(R ivals, string indent) @safe 2654 { 2655 // time to do unrolled comparisons? 2656 if (ivals.length < maxBinary) 2657 return linearScope(ivals, indent); 2658 else 2659 return bisect(ivals, ivals.length/2, indent); 2660 } 2661 2662 // not used yet if/elsebinary search is far better with DMD as of 2.061 2663 // and GDC is doing fine job either way 2664 static string switchScope(R)(R ivals, string indent) 2665 { 2666 string result = indent~"switch (ch){\n"; 2667 string deeper = indent~" "; 2668 foreach (ival; ivals) 2669 { 2670 if (ival[0]+1 == ival[1]) 2671 { 2672 result ~= format("%scase %s: return true;\n", 2673 deeper, ival[0]); 2674 } 2675 else 2676 { 2677 result ~= format("%scase %s: .. case %s: return true;\n", 2678 deeper, ival[0], ival[1]-1); 2679 } 2680 } 2681 result ~= deeper~"default: return false;\n"~indent~"}\n"; 2682 return result; 2683 } 2684 2685 static string bisect(R)(R range, size_t idx, string indent) 2686 { 2687 string deeper = indent ~ " "; 2688 // bisect on one [a, b) interval at idx 2689 string result = indent~"{\n"; 2690 // less branch, < a 2691 result ~= format("%sif (ch < %s)\n%s", 2692 deeper, range[idx][0], binaryScope(range[0 .. idx], deeper)); 2693 // middle point, >= a && < b 2694 result ~= format("%selse if (ch < %s) return true;\n", 2695 deeper, range[idx][1]); 2696 // greater or equal branch, >= b 2697 result ~= format("%selse\n%s", 2698 deeper, binaryScope(range[idx+1..$], deeper)); 2699 return result~indent~"}\n"; 2700 } 2701 2702 string code = format("bool %s(dchar ch) @safe pure nothrow @nogc\n", 2703 funcName.empty ? 
/**
    Generates string with D source code of unary function with name of
    `funcName` taking a single `dchar` argument. If `funcName` is empty
    the code is adjusted to be a lambda function.

    The function generated tests if the $(CODEPOINT) passed
    belongs to this set or not. The result is to be used with string mixin.
    The intended usage area is aggressive optimization via meta programming
    in parser generators and the like.

    Note: Use with care for relatively small or regular sets. It
    could end up being slower than just using multi-staged tables.

    Params:
        funcName = name of the generated function; empty for a lambda

    Returns: the generated D source code

    Example:
    ---
    import std.stdio;

    // construct set directly from [a, b$(RPAREN) intervals
    auto set = CodepointSet(10, 12, 45, 65, 100, 200);
    writeln(set);
    writeln(set.toSourceCode("func"));
    ---

    The above outputs something along the lines of:
    ---
    bool func(dchar ch)  @safe pure nothrow @nogc
    {
        if (ch < 45)
        {
            if (ch == 10 || ch == 11) return true;
            return false;
        }
        else if (ch < 65) return true;
        else
        {
            if (ch < 100) return false;
            if (ch < 200) return true;
            return false;
        }
    }
    ---
*/
string toSourceCode(string funcName="")
{
    import std.array : array;
    // materialize the intervals and delegate to the static overload
    auto range = byInterval.array();
    return toSourceCode(range, funcName);
}
// A random-access range of integral pairs laid over a flat `Range` of bounds:
// elements [start, end) of `slice` are read two at a time as [a, b) intervals.
static struct Intervals(Range)
{
    import std.range.primitives : hasAssignableElements;

    // View over the whole of `sp`
    this(Range sp) scope
    {
        slice = sp;
        start = 0;
        end = sp.length;
    }

    // View over sp[s .. e]; `s` and `e` are indices into the flat storage
    this(Range sp, size_t s, size_t e) scope
    {
        slice = sp;
        start = s;
        end = e;
    }

    @property auto front()const
    {
        immutable a = slice[start];
        immutable b = slice[start+1];
        return CodepointInterval(a, b);
    }

    //may break sorted property - but we need std.sort to access it
    //hence package(std) protection attribute
    static if (hasAssignableElements!Range)
    package(std) @property void front(CodepointInterval val)
    {
        slice[start] = val.a;
        slice[start+1] = val.b;
    }

    @property auto back()const
    {
        immutable a = slice[end-2];
        immutable b = slice[end-1];
        return CodepointInterval(a, b);
    }

    //ditto about package
    static if (hasAssignableElements!Range)
    package(std) @property void back(CodepointInterval val)
    {
        slice[end-2] = val.a;
        slice[end-1] = val.b;
    }

    void popFront()
    {
        start += 2;
    }

    void popBack()
    {
        end -= 2;
    }

    // idx counts intervals relative to the current front
    auto opIndex(size_t idx) const
    {
        immutable a = slice[start+idx*2];
        immutable b = slice[start+idx*2+1];
        return CodepointInterval(a, b);
    }

    //ditto about package
    static if (hasAssignableElements!Range)
    package(std) void opIndexAssign(CodepointInterval val, size_t idx)
    {
        slice[start+idx*2] = val.a;
        slice[start+idx*2+1] = val.b;
    }

    // s and e count intervals relative to the current front
    auto opSlice(size_t s, size_t e)
    {
        return Intervals(slice, s*2+start, e*2+start);
    }

    // Number of intervals still in the view. It is derived from the
    // [start, end) window rather than from the whole backing slice, so that
    // it stays correct after popFront/popBack and for sub-ranges made by opSlice.
    @property size_t length()const {  return (end - start)/2; }

    @property bool empty()const { return start == end; }

    @property auto save(){ return this; }
private:
    size_t start, end; // window into `slice`, in units of single bounds
    Range slice;       // flat storage: a0, b0, a1, b1, ...
}
// special case for normal InversionList:
// removes the single code point `ch`, i.e. the interval [ch, ch+1)
ref subChar(dchar ch)
{
    // Same skip-then-drop sequence that sub(U) applies to every interval of
    // its argument. skipUpTo returns an even marker (see its out contract),
    // which is what dropUpTo requires. Going through dropUpTo covers `ch`
    // at the start, in the middle and at the end of an interval, and never
    // indexes in front of the first element.
    immutable mark = skipUpTo(ch);
    dropUpTo(ch + 1, mark);
    return this;
}
range.length].lowerBound!(SearchPolicy.gallop)(b).length+a_idx; 2954 uint[3] buf = void; 2955 uint to_insert; 2956 debug(std_uni) 2957 { 2958 writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx); 2959 } 2960 if (b_idx == range.length) 2961 { 2962 // [-------++++++++----++++++-] 2963 // [ s a b] 2964 if (a_idx & 1)// a in positive 2965 { 2966 buf[0] = b; 2967 to_insert = 1; 2968 } 2969 else// a in negative 2970 { 2971 buf[0] = a; 2972 buf[1] = b; 2973 to_insert = 2; 2974 } 2975 pos = genericReplace(data, a_idx, b_idx, buf[0 .. to_insert]); 2976 return pos - 1; 2977 } 2978 2979 uint top = data[b_idx]; 2980 2981 debug(std_uni) 2982 { 2983 writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx); 2984 writefln("a=%s; b=%s; top=%s;", a, b, top); 2985 } 2986 if (a_idx & 1) 2987 {// a in positive 2988 if (b_idx & 1)// b in positive 2989 { 2990 // [-------++++++++----++++++-] 2991 // [ s a b ] 2992 buf[0] = top; 2993 to_insert = 1; 2994 } 2995 else // b in negative 2996 { 2997 // [-------++++++++----++++++-] 2998 // [ s a b ] 2999 if (top == b) 3000 { 3001 assert(b_idx+1 < data.length); 3002 buf[0] = data[b_idx+1]; 3003 pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 1]); 3004 return pos - 1; 3005 } 3006 buf[0] = b; 3007 buf[1] = top; 3008 to_insert = 2; 3009 } 3010 } 3011 else 3012 { // a in negative 3013 if (b_idx & 1) // b in positive 3014 { 3015 // [----------+++++----++++++-] 3016 // [ a b ] 3017 buf[0] = a; 3018 buf[1] = top; 3019 to_insert = 2; 3020 } 3021 else// b in negative 3022 { 3023 // [----------+++++----++++++-] 3024 // [ a s b ] 3025 if (top == b) 3026 { 3027 assert(b_idx+1 < data.length); 3028 buf[0] = a; 3029 buf[1] = data[b_idx+1]; 3030 pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 2]); 3031 return pos - 1; 3032 } 3033 buf[0] = a; 3034 buf[1] = b; 3035 buf[2] = top; 3036 to_insert = 3; 3037 } 3038 } 3039 pos = genericReplace(data, a_idx, b_idx+1, buf[0 .. 
// Removes everything stored between marker `pos` and the value `a`;
// when `a` falls inside an interval, that interval is cut to start at `a`.
Marker dropUpTo(uint a, Marker pos=Marker.init)
in
{
    assert(pos % 2 == 0); // at start of interval
}
do
{
    immutable total = data.length;
    auto tail = assumeSorted!"a <= b"(data[pos .. total]);
    if (tail.empty)
        return pos;

    // index just past the stored bounds that are <= a
    immutable size_t idx = pos + tail.lowerBound(a).length;

    debug(std_uni)
    {
        writeln("dropUpTo full length=", data.length);
        writeln(pos,"~~~", idx);
    }

    uint[] nothing = cast(uint[])[];

    // a lies beyond every stored bound: drop the whole tail
    if (idx == total)
        return genericReplace(data, pos, idx, nothing);

    if ((idx & 1) == 0)
    {   // a in negative
        //[--+++----++++++----+++++++-------+++...]
        //      |<---si   s  a  t
        genericReplace(data, pos, idx, nothing);
    }
    else
    {   // a in positive
        //[--+++----++++++----+++++++------...]
        //      |<---si       s  a  t
        genericReplace(data, pos, idx, [a]);
    }
    return pos;
}
data.length]);
        size_t idx = pos+range.lowerBound(a).length;

        if (idx >= data.length) // could have Marker point to recently removed stuff
            return data.length;

        if (idx & 1)// inside of interval, check for split
        {

            immutable top = data[idx];
            if (top == a)// no need to split, it's end
                return idx+1;
            immutable start = data[idx-1];
            if (a == start)
                return idx-1;
            // split it up
            genericReplace(data, idx, idx+1, [a, a, top]);
            return idx+1;        // avoid odd index
        }
        return idx;
    }

    CowArray!SP data;
}

// the ASCII set must print as a single half-open interval
pure @safe unittest
{
    import std.conv : to;
    assert(to!string(unicode.ASCII) == "[0..128)");
}

// Byte-by-byte 24-bit read of element `idx` from a packed array of 3-byte
// values; the pedantic version for CTFE and aligned-access-only architectures.
@system private uint safeRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    immutable base = idx * 3;
    immutable uint first = ptr[base];
    immutable uint second = ptr[base+1];
    immutable uint third = ptr[base+2];
    version (LittleEndian)
        return first + (second << 8) + (third << 16);
    else
        return (first << 16) + (second << 8) + third;
}

// Byte-by-byte 24-bit write of the low three bytes of `val` into element `idx`.
@system private void safeWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    immutable base = idx * 3;
    immutable ubyte low = val & 0xFF;
    immutable ubyte middle = (val >> 8) & 0xFF;
    immutable ubyte high = (val >> 16) & 0xFF;
    version (LittleEndian)
    {
        ptr[base] = low;
        ptr[base+1] = middle;
        ptr[base+2] = high;
    }
    else
    {
        ptr[base] = high;
        ptr[base+1] = middle;
        ptr[base+2] = low;
    }
}

// unaligned x86-like read/write functions:
// loads a whole 32-bit word at byte offset 3*idx and keeps 24 bits of it
@system private uint unalignedRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    immutable uint word = *cast(uint*)(ptr + 3*idx);
    version (LittleEndian)
        return word & 0xFF_FFFF;
    else
        return word >> 8;
}

// ditto
@system private void unalignedWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    uint* dest = cast(uint*)(cast(ubyte*)
ptr + 3*idx);
    version (LittleEndian)
        *dest = val | (*dest & 0xFF00_0000);
    else
        *dest = (val << 8) | (*dest & 0xFF);
}

// Reads 24-bit element `idx`: the byte-wise reader during CTFE or when
// `hasUnalignedReads` is false, the single-word reader otherwise.
@system private uint read24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    static if (hasUnalignedReads)
        return __ctfe ? safeRead24(ptr, idx) : unalignedRead24(ptr, idx);
    else
        return safeRead24(ptr, idx);
}

// Writes 24-bit element `idx`, choosing the implementation like read24 does.
@system private void write24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    static if (hasUnalignedReads)
        return __ctfe ? safeWrite24(ptr, val, idx) : unalignedWrite24(ptr, val, idx);
    else
        return safeWrite24(ptr, val, idx);
}

/*
    Array of `uint` with a reference count kept in one extra slot right after
    the payload. Copies bump the count (postblit); mutating accessors detach
    from shared storage first. Memory is obtained and released through the
    policy `SP`. An empty array owns no storage and therefore has no count.
*/
struct CowArray(SP=GcPolicy)
{
    import std.range.primitives : hasLength;

    @safe:
    // Adopts `arr` as the payload; SP.append adds the ref-count slot set to 1.
    static auto reuse(uint[] arr)
    {
        CowArray cow;
        cow.data = arr;
        SP.append(cow.data, 1);
        assert(cow.refCount == 1);
        assert(cow.length == arr.length);
        return cow;
    }

    // Builds the array from a range that knows its length.
    this(Range)(Range range)
    if (isInputRange!Range && hasLength!Range)
    {
        import std.algorithm.mutation : copy;
        length = range.length;
        // Slice by the logical length instead of `$-1`: an empty range
        // allocates nothing, and `$-1` on an empty `data` would underflow.
        copy(range, data[0 .. this.length]);
    }

    // Builds the array from a forward range, measuring it on a saved copy.
    this(Range)(Range range)
    if (isForwardRange!Range && !hasLength!Range)
    {
        import std.algorithm.mutation : copy;
        import std.range.primitives : walkLength;
        immutable len = walkLength(range.save);
        length = len;
        // see above: safe for len == 0, where no storage exists
        copy(range, data[0 .. this.length]);
    }

    // a copy shares the storage, so count one more reference
    this(this)
    {
        if (!empty)
        {
            refCount = refCount + 1;
        }
    }

    // the last reference releases the storage, others just decrement
    ~this()
    {
        if (!empty)
        {
            immutable cnt = refCount;
            if (cnt == 1)
                SP.destroy(data);
            else
                refCount = cnt - 1;
        }
    }

    // no ref-count for empty U24 array
    @property bool empty() const { return data.length == 0; }

    // report one less than actual size
    @property size_t length() const
    {
        return data.length ?
data.length - 1 : 0;
    }

    /*
        Resizes to `len` elements (+ an extra slot for ref-count).
        - len == 0: releases this reference (if any) and leaves the array empty;
        - currently empty: allocates fresh storage with a count of 1;
        - storage shared (count != 1): decrements the shared count, allocates
          private storage and copies the elements that still fit;
        - sole owner: reallocates through the policy.
        In the last two cases the count is written only after `data` points at
        the new storage, so it lands in the new last slot.
    */
    @property void length(size_t len)
    {
        import std.algorithm.comparison : min;
        import std.algorithm.mutation : copy;
        if (len == 0)
        {
            if (!empty)
                freeThisReference();
            return;
        }
        immutable total = len + 1; // including ref-count
        if (empty)
        {
            data = SP.alloc!uint(total);
            refCount = 1;
            return;
        }
        immutable cur_cnt = refCount;
        if (cur_cnt != 1) // have more references to this memory
        {
            refCount = cur_cnt - 1;
            auto new_data = SP.alloc!uint(total);
            // take shrinking into account
            auto to_copy = min(total, data.length) - 1;
            copy(data[0 .. to_copy], new_data[0 .. to_copy]);
            data = new_data; // before setting refCount!
            refCount = 1;
        }
        else // 'this' is the only reference
        {
            // use the realloc (hopefully in-place operation)
            data = SP.realloc(data, total);
            refCount = 1; // setup a ref-count in the new end of the array
        }
    }

    alias opDollar = length;

    // Read access; indexes the raw storage directly (no detach needed).
    uint opIndex()(size_t idx)const
    {
        return data[idx];
    }

    // Write access; detaches from shared storage before storing.
    void opIndexAssign(uint val, size_t idx)
    {
        auto cnt = refCount;
        if (cnt != 1)
            dupThisReference(cnt);
        data[idx] = val;
    }

    // Mutable slice: the caller may write through it, so detach from shared
    // storage first (an empty array has no count to inspect).
    auto opSlice(size_t from, size_t to)
    {
        if (!empty)
        {
            auto cnt = refCount;
            if (cnt != 1)
                dupThisReference(cnt);
        }
        return data[from .. to];

    }

    // Read-only slice of the raw storage.
    auto opSlice(size_t from, size_t to) const
    {
        return data[from ..
to];
    }

    // length slices before the ref count
    auto opSlice()
    {
        return opSlice(0, length);
    }

    // ditto
    auto opSlice() const
    {
        return opSlice(0, length);
    }

    // Appends every element of `range`: grows by range.length, then copies
    // into the newly added tail.
    void append(Range)(Range range)
    if (isInputRange!Range && hasLength!Range && is(ElementType!Range : uint))
    {
        // imported locally, like the other members of this struct do
        import std.algorithm.mutation : copy;
        size_t nl = length + range.length;
        length = nl;
        copy(range, this[nl-range.length .. nl]);
    }

    // Appends the given values.
    void append()(uint[] val...)
    {
        // Nothing to add. Also keeps an empty array from evaluating `$-1`
        // on storage of length 0, which would underflow.
        if (val.length == 0)
            return;
        length = length + val.length;
        data[$-val.length-1 .. $-1] = val[];
    }

    // Equal when both are empty, or both are non-empty with equal payloads
    // (the trailing ref-count slot is excluded from the comparison).
    bool opEquals()(auto const ref CowArray rhs)const
    {
        if (empty ^ rhs.empty)
            return false; // one is empty and the other isn't
        return empty || data[0..$-1] == rhs.data[0..$-1];
    }

private:
    // ref-count is right after the data
    @property uint refCount() const
    {
        return data[$-1];
    }

    @property void refCount(uint cnt)
    {
        data[$-1] = cnt;
    }

    // Gives up this reference: decrements a shared count and forgets the
    // storage, or destroys the storage when this is the last reference.
    void freeThisReference()
    {
        immutable count = refCount;
        if (count != 1) // have more references to this memory
        {
            // dec shared ref-count
            refCount = count - 1;
            data = [];
        }
        else
            SP.destroy(data);
        assert(!data.ptr);
    }

    // Detaches from shared storage: `count` must be the current count and
    // must not be 1.
    void dupThisReference(uint count)
    in
    {
        assert(!empty && count != 1 && count == refCount);
    }
    do
    {
        import std.algorithm.mutation : copy;
        // dec shared ref-count
        refCount = count - 1;
        // copy to the new chunk of RAM
        auto new_data = SP.alloc!uint(data.length);
        // bit-blit old stuff except the counter
        copy(data[0..$-1], new_data[0..$-1]);
        data = new_data; // before setting refCount!
refCount = 1; // so that this updates the right one
    }

    uint[] data;
}

// Exercises CowArray with both memory policies: resizing, appending,
// copy-on-write isolation between copies, and opEquals.
pure @safe unittest// Uint24 tests
{
    import std.algorithm.comparison : equal;
    import std.algorithm.mutation : copy;
    import std.conv : text;
    import std.range : iota, chain;
    import std.range.primitives : isBidirectionalRange, isOutputRange;
    // mutates the argument while a copy (u24_c) must keep its old contents
    void funcRef(T)(ref T u24)
    {
        u24.length = 2;
        u24[1] = 1024;
        T u24_c = u24;
        assert(u24[1] == 1024);
        u24.length = 0;
        assert(u24.empty);
        u24.append([1, 2]);
        assert(equal(u24[], [1, 2]));
        u24.append(111);
        assert(equal(u24[], [1, 2, 111]));
        assert(!u24_c.empty && u24_c[1] == 1024);
        u24.length = 3;
        copy(iota(0, 3), u24[]);
        assert(equal(u24[], iota(0, 3)));
        assert(u24_c[1] == 1024);
    }

    // takes the array by value and shuffles copies around
    void func2(T)(T u24)
    {
        T u24_2 = u24;
        T u24_3;
        u24_3 = u24_2;
        assert(u24_2 == u24_3);
        assert(equal(u24[], u24_2[]));
        assert(equal(u24_2[], u24_3[]));
        funcRef(u24_3);

        assert(equal(u24_3[], iota(0, 3)));
        assert(!equal(u24_2[], u24_3[]));
        assert(equal(u24_2[], u24[]));
        u24_2 = u24_3;
        assert(equal(u24_2[], iota(0, 3)));
        // to test that passed arg is intact outside
        // plus try out opEquals
        u24 = u24_3;
        u24 = T.init;
        u24_3 = T.init;
        assert(u24.empty);
        assert(u24 == u24_3);
        assert(u24 != u24_2);
    }

    static foreach (Policy; AliasSeq!(GcPolicy, ReallocPolicy))
    {{
        alias Range = typeof(CowArray!Policy.init[]);
        alias U24A = CowArray!Policy;
        static assert(isForwardRange!Range);
        static assert(isBidirectionalRange!Range);
        static assert(isOutputRange!(Range, uint));
        static assert(isRandomAccessRange!(Range));

        auto arr = U24A([42u, 36, 100]);
        assert(arr[0] == 42);
        assert(arr[1] == 36);
        arr[0] = 72;
        arr[1] = 0xFE_FEFE;
        assert(arr[0] == 72);
        assert(arr[1] == 0xFE_FEFE);
        assert(arr[2] == 100);
        U24A arr2 = arr;
        assert(arr2[0] == 72);
        arr2[0] = 11;
        // test COW-ness
        assert(arr[0] == 72);
        assert(arr2[0] == 11);
        // set this to about 100M to stress-test COW memory management
        foreach (v; 0 .. 10_000)
            func2(arr);
        assert(equal(arr[], [72, 0xFE_FEFE, 100]));

        auto r2 = U24A(iota(0, 100));
        assert(equal(r2[], iota(0, 100)), text(r2[]));
        copy(iota(10, 170, 2), r2[10 .. 90]);
        assert(equal(r2[], chain(iota(0, 10), iota(10, 170, 2), iota(90, 100)))
               , text(r2[]));
    }}
}

// Interval insertion (`add`) and the marker helpers, for both policies.
pure @safe unittest// core set primitives test
{
    import std.conv : text;
    alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy);
    foreach (CodeList; AllSets)
    {
        CodeList a;
        //"plug a hole" test
        a.add(10, 20).add(25, 30).add(15, 27);
        assert(a == CodeList(10, 30), text(a));

        auto x = CodeList.init;
        x.add(10, 20).add(30, 40).add(50, 60);

        a = x;
        a.add(20, 49);//[10, 49) [50, 60)
        assert(a == CodeList(10, 49, 50 ,60));

        a = x;
        a.add(20, 50);
        assert(a == CodeList(10, 60), text(a));

        // simple unions, mostly edge effects
        x = CodeList.init;
        x.add(10, 20).add(40, 60);

        a = x;
        a.add(10, 25); //[10, 25) [40, 60)
        assert(a == CodeList(10, 25, 40, 60));

        a = x;
        a.add(5, 15); //[5, 20) [40, 60)
        assert(a == CodeList(5, 20, 40, 60));

        a = x;
        a.add(0, 10); // [0, 20) [40, 60)
        assert(a == CodeList(0, 20, 40, 60));

        a = x;
        a.add(0, 5); // prepend
        assert(a == CodeList(0, 5, 10, 20, 40, 60), text(a));

        a = x;
        a.add(5, 20);
        assert(a == CodeList(5, 20, 40, 60));

        a = x;
        a.add(3, 37);
        assert(a == CodeList(3, 37, 40, 60));

        a = x;
        a.add(37, 65);
        assert(a == CodeList(10, 20, 37, 65));

        // some tests on helpers for set
// intersection
        x = CodeList.init.add(10, 20).add(40, 60).add(100, 120);
        a = x;

        auto m = a.skipUpTo(60);
        a.dropUpTo(110, m);
        assert(a == CodeList(10, 20, 40, 60, 110, 120), text(a.data[]));

        a = x;
        a.dropUpTo(100);
        assert(a == CodeList(100, 120), text(a.data[]));

        a = x;
        m = a.skipUpTo(50);
        a.dropUpTo(140, m);
        assert(a == CodeList(10, 20, 40, 50), text(a.data[]));
        a = x;
        a.dropUpTo(60);
        assert(a == CodeList(100, 120), text(a.data[]));
    }
}


//test constructor to work with any order of intervals
pure @safe unittest
{
    import std.algorithm.comparison : equal;
    import std.conv : text, to;
    import std.range : chain, iota;
    import std.typecons : tuple;
    //ensure constructor handles bad ordering and overlap
    // The four character literals arrived mis-encoded (each showed up as two
    // replacement characters, which is not a valid character literal). They
    // are written as escapes so the file no longer depends on its encoding:
    // U+0430/U+044F are Cyrillic small a/ya, U+0410/U+042F capital A/YA, so
    // the higher (lowercase) interval is deliberately passed first.
    // NOTE(review): the exact original code points are an assumption --
    // confirm against the upstream test.
    auto c1 = CodepointSet('\u0430', '\u044F'+1, '\u0410','\u042F'+1);
    foreach (ch; chain(iota('\u0430', '\u044F'+1), iota('\u0410','\u042F'+1)))
        assert(ch in c1, to!string(ch));

    //contiguous
    assert(CodepointSet(1000, 1006, 1006, 1009)
        .byInterval.equal([tuple(1000, 1009)]));
    //contains
    assert(CodepointSet(900, 1200, 1000, 1100)
        .byInterval.equal([tuple(900, 1200)]));
    //intersect left
    assert(CodepointSet(900, 1100, 1000, 1200)
        .byInterval.equal([tuple(900, 1200)]));
    //intersect right
    assert(CodepointSet(1000, 1200, 900, 1100)
        .byInterval.equal([tuple(900, 1200)]));

    //ditto with extra items at end
    assert(CodepointSet(1000, 1200, 900, 1100, 800, 850)
        .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));
    assert(CodepointSet(900, 1100, 1000, 1200, 800, 850)
        .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));

    //"plug a hole" test
    auto c2 = CodepointSet(20, 40,
        60, 80, 100, 140, 150, 200,
        40, 60, 80, 100, 140, 150
    );
    assert(c2.byInterval.equal([tuple(20, 200)]));

    auto c3 = CodepointSet(
        20, 40, 60, 80, 100, 140,
150, 200,
        0, 10, 15, 100, 10, 20, 200, 220);
    assert(c3.byInterval.equal([tuple(0, 140), tuple(150, 220)]));
}


// Union (|), intersection (&), difference (-) and symmetric difference (~)
// of whole sets, checked in both operand orders for both policies.
pure @safe unittest
{   // full set operations
    import std.conv : text;
    alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy);
    foreach (CodeList; AllSets)
    {
        CodeList a, b, c, d;

        //"plug a hole"
        a.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b.add(40, 60).add(80, 100).add(140, 150);
        c = a | b;
        d = b | a;
        assert(c == CodeList(20, 200), text(CodeList.stringof," ", c));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(25, 45).add(65, 85).add(95,110).add(150, 210);
        c = a | b; //[20,45) [60, 85) [95, 140) [150, 210)
        d = b | a;
        assert(c == CodeList(20, 45, 60, 85, 95, 140, 150, 210), text(c));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(10, 20).add(30,100).add(145,200);
        c = a | b;//[10, 140) [145, 200)
        d = b | a;
        assert(c == CodeList(10, 140, 145, 200));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(0, 10).add(15, 100).add(10, 20).add(200, 220);
        c = a | b;//[0, 140) [150, 220)
        d = b | a;
        assert(c == CodeList(0, 140, 150, 220));
        assert(c == d, text(c," vs ", d));


        a = CodeList.init.add(20, 40).add(60, 80);
        b = CodeList.init.add(25, 35).add(65, 75);
        c = a & b;
        d = b & a;
        assert(c == CodeList(25, 35, 65, 75), text(c));
        assert(c == d, text(c," vs ", d));

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(25, 35).add(65, 75).add(110, 130).add(160, 180);
        c = a & b;
        d = b & a;
        assert(c == CodeList(25, 35, 65, 75, 110, 130, 160, 180), text(c));
        assert(c == d, text(c," vs ", d));

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(10, 30).add(60, 120).add(135, 160);
        c = a & b;//[20, 30)[60, 80) [100, 120) [135, 140) [150, 160)
        d = b & a;

        assert(c == CodeList(20, 30, 60, 80, 100, 120, 135, 140, 150, 160),text(c));
        assert(c == d, text(c, " vs ",d));
        assert((c & a) == c);
        assert((d & b) == d);
        assert((c & d) == d);

        b = CodeList.init.add(40, 60).add(80, 100).add(140, 200);
        c = a & b;
        d = b & a;
        assert(c == CodeList(150, 200), text(c));
        assert(c == d, text(c, " vs ",d));
        assert((c & a) == c);
        assert((d & b) == d);
        assert((c & d) == d);

        assert((a & a) == a);
        assert((b & b) == b);

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(30, 60).add(75, 120).add(190, 300);
        c = a - b;// [20, 30) [60, 75) [120, 140) [150, 190)
        d = b - a;// [40, 60) [80, 100) [200, 300)
        assert(c == CodeList(20, 30, 60, 75, 120, 140, 150, 190), text(c));
        assert(d == CodeList(40, 60, 80, 100, 200, 300), text(d));
        assert(c - d == c, text(c-d, " vs ", c));
        assert(d - c == d, text(d-c, " vs ", d));
        assert(c - c == CodeList.init);
        assert(d - d == CodeList.init);

        a = CodeList.init.add(20, 40).add( 60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(10, 50).add(60, 160).add(190, 300);
        c = a - b;// [160, 190)
        d = b - a;// [10, 20) [40, 50) [80, 100) [140, 150) [200, 300)
        assert(c == CodeList(160, 190), text(c));
        assert(d == CodeList(10, 20, 40, 50, 80, 100, 140, 150, 200, 300), text(d));
        assert(c - d == c, text(c-d, " vs ", c));
        assert(d - c == d, text(d-c, " vs ", d));
        assert(c - c == CodeList.init);
        assert(d - d == CodeList.init);

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(10, 30).add(45, 100).add(130, 190);
        c = a ~ b; // [10, 20) [30, 40) [45, 60) [80, 130) [140, 150) [190, 200)
        d = b ~ a;
        assert(c == CodeList(10, 20, 30, 40, 45, 60, 80, 130, 140, 150, 190, 200),
            text(c));
        assert(c == d, text(c, " vs ", d));
    }
}

}

// set difference and intersection against a single code point
pure @safe unittest// vs single dchar
{
    import std.conv : text;
    CodepointSet a = CodepointSet(10, 100, 120, 200);
    assert(a - 'A' == CodepointSet(10, 65, 66, 100, 120, 200), text(a - 'A'));
    assert((a & 'B') == CodepointSet(66, 67));
}

// byInterval / byCodepoint iteration and membership lookup via opIndex
pure @safe unittest// iteration & opIndex
{
    import std.algorithm.comparison : equal;
    import std.conv : text;
    import std.typecons : tuple, Tuple;

    static foreach (CodeList; AliasSeq!(InversionList!(ReallocPolicy)))
    {{
        auto arr = "ABCDEFGHIJKLMabcdefghijklm"d;
        auto a = CodeList('A','N','a', 'n');
        assert(equal(a.byInterval,
                [tuple(cast(uint)'A', cast(uint)'N'), tuple(cast(uint)'a', cast(uint)'n')]
            ), text(a.byInterval));

        // same @@@BUG as in https://issues.dlang.org/show_bug.cgi?id=8949 ?
        version (bug8949)
        {
            import std.range : retro;
            assert(equal(retro(a.byInterval),
                [tuple(cast(uint)'a', cast(uint)'n'), tuple(cast(uint)'A', cast(uint)'N')]
            ), text(retro(a.byInterval)));
        }
        auto achr = a.byCodepoint;
        assert(equal(achr, arr), text(a.byCodepoint));
        foreach (ch; a.byCodepoint)
            assert(a[ch]);
        auto x = CodeList(100, 500, 600, 900, 1200, 1500);
        assert(equal(x.byInterval, [ tuple(100, 500), tuple(600, 900), tuple(1200, 1500)]), text(x.byInterval));
        foreach (ch; x.byCodepoint)
            assert(x[ch]);
        static if (is(CodeList == CodepointSet))
        {
            auto y = CodeList(x.byInterval);
            assert(equal(x.byInterval, y.byInterval));
        }
        assert(equal(CodepointSet.init.byInterval, cast(Tuple!(uint, uint)[])[]));
        assert(equal(CodepointSet.init.byCodepoint, cast(dchar[])[]));
    }}
}

//============================================================================
// Generic Trie template and various ways to build it
//============================================================================

// Debug helper: textual dump of `x`, abbreviated to the first and last
// 16 elements when it holds more than 32.
auto arrayRepr(T)(T x)
{
    import std.conv : text;
    if (x.length <= 32)
        return text(x);
    return text(x[0 .. 16],"~...~", x[x.length-16 .. x.length]);
}

/**
    Maps `Key` to a suitable integer index within the range of `size_t`.
    The mapping is constructed by applying predicates from `Prefix` left to right
    and concatenating the resulting bits.

    The first (leftmost) predicate defines the most significant bits of
    the resulting index.
 */
template mapTrieIndex(Prefix...)
{
    size_t mapTrieIndex(Key)(Key key)
    if (isValidPrefixForTrie!(Key, Prefix))
    {
        size_t result;
        // every predicate but the last: merge its bits, then make room
        // for the bits of the predicate that follows
        foreach (level, _; Prefix[0 .. $-1])
        {
            result |= Prefix[level](key);
            result <<= Prefix[level+1].bitSize;
        }
        // the last predicate fills the least significant bits
        result |= Prefix[$-1](key);
        return result;
    }
}

/*
    `TrieBuilder` is a type used for incremental construction
    of $(LREF Trie)s.

    See $(LREF buildTrie) for generic helpers built on top of it.
*/
@trusted private struct TrieBuilder(Value, Key, Args...)
if (isBitPackableType!Value && isValidArgsForTrie!(Key, Args))
{
    import std.exception : enforce;

private:
    // last index is not stored in table, it is used as an offset to values in a block.
static if (is(Value == bool))// always pack bool
        alias V = BitPacked!(Value, 1);
    else
        alias V = Value;
    // number of distinct indices addressable by `Preds`:
    // the product of 2^^bitSize over all predicates
    static auto deduceMaxIndex(Preds...)()
    {
        size_t idx = 1;
        foreach (v; Preds)
            idx *= 2^^v.bitSize;
        return idx;
    }

    static if (is(typeof(Args[0]) : Key)) // Args start with upper bound on Key
    {
        alias Prefix = Args[1..$];
        enum lastPageSize = 2^^Prefix[$-1].bitSize;
        enum translatedMaxIndex = mapTrieIndex!(Prefix)(Args[0]);
        // upper bound rounded up to a whole number of last-level pages
        enum roughedMaxIndex =
            (translatedMaxIndex + lastPageSize-1)/lastPageSize*lastPageSize;
        // check wrap-around - if wrapped, use the default deduction rule
        enum maxIndex = roughedMaxIndex < translatedMaxIndex ?
            deduceMaxIndex!(Prefix)() : roughedMaxIndex;
    }
    else
    {
        alias Prefix = Args;
        enum maxIndex = deduceMaxIndex!(Prefix)();
    }

    alias getIndex = mapTrieIndex!(Prefix);

    enum lastLevel = Prefix.length-1;
    struct ConstructState
    {
        size_t idx_zeros, idx_ones;
    }
    // iteration over levels of Trie, each indexes its own level and thus a shortened domain
    size_t[Prefix.length] indices;
    // default filler value to use
    Value defValue;
    // this is a full-width index of next item
    size_t curIndex;
    // all-zeros page index, all-ones page index (+ indicator if there is such a page)
    ConstructState[Prefix.length] state;
    // the table being constructed
    MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), V) table;

    @disable this();

    //shortcut for index variable at level 'level'
    @property ref idx(size_t level)(){ return indices[level]; }

    // Writes `numVals` copies of `val` at level `level`, starting at the
    // current write position of that level and spilling to the next page
    // whenever a page boundary is reached.
    // this function assumes no holes in the input so
    // indices are going one by one
    void addValue(size_t level, T)(T val, size_t numVals)
    {
        alias j = idx!level;
        enum pageSize = 1 << Prefix[level].bitSize;
        if (numVals == 0)
            return;
        auto ptr = table.slice!(level);
        if (numVals == 1)
        {
            static if (level == Prefix.length-1)
                ptr[j] = val;
            else
            {// can incur narrowing conversion
                assert(j < ptr.length);
                ptr[j] = force!(typeof(ptr[j]))(val);
            }
            j++;
            if (j % pageSize == 0)
                spillToNextPage!level(ptr);
            return;
        }
        // longer row of values
        // get to the next page boundary
        immutable nextPB = (j + pageSize) & ~(pageSize-1);
        immutable n =  nextPB - j;// can fill right in this page
        if (numVals < n) //fits in current page
        {
            ptr[j .. j+numVals]  = val;
            j += numVals;
            return;
        }
        static if (level != 0)//on the first level it always fits
        {
            numVals -= n;
            //write till the end of current page
            ptr[j .. j+n]  = val;
            j += n;
            //spill to the next page
            spillToNextPage!level(ptr);
            // page at once loop
            // shortcut: when an all-zeros page is already known and `val` is
            // T.init, reference that page from the previous level instead of
            // writing whole pages again
            if (state[level].idx_zeros != size_t.max && val == T.init)
            {
                alias NextIdx = typeof(table.slice!(level-1)[0]);
                addValue!(level-1)(force!NextIdx(state[level].idx_zeros),
                    numVals/pageSize);
                ptr = table.slice!level; //table structure might have changed
                numVals %= pageSize;
            }
            else
            {
                while (numVals >= pageSize)
                {
                    numVals -= pageSize;
                    ptr[j .. j+pageSize]  = val;
                    j += pageSize;
                    spillToNextPage!level(ptr);
                }
            }
            if (numVals)
            {
                // the leftovers, an incomplete page
                ptr[j .. j+numVals]  = val;
                j += numVals;
            }
        }
    }

    void spillToNextPage(size_t level, Slice)(ref Slice ptr)
    {
        // last level (i.e. topmost) has 1 "page"
        // thus it need not to add a new page on upper level
        static if (level != 0)
            spillToNextPageImpl!(level)(ptr);
    }

    // this can re-use the current page if duplicate or allocate a new one
    // it also makes sure that previous levels point to the correct page in this level
    void spillToNextPageImpl(size_t level, Slice)(ref Slice ptr)
    {
        alias NextIdx = typeof(table.slice!(level-1)[0]);
        NextIdx next_lvl_index;
        enum pageSize = 1 << Prefix[level].bitSize;
        assert(idx!level % pageSize == 0);
        immutable last = idx!level-pageSize;
        // the page that has just been completed
        const slice = ptr[idx!level - pageSize .. idx!level];
        size_t j;
        // look for an identical page among all earlier pages of this level
        for (j=0; j<last; j+=pageSize)
        {
            if (ptr[j .. j+pageSize] == slice)
            {
                // get index to it, reuse ptr space for the next block
                next_lvl_index = force!NextIdx(j/pageSize);
                version (none)
                {
                import std.stdio : writefln, writeln;
                writefln("LEVEL(%s) page mapped idx: %s: 0..%s  ---> [%s..%s]"
                        ,level
                        ,indices[level-1], pageSize, j, j+pageSize);
                writeln("LEVEL(", level
                        , ") mapped page is: ", slice, ": ", arrayRepr(ptr[j .. j+pageSize]));
                writeln("LEVEL(", level
                        , ") src page is :", ptr, ": ", arrayRepr(slice[0 .. pageSize]));
                }
                idx!level -= pageSize; // reuse this page, it is duplicate
                break;
            }
        }
        // no duplicate found: keep the page and grow the level by one page
        if (j == last)
        {
    // NOTE(review): neither this label nor L_know_index below is the target
    // of a goto anywhere in this function.
    L_allocate_page:
            next_lvl_index = force!NextIdx(idx!level/pageSize - 1);
            // remember the first all-zeros page of this level
            if (state[level].idx_zeros == size_t.max && ptr.zeros(j, j+pageSize))
            {
                state[level].idx_zeros = next_lvl_index;
            }
            // allocate next page
            version (none)
            {
            import std.stdio : writefln;
            writefln("LEVEL(%s) page allocated: %s"
                     , level, arrayRepr(slice[0 .. pageSize]));
            writefln("LEVEL(%s) index: %s ; page at this index %s"
                     , level
                     , next_lvl_index
                     , arrayRepr(
                         table.slice!(level)
                          [pageSize*next_lvl_index..(next_lvl_index+1)*pageSize]
                        ));
            }
            table.length!level = table.length!level + pageSize;
        }
    L_know_index:
        // for the previous level, values are indices to the pages in the current level
        addValue!(level-1)(next_lvl_index, 1);
        ptr = table.slice!level; //re-load the slice after moves
    }

    // idx - full-width index to fill with v (full-width index != key)
    // fills everything in the range of [curIndex, idx) with filler
    void putAt(size_t idx, Value v)
    {
        assert(idx >= curIndex);
        immutable numFillers = idx - curIndex;
        addValue!lastLevel(defValue, numFillers);
        addValue!lastLevel(v, 1);
        curIndex = idx + 1;
    }

    // ditto, but sets the range of [idxA, idxB) to v
    void putRangeAt(size_t idxA, size_t idxB, Value v)
    {
        assert(idxA >= curIndex);
        assert(idxB >= idxA);
        size_t numFillers = idxA - curIndex;
        addValue!lastLevel(defValue, numFillers);
        addValue!lastLevel(v, idxB - idxA);
        curIndex = idxB; // open-right
    }

    enum errMsg = "non-monotonic prefix function(s), an unsorted range or "~
        "duplicate key->value mapping";

public:
    /**
        Construct a builder, where `filler` is a value
        to indicate empty slots (or "not found" condition).
    */
    this(Value filler)
    {
        curIndex = 0;
        defValue = filler;
        // zeros-page index, ones-page index
        foreach (ref v; state)
            v = ConstructState(size_t.max, size_t.max);
        table = typeof(table)(indices);
        // one page per level is a bootstrap minimum
        foreach (i, Pred; Prefix)
            table.length!i = (1 << Pred.bitSize);
    }

    /**
        Put a value `v` into interval as
        mapped by keys from `a` to `b`.
        All slots prior to `a` are filled with
        the default filler.
    */
    void putRange(Key a, Key b, Value v)
    {
        auto idxA = getIndex(a), idxB = getIndex(b);
        // indexes of key should always grow
        enforce(idxB >= idxA && idxA >= curIndex, errMsg);
        putRangeAt(idxA, idxB, v);
    }

    /**
        Put a value `v` into slot mapped by `key`.
        All slots prior to `key` are filled with the
        default filler.
    */
    void putValue(Key key, Value v)
    {
        auto idx = getIndex(key);
        enforce(idx >= curIndex, errMsg);
        putAt(idx, v);
    }

    /// Finishes construction of Trie, yielding an immutable Trie instance.
    auto build()
    {
        static if (maxIndex != 0) // doesn't cover full range of size_t
        {
            assert(curIndex <= maxIndex);
            // pad the remainder of the domain with the filler
            addValue!lastLevel(defValue, maxIndex - curIndex);
        }
        else
        {
            if (curIndex != 0 // couldn't wrap around
                || (Prefix.length != 1 && indices[lastLevel] == 0)) // can be just empty
            {
                addValue!lastLevel(defValue, size_t.max - curIndex);
                addValue!lastLevel(defValue, 1);
            }
            // else curIndex already completed the full range of size_t by wrapping around
        }
        return Trie!(V, Key, maxIndex, Prefix)(table);
    }
}

/**
    $(P A generic Trie data-structure for a fixed number of stages.
    The design goal is optimal speed with smallest footprint size.
    )
    $(P It's intentionally read-only and doesn't provide constructors.
     To construct one use a special builder,
     see $(LREF TrieBuilder) and $(LREF buildTrie).
    )

*/
@trusted private struct Trie(Value, Key, Args...)
if (isValidPrefixForTrie!(Key, Args)
    || (isValidPrefixForTrie!(Key, Args[1..$])
    && is(typeof(Args[0]) : size_t)))
{
    import std.range.primitives : isOutputRange;
    // A leading argument convertible to size_t is taken as the exclusive
    // upper bound on indices and enables the assertion in opIndex.
    static if (is(typeof(Args[0]) : size_t))
    {
        private enum maxIndex = Args[0];
        private enum hasBoundsCheck = true;
        private alias Prefix = Args[1..$];
    }
    else
    {
        private enum hasBoundsCheck = false;
        private alias Prefix = Args;
    }

    // used by TrieBuilder.build to hand over the finished table
    private this()(typeof(_table) table)
    {
        _table = table;
    }

    // only for constant Tries constructed from precompiled tables
    private this()(const(size_t)[] offsets, const(size_t)[] sizes,
        const(size_t)[] data) const
    {
        _table = typeof(_table)(offsets, sizes, data);
    }

    /**
        $(P Lookup the `key` in this `Trie`. )

        $(P The lookup always succeeds if key fits the domain
        provided during construction. The whole domain defined
        is covered so instead of not found condition
        the sentinel (filler) value could be used. )

        $(P See $(LREF buildTrie), $(LREF TrieBuilder) for how to
        define a domain of `Trie` keys and the sentinel value. )

        Note:
        Domain range-checking is only enabled in debug builds
        and results in assertion failure.
    */
    TypeOfBitPacked!Value opIndex()(Key key) const
    {
        static if (hasBoundsCheck)
            assert(mapTrieIndex!Prefix(key) < maxIndex);
        size_t idx;
        alias p = Prefix;
        // start with the bits selected by the first predicate
        idx = cast(size_t) p[0](key);
        // each level stores a page number: shift it by the next level's
        // bit width and add the offset the next predicate selects
        foreach (i, v; p[0..$-1])
            idx = cast(size_t)((_table.ptr!i[idx]<<p[i+1].bitSize) + p[i+1](key));
        return _table.ptr!(p.length-1)[idx];
    }

    /// Size in bytes as reported by the underlying table for level `n`.
    @property size_t bytes(size_t n=size_t.max)() const
    {
        return _table.bytes!n;
    }

    /// `bytes!n` divided by the page size of level `n`, rounded to nearest.
    @property size_t pages(size_t n)() const
    {
        return (bytes!n+2^^(Prefix[n].bitSize-1))
                /2^^Prefix[n].bitSize;
    }

    /// Forwards to the underlying table's `store` with the given sink.
    void store(OutRange)(scope OutRange sink) const
    if (isOutputRange!(OutRange, char))
    {
        _table.store(sink);
    }

private:
    MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), Value) _table;
}

// create a tuple of 'sliceBits' that slice the 'top' of bits into pieces of sizes 'sizes'
// left-to-right, the most significant bits first
template GetBitSlicing(size_t top, sizes...)
{
    static if (sizes.length > 0)
        alias GetBitSlicing =
            AliasSeq!(sliceBits!(top - sizes[0], top),
                      GetBitSlicing!(top - sizes[0], sizes[1..$]));
    else
        alias GetBitSlicing = AliasSeq!();
}

// callableWith!T yields a predicate template: true for `Pred` when
// `Pred(T.init)` compiles and its result type is bit-packable.
template callableWith(T)
{
    template callableWith(alias Pred)
    {
        static if (!is(typeof(Pred(T.init))))
            enum callableWith = false;
        else
        {
            alias Result = typeof(Pred(T.init));
            enum callableWith = isBitPackableType!(TypeOfBitPacked!(Result));
        }
    }
}

/*
    Check if `Prefix` is a valid set of predicates
    for `Trie` template having `Key` as the type of keys.
    This requires all predicates to be callable, take
    single argument of type `Key` and return unsigned value.
*/
template isValidPrefixForTrie(Key, Prefix...)
/*
    Check if `Args` is a valid argument list for building a `Trie`
    having `Key` as the type of keys: either a set of predicates accepted by
    `isValidPrefixForTrie`, or a maximum key value (convertible to `Key`)
    followed by such predicates.

    At least one predicate is required; an empty `Args` is rejected.
*/
template isValidArgsForTrie(Key, Args...)
{
    static if (Args.length > 1)
    {
        enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args)
            || (isValidPrefixForTrie!(Key, Args[1..$]) && is(typeof(Args[0]) : Key));
    }
    else
    {
        // A single argument can only be a predicate. `Key` has to be passed
        // along explicitly, otherwise the predicate itself gets bound to the
        // `Key` parameter of isValidPrefixForTrie and is never checked.
        enum isValidArgsForTrie = Args.length == 1
            && isValidPrefixForTrie!(Key, Args);
    }
}
/**
    A slightly more general tool for building fixed `Trie`
    for the Unicode data.

    Specifically unlike `codepointSetTrie` it allows creating mappings
    of `dchar` to an arbitrary type `T`.

    Note: Overload taking `CodepointSet`s will naturally convert
    only to bool mapping `Trie`s.

    CodepointTrie is the type of Trie as generated by codepointTrie function.
*/
public template codepointTrie(T, sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);

    static if (is(TypeOfBitPacked!T == bool))
    {
        // Set overload: delegates to codepointSetTrie with the same
        // per-level bit sizes. The sizes must be forwarded explicitly,
        // codepointSetTrie cannot be instantiated without them.
        // NOTE(review): `set` is taken as `const scope`; confirm that
        // codepointSetTrie accepts a const-qualified set.
        auto codepointTrie(Set)(const scope Set set)
        if (isCodepointSet!Set)
        {
            return codepointSetTrie!(sizes)(set);
        }
    }

    /// Builds the trie from an associative array; `defValue` is the filler
    /// passed on to `buildTrie`.
    auto codepointTrie()(T[dchar] map, T defValue=T.init)
    {
        return buildTrie!(T, dchar, Prefix)(map, defValue);
    }

    // unsorted range of pairs
    /// Builds the trie from a range of (value, code point) pairs
    /// that does not have to be sorted.
    auto codepointTrie(R)(R range, T defValue=T.init)
    if (isInputRange!R
        && is(typeof(ElementType!R.init[0]) : T)
        && is(typeof(ElementType!R.init[1]) : dchar))
    {
        // build from unsorted array of pairs
        // TODO: expose index sorting functions for Trie
        return buildTrie!(T, dchar, Prefix)(range, defValue, true);
    }
}
/*
    Produces a "less than" comparator for (value, key) tuples that orders
    them by the result of applying `Pred` to the key (the second field).
    The value (the first field) does not take part in the comparison.
*/
package(std) template cmpK0(alias Pred)
{
    import std.typecons : Tuple;

    static bool cmpK0(Value, Key)
        (Tuple!(Value, Key) a, Tuple!(Value, Key) b)
    {
        auto lhs = Pred(a[1]);
        auto rhs = Pred(b[1]);
        return lhs < rhs;
    }
}
4425 4426 Alternatively if the first argument is not a value convertible to `Key` 4427 then the whole tuple of `Args` is treated as predicates 4428 and the maximum Key is deduced from predicates. 4429*/ 4430private template buildTrie(Value, Key, Args...) 4431if (isValidArgsForTrie!(Key, Args)) 4432{ 4433 static if (is(typeof(Args[0]) : Key)) // prefix starts with upper bound on Key 4434 { 4435 alias Prefix = Args[1..$]; 4436 } 4437 else 4438 alias Prefix = Args; 4439 4440 alias getIndex = mapTrieIndex!(Prefix); 4441 4442 // for multi-sort 4443 template GetComparators(size_t n) 4444 { 4445 static if (n > 0) 4446 alias GetComparators = 4447 AliasSeq!(GetComparators!(n-1), cmpK0!(Prefix[n-1])); 4448 else 4449 alias GetComparators = AliasSeq!(); 4450 } 4451 4452 /* 4453 Build `Trie` from a range of a Key-Value pairs, 4454 assuming it is sorted by Key as defined by the following lambda: 4455 ------ 4456 (a, b) => mapTrieIndex!(Prefix)(a) < mapTrieIndex!(Prefix)(b) 4457 ------ 4458 Exception is thrown if it's detected that the above order doesn't hold. 4459 4460 In other words $(LREF mapTrieIndex) should be a 4461 monotonically increasing function that maps `Key` to an integer. 4462 4463 See_Also: $(REF sort, std,_algorithm), 4464 $(REF SortedRange, std,range), 4465 $(REF setUnion, std,_algorithm). 4466 */ 4467 auto buildTrie(Range)(Range range, Value filler=Value.init) 4468 if (isInputRange!Range && is(typeof(Range.init.front[0]) : Value) 4469 && is(typeof(Range.init.front[1]) : Key)) 4470 { 4471 auto builder = TrieBuilder!(Value, Key, Prefix)(filler); 4472 foreach (v; range) 4473 builder.putValue(v[1], v[0]); 4474 return builder.build(); 4475 } 4476 4477 /* 4478 If `Value` is bool (or BitPacked!(bool, x)) then it's possible 4479 to build `Trie` from a range of open-right intervals of `Key`s. 4480 The requirement on the ordering of keys (and the behavior on the 4481 violation of it) is the same as for Key-Value range overload. 
4482 4483 Intervals denote ranges of !`filler` i.e. the opposite of filler. 4484 If no filler provided keys inside of the intervals map to true, 4485 and `filler` is false. 4486 */ 4487 auto buildTrie(Range)(Range range, Value filler=Value.init) 4488 if (is(TypeOfBitPacked!Value == bool) 4489 && isInputRange!Range && is(typeof(Range.init.front[0]) : Key) 4490 && is(typeof(Range.init.front[1]) : Key)) 4491 { 4492 auto builder = TrieBuilder!(Value, Key, Prefix)(filler); 4493 foreach (ival; range) 4494 builder.putRange(ival[0], ival[1], !filler); 4495 return builder.build(); 4496 } 4497 4498 auto buildTrie(Range)(Range range, Value filler, bool unsorted) 4499 if (isInputRange!Range 4500 && is(typeof(Range.init.front[0]) : Value) 4501 && is(typeof(Range.init.front[1]) : Key)) 4502 { 4503 import std.algorithm.sorting : multiSort; 4504 alias Comps = GetComparators!(Prefix.length); 4505 if (unsorted) 4506 multiSort!(Comps)(range); 4507 return buildTrie(range, filler); 4508 } 4509 4510 /* 4511 If `Value` is bool (or BitPacked!(bool, x)) then it's possible 4512 to build `Trie` simply from an input range of `Key`s. 4513 The requirement on the ordering of keys (and the behavior on the 4514 violation of it) is the same as for Key-Value range overload. 4515 4516 Keys found in range denote !`filler` i.e. the opposite of filler. 4517 If no filler provided keys map to true, and `filler` is false. 4518 */ 4519 auto buildTrie(Range)(Range range, Value filler=Value.init) 4520 if (is(TypeOfBitPacked!Value == bool) 4521 && isInputRange!Range && is(typeof(Range.init.front) : Key)) 4522 { 4523 auto builder = TrieBuilder!(Value, Key, Prefix)(filler); 4524 foreach (v; range) 4525 builder.putValue(v, !filler); 4526 return builder.build(); 4527 } 4528 4529 /* 4530 If `Key` is unsigned integer `Trie` could be constructed from array 4531 of values where array index serves as key. 
/**
    Conceptual type that outlines the common properties of all UTF Matchers.

    Note: For illustration purposes only, every method
    call results in assertion failure.
    Use $(LREF utfMatcher) to obtain a concrete matcher
    for UTF-8 or UTF-16 encodings.
*/
public struct MatcherConcept
{
    /**
        $(P Perform the semantic equivalent of 2 operations:
        decoding a $(CODEPOINT) at front of `inp` and testing if
        it belongs to the set of $(CODEPOINTS) of this matcher. )

        $(P The effect on `inp` depends on the kind of function called:)

        $(P Match. If the codepoint is found in the set then range `inp`
        is advanced by its size in $(S_LINK Code unit, code units),
        otherwise the range is not modified.)

        $(P Skip. The range is always advanced by the size
        of the tested $(CODEPOINT) regardless of the result of test.)

        $(P Test. The range is left unaffected regardless
        of the result of test.)
    */
    public bool match(Range)(ref Range inp)
    if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(false);
    }

    ///ditto
    public bool skip(Range)(ref Range inp)
    if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(false);
    }

    ///ditto
    public bool test(Range)(ref Range inp)
    if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(false);
    }
    ///
    pure @safe unittest
    {
        string truth = "2² = 4";
        auto m = utfMatcher!char(unicode.Number);
        assert(m.match(truth)); // '2' is a number all right
        assert(truth == "² = 4"); // skips on match
        assert(m.match(truth)); // so is the superscript '2'
        assert(!m.match(truth)); // space is not a number
        assert(truth == " = 4"); // unaffected on no match
        assert(!m.skip(truth)); // same test ...
        assert(truth == "= 4"); // but skips a codepoint regardless
        assert(!m.test(truth)); // '=' is not a number
        assert(truth == "= 4"); // test never affects argument
    }

    /**
        Advanced feature - provide direct access to a subset of matcher based on
        a set of known encoding lengths. Lengths are provided in
        $(S_LINK Code unit, code units). The sub-matcher then may do fewer
        operations per any `test`/`match`.

        Use with care as the sub-matcher won't match
        any $(CODEPOINTS) that have encoded length that doesn't belong
        to the selected set of lengths. Also the sub-matcher object references
        the parent matcher and must not be used past the lifetime
        of the latter.

        Another caveat of using sub-matcher is that skip is not available
        precisely because sub-matcher doesn't detect all lengths.
    */
    @property auto subMatcher(Lengths...)()
    {
        assert(0);
        return this;
    }

    pure @safe unittest
    {
        auto m = utfMatcher!char(unicode.Number);
        string square = "2²";
        // about sub-matchers
        assert(!m.subMatcher!(2,3,4).test(square)); // ASCII is not covered
        assert(m.subMatcher!1.match(square)); // ASCII-only, works
        assert(!m.subMatcher!1.test(square)); // unicode '²'
        assert(m.subMatcher!(2,3,4).match(square));  //
        assert(square == "");
        wstring wsquare = "2²";
        auto m16 = utfMatcher!wchar(unicode.Number);
        // may keep ref, but the original (m16) must be kept alive
        auto bmp = m16.subMatcher!1;
        assert(bmp.match(wsquare)); // Okay, in basic multilingual plane
        assert(bmp.match(wsquare)); // And '²' too
    }
}
type = typeof(byCodeUnit(str)); 4710 return mixin(fn~"(*cast(type*)&str)"); 4711 } 4712} 4713 4714template Utf8Matcher() 4715{ 4716 enum validSize(int sz) = sz >= 1 && sz <= 4; 4717 4718 void badEncoding() pure @safe 4719 { 4720 import std.utf : UTFException; 4721 throw new UTFException("Invalid UTF-8 sequence"); 4722 } 4723 4724 //for 1-stage ASCII 4725 alias AsciiSpec = AliasSeq!(bool, char, clamp!7); 4726 //for 2-stage lookup of 2 byte UTF-8 sequences 4727 alias Utf8Spec2 = AliasSeq!(bool, char[2], 4728 clampIdx!(0, 5), clampIdx!(1, 6)); 4729 //ditto for 3 byte 4730 alias Utf8Spec3 = AliasSeq!(bool, char[3], 4731 clampIdx!(0, 4), 4732 clampIdx!(1, 6), 4733 clampIdx!(2, 6) 4734 ); 4735 //ditto for 4 byte 4736 alias Utf8Spec4 = AliasSeq!(bool, char[4], 4737 clampIdx!(0, 3), clampIdx!(1, 6), 4738 clampIdx!(2, 6), clampIdx!(3, 6) 4739 ); 4740 alias Tables = AliasSeq!( 4741 typeof(TrieBuilder!(AsciiSpec)(false).build()), 4742 typeof(TrieBuilder!(Utf8Spec2)(false).build()), 4743 typeof(TrieBuilder!(Utf8Spec3)(false).build()), 4744 typeof(TrieBuilder!(Utf8Spec4)(false).build()) 4745 ); 4746 alias Table(int size) = Tables[size-1]; 4747 4748 enum leadMask(size_t size) = (cast(size_t) 1<<(7 - size))-1; 4749 enum encMask(size_t size) = ((1 << size)-1)<<(8-size); 4750 4751 char truncate()(char ch) pure @safe 4752 { 4753 ch -= 0x80; 4754 if (ch < 0x40) 4755 { 4756 return ch; 4757 } 4758 else 4759 { 4760 badEncoding(); 4761 return cast(char) 0; 4762 } 4763 } 4764 4765 static auto encode(size_t sz)(dchar ch) 4766 if (sz > 1) 4767 { 4768 import std.utf : encodeUTF = encode; 4769 char[4] buf; 4770 encodeUTF(buf, ch); 4771 char[sz] ret; 4772 buf[0] &= leadMask!sz; 4773 foreach (n; 1 .. sz) 4774 buf[n] = buf[n] & 0x3f; //keep 6 lower bits 4775 ret[] = buf[0 .. 
sz]; 4776 return ret; 4777 } 4778 4779 auto build(Set)(Set set) 4780 { 4781 import std.algorithm.iteration : map; 4782 auto ascii = set & unicode.ASCII; 4783 auto utf8_2 = set & CodepointSet(0x80, 0x800); 4784 auto utf8_3 = set & CodepointSet(0x800, 0x1_0000); 4785 auto utf8_4 = set & CodepointSet(0x1_0000, lastDchar+1); 4786 auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec); 4787 auto utf8_2T = utf8_2.byCodepoint.map!(x=>encode!2(x)).buildTrie!(Utf8Spec2); 4788 auto utf8_3T = utf8_3.byCodepoint.map!(x=>encode!3(x)).buildTrie!(Utf8Spec3); 4789 auto utf8_4T = utf8_4.byCodepoint.map!(x=>encode!4(x)).buildTrie!(Utf8Spec4); 4790 alias Ret = Impl!(1,2,3,4); 4791 return Ret(asciiT, utf8_2T, utf8_3T, utf8_4T); 4792 } 4793 4794 // Bootstrap UTF-8 static matcher interface 4795 // from 3 primitives: tab!(size), lookup and Sizes 4796 mixin template DefMatcher() 4797 { 4798 import std.format : format; 4799 import std.meta : Erase, staticIndexOf; 4800 enum hasASCII = staticIndexOf!(1, Sizes) >= 0; 4801 alias UniSizes = Erase!(1, Sizes); 4802 4803 //generate dispatch code sequence for unicode parts 4804 static auto genDispatch() 4805 { 4806 string code; 4807 foreach (size; UniSizes) 4808 code ~= format(q{ 4809 if ((ch & ~leadMask!%d) == encMask!(%d)) 4810 return lookup!(%d, mode)(inp); 4811 else 4812 }, size, size, size); 4813 static if (Sizes.length == 4) //covers all code unit cases 4814 code ~= "{ badEncoding(); return false; }"; 4815 else 4816 code ~= "return false;"; //may be just fine but not covered 4817 return code; 4818 } 4819 enum dispatch = genDispatch(); 4820 4821 public bool match(Range)(ref Range inp) const 4822 if (isRandomAccessRange!Range && is(ElementType!Range : char) && 4823 !isDynamicArray!Range) 4824 { 4825 enum mode = Mode.skipOnMatch; 4826 assert(!inp.empty); 4827 immutable ch = inp[0]; 4828 static if (hasASCII) 4829 { 4830 if (ch < 0x80) 4831 { 4832 immutable r = tab!1[ch]; 4833 if (r) 4834 inp.popFront(); 4835 return r; 4836 } 
4837 else 4838 mixin(dispatch); 4839 } 4840 else 4841 mixin(dispatch); 4842 } 4843 4844 static if (Sizes.length == 4) // can skip iff can detect all encodings 4845 { 4846 public bool skip(Range)(ref Range inp) const 4847 if (isRandomAccessRange!Range && is(ElementType!Range : char) && 4848 !isDynamicArray!Range) 4849 { 4850 enum mode = Mode.alwaysSkip; 4851 assert(!inp.empty); 4852 auto ch = inp[0]; 4853 static if (hasASCII) 4854 { 4855 if (ch < 0x80) 4856 { 4857 inp.popFront(); 4858 return tab!1[ch]; 4859 } 4860 else 4861 mixin(dispatch); 4862 } 4863 else 4864 mixin(dispatch); 4865 } 4866 } 4867 4868 public bool test(Range)(ref Range inp) const 4869 if (isRandomAccessRange!Range && is(ElementType!Range : char) && 4870 !isDynamicArray!Range) 4871 { 4872 enum mode = Mode.neverSkip; 4873 assert(!inp.empty); 4874 auto ch = inp[0]; 4875 static if (hasASCII) 4876 { 4877 if (ch < 0x80) 4878 return tab!1[ch]; 4879 else 4880 mixin(dispatch); 4881 } 4882 else 4883 mixin(dispatch); 4884 } 4885 4886 bool match(C)(ref C[] str) const 4887 if (isSomeChar!C) 4888 { 4889 return fwdStr!"match"(str); 4890 } 4891 4892 bool skip(C)(ref C[] str) const 4893 if (isSomeChar!C) 4894 { 4895 return fwdStr!"skip"(str); 4896 } 4897 4898 bool test(C)(ref C[] str) const 4899 if (isSomeChar!C) 4900 { 4901 return fwdStr!"test"(str); 4902 } 4903 4904 mixin ForwardStrings; 4905 } 4906 4907 struct Impl(Sizes...) 
4908 { 4909 import std.meta : allSatisfy, staticMap; 4910 static assert(allSatisfy!(validSize, Sizes), 4911 "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8"); 4912 private: 4913 //pick tables for chosen sizes 4914 alias OurTabs = staticMap!(Table, Sizes); 4915 OurTabs tables; 4916 mixin DefMatcher; 4917 //static disptach helper UTF size ==> table 4918 alias tab(int i) = tables[i - 1]; 4919 4920 package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)() 4921 { 4922 return CherryPick!(Impl, SizesToPick)(&this); 4923 } 4924 4925 bool lookup(int size, Mode mode, Range)(ref Range inp) const 4926 { 4927 import std.range : popFrontN; 4928 if (inp.length < size) 4929 { 4930 badEncoding(); 4931 return false; 4932 } 4933 char[size] needle = void; 4934 needle[0] = leadMask!size & inp[0]; 4935 static foreach (i; 1 .. size) 4936 { 4937 needle[i] = truncate(inp[i]); 4938 } 4939 //overlong encoding checks 4940 static if (size == 2) 4941 { 4942 //0x80-0x7FF 4943 //got 6 bits in needle[1], must use at least 8 bits 4944 //must use at least 2 bits in needle[1] 4945 if (needle[0] < 2) badEncoding(); 4946 } 4947 else static if (size == 3) 4948 { 4949 //0x800-0xFFFF 4950 //got 6 bits in needle[2], must use at least 12bits 4951 //must use 6 bits in needle[1] or anything in needle[0] 4952 if (needle[0] == 0 && needle[1] < 0x20) badEncoding(); 4953 } 4954 else static if (size == 4) 4955 { 4956 //0x800-0xFFFF 4957 //got 2x6=12 bits in needle[2 .. 
3] must use at least 17bits 4958 //must use 5 bits (or above) in needle[1] or anything in needle[0] 4959 if (needle[0] == 0 && needle[1] < 0x10) badEncoding(); 4960 } 4961 static if (mode == Mode.alwaysSkip) 4962 { 4963 inp.popFrontN(size); 4964 return tab!size[needle]; 4965 } 4966 else static if (mode == Mode.neverSkip) 4967 { 4968 return tab!size[needle]; 4969 } 4970 else 4971 { 4972 static assert(mode == Mode.skipOnMatch); 4973 if (tab!size[needle]) 4974 { 4975 inp.popFrontN(size); 4976 return true; 4977 } 4978 else 4979 return false; 4980 } 4981 } 4982 } 4983 4984 struct CherryPick(I, Sizes...) 4985 { 4986 import std.meta : allSatisfy; 4987 static assert(allSatisfy!(validSize, Sizes), 4988 "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8"); 4989 private: 4990 I* m; 4991 @property auto tab(int i)() const { return m.tables[i - 1]; } 4992 bool lookup(int size, Mode mode, Range)(ref Range inp) const 4993 { 4994 return m.lookup!(size, mode)(inp); 4995 } 4996 mixin DefMatcher; 4997 } 4998} 4999 5000template Utf16Matcher() 5001{ 5002 enum validSize(int sz) = sz >= 1 && sz <= 2; 5003 5004 void badEncoding() pure @safe 5005 { 5006 import std.utf : UTFException; 5007 throw new UTFException("Invalid UTF-16 sequence"); 5008 } 5009 5010 // 1-stage ASCII 5011 alias AsciiSpec = AliasSeq!(bool, wchar, clamp!7); 5012 //2-stage BMP 5013 alias BmpSpec = AliasSeq!(bool, wchar, sliceBits!(7, 16), sliceBits!(0, 7)); 5014 //4-stage - full Unicode 5015 //assume that 0xD800 & 0xDC00 bits are cleared 5016 //thus leaving 10 bit per wchar to worry about 5017 alias UniSpec = AliasSeq!(bool, wchar[2], 5018 assumeSize!(x=>x[0]>>4, 6), assumeSize!(x=>x[0]&0xf, 4), 5019 assumeSize!(x=>x[1]>>6, 4), assumeSize!(x=>x[1]&0x3f, 6), 5020 ); 5021 alias Ascii = typeof(TrieBuilder!(AsciiSpec)(false).build()); 5022 alias Bmp = typeof(TrieBuilder!(BmpSpec)(false).build()); 5023 alias Uni = typeof(TrieBuilder!(UniSpec)(false).build()); 5024 5025 auto encode2(dchar ch) 5026 { 5027 ch -= 
0x1_0000; 5028 assert(ch <= 0xF_FFFF); 5029 wchar[2] ret; 5030 //do not put surrogate bits, they are sliced off 5031 ret[0] = cast(wchar)(ch >> 10); 5032 ret[1] = (ch & 0xFFF); 5033 return ret; 5034 } 5035 5036 auto build(Set)(Set set) 5037 { 5038 import std.algorithm.iteration : map; 5039 auto ascii = set & unicode.ASCII; 5040 auto bmp = (set & CodepointSet.fromIntervals(0x80, 0xFFFF+1)) 5041 - CodepointSet.fromIntervals(0xD800, 0xDFFF+1); 5042 auto other = set - (bmp | ascii); 5043 auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec); 5044 auto bmpT = bmp.byCodepoint.map!(x=>cast(wchar) x).buildTrie!(BmpSpec); 5045 auto otherT = other.byCodepoint.map!(x=>encode2(x)).buildTrie!(UniSpec); 5046 alias Ret = Impl!(1,2); 5047 return Ret(asciiT, bmpT, otherT); 5048 } 5049 5050 //bootstrap full UTF-16 matcher interace from 5051 //sizeFlags, lookupUni and ascii 5052 mixin template DefMatcher() 5053 { 5054 public bool match(Range)(ref Range inp) const 5055 if (isRandomAccessRange!Range && is(ElementType!Range : wchar) && 5056 !isDynamicArray!Range) 5057 { 5058 enum mode = Mode.skipOnMatch; 5059 assert(!inp.empty); 5060 immutable ch = inp[0]; 5061 static if (sizeFlags & 1) 5062 { 5063 if (ch < 0x80) 5064 { 5065 if (ascii[ch]) 5066 { 5067 inp.popFront(); 5068 return true; 5069 } 5070 else 5071 return false; 5072 } 5073 return lookupUni!mode(inp); 5074 } 5075 else 5076 return lookupUni!mode(inp); 5077 } 5078 5079 static if (Sizes.length == 2) 5080 { 5081 public bool skip(Range)(ref Range inp) const 5082 if (isRandomAccessRange!Range && is(ElementType!Range : wchar) && 5083 !isDynamicArray!Range) 5084 { 5085 enum mode = Mode.alwaysSkip; 5086 assert(!inp.empty); 5087 immutable ch = inp[0]; 5088 static if (sizeFlags & 1) 5089 { 5090 if (ch < 0x80) 5091 { 5092 inp.popFront(); 5093 return ascii[ch]; 5094 } 5095 else 5096 return lookupUni!mode(inp); 5097 } 5098 else 5099 return lookupUni!mode(inp); 5100 } 5101 } 5102 5103 public bool test(Range)(ref Range 
/*
    UTF-16 matcher implementation, parameterized by the code-unit lengths
    (`Sizes`, each 1 or 2) it is able to match.

    `sizeFlags` is the bitwise OR of the requested sizes: bit 0 (value 1)
    enables the single-unit tries (`ascii`, `bmp`), bit 1 (value 2) enables
    the surrogate-pair trie (`uni`). The public `match`/`skip`/`test`
    front-ends are mixed in from `DefMatcher` and delegate to `lookupUni`.
*/
struct Impl(Sizes...)
if (Sizes.length >= 1 && Sizes.length <= 2)
{
private:
    import std.meta : allSatisfy;
    static assert(allSatisfy!(validSize, Sizes),
        "Only lengths of 1 and 2 code units are possible in UTF-16");
    // combine the requested sizes into a single bit mask
    static if (Sizes.length > 1)
        enum sizeFlags = Sizes[0] | Sizes[1];
    else
        enum sizeFlags = Sizes[0];

    // tries for code points that occupy one code unit
    static if (sizeFlags & 1)
    {
        Ascii ascii;
        Bmp bmp;
    }
    // trie for code points that occupy two code units (surrogate pairs)
    static if (sizeFlags & 2)
    {
        Uni uni;
    }
    mixin DefMatcher;

    // Returns a `CherryPick` view that holds a pointer to this matcher.
    package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)()
    {
        return CherryPick!(Impl, SizesToPick)(&this);
    }

    /*
        Looks up the code point at the front of `inp` in the BMP or
        surrogate-pair trie.

        `mode` controls consumption of the input:
        `Mode.alwaysSkip` pops the code unit(s) before the lookup,
        `Mode.skipOnMatch` pops them only when the lookup succeeds,
        any other mode leaves `inp` untouched.

        Calls `badEncoding()` for a lone low surrogate, for a high surrogate
        with fewer than 2 code units left, or when the second unit is not a
        low surrogate. NOTE(review): `badEncoding` is defined outside this
        block; presumably it throws - confirm.

        Returns `false` without consulting any trie when the required size
        is not enabled in `sizeFlags`.
    */
    bool lookupUni(Mode mode, Range)(ref Range inp) const
    {
        // x is in [0, 0x3FF] exactly when inp[0] is in [0xD800, 0xDBFF]
        wchar x = cast(wchar)(inp[0] - 0xD800);
        //not a high surrogate
        if (x > 0x3FF)
        {
            //low surrogate
            if (x <= 0x7FF) badEncoding();
            static if (sizeFlags & 1)
            {
                // remember the unit, it may be popped before the lookup
                auto ch = inp[0];
                static if (mode == Mode.alwaysSkip)
                    inp.popFront();
                static if (mode == Mode.skipOnMatch)
                {
                    if (bmp[ch])
                    {
                        inp.popFront();
                        return true;
                    }
                    else
                        return false;
                }
                else
                    return bmp[ch];
            }
            else //skip is not available for sub-matchers, so just false
                return false;
        }
        else
        {
            import std.range : popFrontN;
            static if (sizeFlags & 2)
            {
                if (inp.length < 2)
                    badEncoding();
                // y is in [0, 0x3FF] exactly when inp[1] is in [0xDC00, 0xDFFF]
                wchar y = cast(wchar)(inp[1] - 0xDC00);
                //not a low surrogate
                if (y > 0x3FF)
                    badEncoding();
                // trie key: the low 10 bits of each unit
                wchar[2] needle = [inp[0] & 0x3ff, inp[1] & 0x3ff];
                static if (mode == Mode.alwaysSkip)
                    inp.popFrontN(2);
                static if (mode == Mode.skipOnMatch)
                {
                    if (uni[needle])
                    {
                        inp.popFrontN(2);
                        return true;
                    }
                    else
                        return false;
                }
                else
                    return uni[needle];
            }
            else //ditto
                return false;
        }
    }
}
/*
    Wraps the code-unit array `s` into a random-access range that walks
    forward by bumping an index (`idx`, starting at `offset`) instead of
    re-slicing the array on every `popFront`.
*/
package(std) auto decoder(C)(C[] s, size_t offset=0)
if (is(C : wchar) || is(C : char))
{
    static struct Decoder
    {
    pure nothrow:
        C[] str;     // underlying code units; shrinks only from the back
        size_t idx;  // position of the current front inside `str`

        @property bool empty()
        {
            return idx == str.length;
        }

        @property size_t length()
        {
            return str.length - idx;
        }

        alias opDollar = length;

        @property C front()
        {
            return str[idx];
        }

        @property C back()
        {
            return str[$-1];
        }

        void popFront()
        {
            idx++;
        }

        void popFrontN(size_t count)
        {
            idx += count;
        }

        void popBack()
        {
            str = str[0..$-1];
        }

        @property auto save()
        {
            return this;
        }

        // indices are relative to the current front
        auto opIndex(size_t at)
        {
            return str[idx+at];
        }

        // keeps the prefix of `str` so that the new range only moves `idx`
        auto opSlice(size_t lo, size_t hi)
        {
            return Decoder(str[0 .. idx+hi], idx+lo);
        }
    }
    static assert(isRandomAccessRange!Decoder);
    static assert(is(ElementType!Decoder : C));
    return Decoder(s, offset);
}
���������������� ������������"; 5311 auto codec = rs.decoder; 5312 auto utf8 = utf8Matcher(unicode.Letter); 5313 auto asc = utf8.subMatcher!(1); 5314 auto uni = utf8.subMatcher!(2,3,4); 5315 assert(asc.test(codec)); 5316 assert(!uni.match(codec)); 5317 assert(utf8.skip(codec)); 5318 assert(codec.idx == 1); 5319 5320 assert(!uni.match(codec)); 5321 assert(asc.test(codec)); 5322 assert(utf8.skip(codec)); 5323 assert(codec.idx == 2); 5324 assert(!asc.match(codec)); 5325 5326 assert(!utf8.test(codec)); 5327 assert(!utf8.skip(codec)); 5328 5329 assert(!asc.test(codec)); 5330 assert(!utf8.test(codec)); 5331 assert(!utf8.skip(codec)); 5332 assert(utf8.test(codec)); 5333 foreach (i; 0 .. 7) 5334 { 5335 assert(!asc.test(codec)); 5336 assert(uni.test(codec)); 5337 assert(utf8.skip(codec)); 5338 } 5339 assert(!utf8.test(codec)); 5340 assert(!utf8.skip(codec)); 5341 //the same with match where applicable 5342 codec = rs.decoder; 5343 assert(utf8.match(codec)); 5344 assert(codec.idx == 1); 5345 assert(utf8.match(codec)); 5346 assert(codec.idx == 2); 5347 assert(!utf8.match(codec)); 5348 assert(codec.idx == 2); 5349 assert(!utf8.skip(codec)); 5350 assert(!utf8.skip(codec)); 5351 5352 foreach (i; 0 .. 
7) 5353 { 5354 assert(!asc.test(codec)); 5355 assert(utf8.test(codec)); 5356 assert(utf8.match(codec)); 5357 } 5358 auto i = codec.idx; 5359 assert(!utf8.match(codec)); 5360 assert(codec.idx == i); 5361} 5362 5363pure @safe unittest 5364{ 5365 import std.range : stride; 5366 static bool testAll(Matcher, Range)(ref Matcher m, ref Range r) @safe 5367 { 5368 bool t = m.test(r); 5369 auto save = r.idx; 5370 assert(t == m.match(r)); 5371 assert(r.idx == save || t); //ether no change or was match 5372 r.idx = save; 5373 static if (is(typeof(m.skip(r)))) 5374 { 5375 assert(t == m.skip(r)); 5376 assert(r.idx != save); //always changed 5377 r.idx = save; 5378 } 5379 return t; 5380 } 5381 auto utf16 = utfMatcher!wchar(unicode.L); 5382 auto bmp = utf16.subMatcher!1; 5383 auto nonBmp = utf16.subMatcher!1; 5384 auto utf8 = utfMatcher!char(unicode.L); 5385 auto ascii = utf8.subMatcher!1; 5386 auto uni2 = utf8.subMatcher!2; 5387 auto uni3 = utf8.subMatcher!3; 5388 auto uni24 = utf8.subMatcher!(2,4); 5389 foreach (ch; unicode.L.byCodepoint.stride(3)) 5390 { 5391 import std.utf : encode; 5392 char[4] buf; 5393 wchar[2] buf16; 5394 auto len = encode(buf, ch); 5395 auto len16 = encode(buf16, ch); 5396 auto c8 = buf[0 .. len].decoder; 5397 auto c16 = buf16[0 .. 
// Cover decode failure cases of the matchers: every input below must make
// `test` throw (collectException returns a non-null exception).
pure @safe unittest
{
    import std.algorithm.iteration : map;
    import std.exception : collectException;
    import std.format : format;
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto utf8 = utfMatcher!char(unicode.L);
    //decode failure cases UTF-8
    // NOTE(review): `\0x00` / `\0x80` below are an octal `\0` escape followed
    // by the literal text "x00" / "x80", not a hex escape; the literals are
    // kept as-is because the expectations were written against these bytes.
    alias fails8 = AliasSeq!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79",
        "\xFF\x00\0x00\0x00\x00", "\xC0\0x80\0x80\x80", "\x80\0x00\0x00\x00",
        "\xCF\x00\0x00\0x00\x00");
    foreach (msg; fails8)
    {
        // on failure report the offending bytes in hex
        assert(collectException((){
            auto s = msg; // `test` takes its argument by ref
            utf8.test(s);
        }()), format("%( %2x %)", cast(immutable(ubyte)[]) msg));
    }
    //decode failure cases UTF-16
    // a lone high surrogate and a lone low surrogate
    alias fails16 = AliasSeq!([0xD811], [0xDC02]);
    foreach (msg; fails16)
    {
        assert(collectException((){
            auto s = msg.map!(x => cast(wchar) x);
            utf16.test(s);
        }()));
    }
}
/**
    $(P Builds a `Trie` with typically optimal speed-size trade-off
    for `set` and returns a delegate of the following type:
    $(D bool delegate(dchar ch)) that tests membership in it. )

    $(P The result is a 'tester' lambda suitable
    for algorithms like std.algorithm.find that take unary predicates. )

    See the $(S_LINK Synopsis, Synopsis) section for example.
*/
public auto toDelegate(Set)(Set set)
if (isCodepointSet!Set)
{
    // a 3-level trie is very small and almost as fast as a 2-level one
    auto trie = toTrie!3(set);
    return delegate(dchar ch) { return trie[ch]; };
}
/**
    Evaluates to `true` if `T` is some instantiation of
    $(LREF BitPacked)!(U, x) (and thus suitable for packing),
    `false` for every other type.
*/
template isBitPacked(T)
{
    // pattern-match T against BitPacked!(U, bits) for any U and bits
    enum isBitPacked = is(T == BitPacked!(U, bits), U, size_t bits);
}
/*
    Callable type that extracts bits [from, to) of an integral value,
    shifted down so that bit `from` becomes bit 0.
    It exposes `bitSize` (= to - from) itself, so it can be used directly
    as an index functor of the `Trie` template.
*/
struct sliceBits(size_t from, size_t to)
{
    //for now bypass assumeSize, DMD has trouble inlining it
    enum bitSize = to-from;

    static auto opCall(T)(T x)
    out(result)
    {
        // the extracted value must fit in bitSize bits
        assert(result < (1 << bitSize));
    }
    do
    {
        static assert(from < to);
        enum mask = (1 << bitSize)-1;
        static if (from != 0)
            return (x >> from) & mask;
        else
            return x & mask; // nothing to shift away
    }
}
t.table.length!i]); 5650 } 5651 writeln("---------------------------"); 5652 } 5653 } 5654 //@@@BUG link failure, lambdas not found by linker somehow (in case of trie2) 5655 // alias lo8 = assumeSize!(8, function (uint x) { return x&0xFF; }); 5656 // alias next8 = assumeSize!(7, function (uint x) { return (x&0x7F00)>>8; }); 5657 alias Set = CodepointSet; 5658 auto set = Set('A','Z','a','z'); 5659 auto trie = buildTrie!(bool, uint, 256, lo8)(set.byInterval);// simple bool array 5660 for (int a='a'; a<'z';a++) 5661 assert(trie[a]); 5662 for (int a='A'; a<'Z';a++) 5663 assert(trie[a]); 5664 for (int a=0; a<'A'; a++) 5665 assert(!trie[a]); 5666 for (int a ='Z'; a<'a'; a++) 5667 assert(!trie[a]); 5668 trieStats(trie); 5669 5670 auto redundant2 = Set( 5671 1, 18, 256+2, 256+111, 512+1, 512+18, 768+2, 768+111); 5672 auto trie2 = buildTrie!(bool, uint, 1024, mlo8, lo8)(redundant2.byInterval); 5673 trieStats(trie2); 5674 foreach (e; redundant2.byCodepoint) 5675 assert(trie2[e], text(cast(uint) e, " - ", trie2[e])); 5676 foreach (i; 0 .. 1024) 5677 { 5678 assert(trie2[i] == (i in redundant2)); 5679 } 5680 5681 5682 auto redundant3 = Set( 5683 2, 4, 6, 8, 16, 5684 2+16, 4+16, 16+6, 16+8, 16+16, 5685 2+32, 4+32, 32+6, 32+8, 5686 ); 5687 5688 enum max3 = 256; 5689 // sliceBits 5690 auto trie3 = buildTrie!(bool, uint, max3, 5691 sliceBits!(6,8), sliceBits!(4,6), sliceBits!(0,4) 5692 )(redundant3.byInterval); 5693 trieStats(trie3); 5694 foreach (i; 0 .. max3) 5695 assert(trie3[i] == (i in redundant3), text(cast(uint) i)); 5696 5697 auto redundant4 = Set( 5698 10, 64, 64+10, 128, 128+10, 256, 256+10, 512, 5699 1000, 2000, 3000, 4000, 5000, 6000 5700 ); 5701 enum max4 = 2^^16; 5702 auto trie4 = buildTrie!(bool, size_t, max4, 5703 sliceBits!(13, 16), sliceBits!(9, 13), sliceBits!(6, 9) , sliceBits!(0, 6) 5704 )(redundant4.byInterval); 5705 foreach (i; 0 .. 
/*
    Sum of `bitSizeOf` over every element of `Prefix`;
    0 for an empty sequence.
*/
template fullBitSize(Prefix...)
{
    static if (Prefix.length == 0)
        enum fullBitSize = 0;
    else
    {
        // head contributes its own width, the tail is handled recursively
        enum fullBitSize = bitSizeOf!(Prefix[0])+fullBitSize!(Prefix[1..$]);
    }
}
/*
    Appends `val` to `arr` using a 1-, 2- or 3-byte variable-length encoding:

    val < 128      : one byte, the value itself (top bit clear)
    val < (1 << 13): 0b100 tag in the top 3 bits + 5 high bits, then 1 byte
    otherwise      : 0b101 tag in the top 3 bits + 5 high bits, then 2 bytes
                     (val is asserted to be below 1 << 21)
*/
@safe void compressTo(uint val, ref scope ubyte[] arr) pure nothrow
{
    // not optimized as usually done 1 time (and not public interface)
    enum ubyte twoByteTag = 0b1_00 << 5;
    enum ubyte threeByteTag = 0b1_01 << 5;

    if (val < 128)
    {
        arr ~= cast(ubyte) val;
        return;
    }
    if (val < (1 << 13))
    {
        arr ~= twoByteTag | cast(ubyte)(val >> 8);
        arr ~= val & 0xFF;
        return;
    }
    assert(val < (1 << 21));
    arr ~= threeByteTag | cast(ubyte)(val >> 16);
    arr ~= (val >> 8) & 0xFF;
    arr ~= val & 0xFF;
}
/*
    Input/forward range of `CodepointInterval` that decodes a compressed
    byte stream on demand.

    Every stored value is a delta from the previously decoded boundary.
    A trailing start value without a matching end means the interval runs
    up to `lastDchar+1`. Exhaustion is signalled by `_idx == size_t.max`.
*/
@safe struct DecompressedIntervals
{
pure:
    const(ubyte)[] _stream;   // compressed data
    size_t _idx;              // read position, size_t.max once exhausted
    CodepointInterval _front; // most recently decoded interval

    this(const(ubyte)[] stream)
    {
        _stream = stream;
        popFront(); // prime _front
    }

    @property bool empty() const
    {
        return _idx == size_t.max;
    }

    @property CodepointInterval front()
    {
        assert(!empty);
        return _front;
    }

    void popFront()
    {
        immutable consumedAll = _idx == _stream.length;
        if (consumedAll)
        {
            _idx = size_t.max;
            return;
        }
        // interval start is relative to the end of the previous interval
        immutable uint lower = _front[1] + decompressFrom(_stream, _idx);
        _front[0] = lower;
        if (_idx != _stream.length)
            _front[1] = lower + decompressFrom(_stream, _idx);
        else // odd length ---> till the end
            _front[1] = lastDchar+1;
    }

    @property DecompressedIntervals save() return scope { return this; }
}
/*
    Loads the code point set of the Unicode property `name` into `target`.

    Cumulative General_Category values (L, LC, M, N, P, S, Z, C) and the
    pseudo-properties "graphical", "any" and "ascii" are assembled by hand
    from the generated tables; every other name is looked up in
    `uniProps.tab` via `loadUnicodeSet`. Names are compared with
    `comparePropertyName`.

    "C" / "Other" is the union of Cc, Cf, Cs, Co and Cn as defined by the
    General_Category value table of UAX #44. (It used to be assembled from
    Co, Lo, No, So and Po, i.e. the "other" sub-categories of unrelated
    classes.)

    Params:
        name = property name or alias
        target = receives the resulting set on success
    Returns:
        `true` if the property was recognized, otherwise the result of
        the `loadUnicodeSet` lookup.
*/
bool loadProperty(Set=CodepointSet, C)
    (const scope C[] name, ref Set target) pure
{
    import std.internal.unicode_tables : uniProps; // generated file
    alias ucmp = comparePropertyName;
    // conjure cumulative properties by hand
    if (ucmp(name, "L") == 0 || ucmp(name, "Letter") == 0)
    {
        target = asSet(uniProps.Lu);
        target |= asSet(uniProps.Ll);
        target |= asSet(uniProps.Lt);
        target |= asSet(uniProps.Lo);
        target |= asSet(uniProps.Lm);
    }
    else if (ucmp(name,"LC") == 0 || ucmp(name,"Cased Letter")==0)
    {
        target = asSet(uniProps.Ll);
        target |= asSet(uniProps.Lu);
        target |= asSet(uniProps.Lt);// Title case
    }
    else if (ucmp(name, "M") == 0 || ucmp(name, "Mark") == 0)
    {
        target = asSet(uniProps.Mn);
        target |= asSet(uniProps.Mc);
        target |= asSet(uniProps.Me);
    }
    else if (ucmp(name, "N") == 0 || ucmp(name, "Number") == 0)
    {
        target = asSet(uniProps.Nd);
        target |= asSet(uniProps.Nl);
        target |= asSet(uniProps.No);
    }
    else if (ucmp(name, "P") == 0 || ucmp(name, "Punctuation") == 0)
    {
        target = asSet(uniProps.Pc);
        target |= asSet(uniProps.Pd);
        target |= asSet(uniProps.Ps);
        target |= asSet(uniProps.Pe);
        target |= asSet(uniProps.Pi);
        target |= asSet(uniProps.Pf);
        target |= asSet(uniProps.Po);
    }
    else if (ucmp(name, "S") == 0 || ucmp(name, "Symbol") == 0)
    {
        target = asSet(uniProps.Sm);
        target |= asSet(uniProps.Sc);
        target |= asSet(uniProps.Sk);
        target |= asSet(uniProps.So);
    }
    else if (ucmp(name, "Z") == 0 || ucmp(name, "Separator") == 0)
    {
        target = asSet(uniProps.Zs);
        target |= asSet(uniProps.Zl);
        target |= asSet(uniProps.Zp);
    }
    else if (ucmp(name, "C") == 0 || ucmp(name, "Other") == 0)
    {
        // General_Category C: control, format, surrogate,
        // private use and unassigned
        target = asSet(uniProps.Cc);
        target |= asSet(uniProps.Cf);
        target |= asSet(uniProps.Cs);
        target |= asSet(uniProps.Co);
        target |= asSet(uniProps.Cn);
    }
    else if (ucmp(name, "graphical") == 0)
    {
        target = asSet(uniProps.Alphabetic);

        target |= asSet(uniProps.Mn);
        target |= asSet(uniProps.Mc);
        target |= asSet(uniProps.Me);

        target |= asSet(uniProps.Nd);
        target |= asSet(uniProps.Nl);
        target |= asSet(uniProps.No);

        target |= asSet(uniProps.Pc);
        target |= asSet(uniProps.Pd);
        target |= asSet(uniProps.Ps);
        target |= asSet(uniProps.Pe);
        target |= asSet(uniProps.Pi);
        target |= asSet(uniProps.Pf);
        target |= asSet(uniProps.Po);

        target |= asSet(uniProps.Zs);

        target |= asSet(uniProps.Sm);
        target |= asSet(uniProps.Sc);
        target |= asSet(uniProps.Sk);
        target |= asSet(uniProps.So);
    }
    else if (ucmp(name, "any") == 0)
        target = Set.fromIntervals(0, 0x110000);
    else if (ucmp(name, "ascii") == 0)
        target = Set.fromIntervals(0, 0x80);
    else
        return loadUnicodeSet!(uniProps.tab)(name, target);
    return true;
}
/*
    Evaluates the D expression `expr` (mixed in as code) and caches the
    result in a function-local `static` slot, so that at run time the
    expression is computed only on the first call of each instantiation.
    During CTFE the expression is simply evaluated and nothing is cached.
*/
package(std) CodepointSet memoizeExpr(string expr)()
{
    if (__ctfe)
        return mixin(expr);

    alias T = typeof(mixin(expr));
    static bool initialized;
    static T slot;

    // fast path: the value has already been computed
    if (initialized)
        return slot;

    slot = mixin(expr);
    initialized = true;
    return slot;
}
@trusted T pop() 6157 { 6158 assert(!empty); 6159 auto val = data[$ - 1]; 6160 data = data[0 .. $ - 1]; 6161 if (!__ctfe) 6162 cast(void) data.assumeSafeAppend(); 6163 return val; 6164 } 6165 6166 @property ref T top() 6167 { 6168 assert(!empty); 6169 return data[$ - 1]; 6170 } 6171} 6172 6173//test if a given string starts with hex number of maxDigit that's a valid codepoint 6174//returns it's value and skips these maxDigit chars on success, throws on failure 6175package(std) dchar parseUniHex(Range)(ref Range str, size_t maxDigit) 6176{ 6177 import std.exception : enforce; 6178 //std.conv.parse is both @system and bogus 6179 uint val; 6180 for (int k = 0; k < maxDigit; k++) 6181 { 6182 enforce(!str.empty, "incomplete escape sequence"); 6183 //accepts ascii only, so it's OK to index directly 6184 immutable current = str.front; 6185 if ('0' <= current && current <= '9') 6186 val = val * 16 + current - '0'; 6187 else if ('a' <= current && current <= 'f') 6188 val = val * 16 + current -'a' + 10; 6189 else if ('A' <= current && current <= 'F') 6190 val = val * 16 + current - 'A' + 10; 6191 else 6192 throw new Exception("invalid escape sequence"); 6193 str.popFront(); 6194 } 6195 enforce(val <= 0x10FFFF, "invalid codepoint"); 6196 return val; 6197} 6198 6199@safe unittest 6200{ 6201 import std.algorithm.searching : canFind; 6202 import std.exception : collectException; 6203 string[] non_hex = [ "000j", "000z", "FffG", "0Z"]; 6204 string[] hex = [ "01", "ff", "00af", "10FFFF" ]; 6205 int[] value = [ 1, 0xFF, 0xAF, 0x10FFFF ]; 6206 foreach (v; non_hex) 6207 assert(collectException(parseUniHex(v, v.length)).msg 6208 .canFind("invalid escape sequence")); 6209 foreach (i, v; hex) 6210 assert(parseUniHex(v, v.length) == value[i]); 6211 string over = "0011FFFF"; 6212 assert(collectException(parseUniHex(over, over.length)).msg 6213 .canFind("invalid codepoint")); 6214} 6215 6216auto caseEnclose(CodepointSet set) 6217{ 6218 auto cased = set & unicode.LC; 6219 foreach (dchar ch; 
cased.byCodepoint)
    {
        foreach (c; simpleCaseFoldings(ch))
            set |= c;
    }
    return set;
}

/+
    fetch codepoint set corresponding to a name (InBlock or binary property)
+/
CodepointSet getUnicodeSet(const scope char[] name, bool negated, bool casefold) @safe
{
    CodepointSet s = unicode(name);
    //FIXME: caseEnclose for new uni as Set | CaseEnclose(SET && LC)
    if (casefold)
       s = caseEnclose(s);
    if (negated)
        s = s.inverted;
    return s;
}

// Parser for the regex-style '[...]' set notation. `range` is the input being
// consumed; with `casefold_` set, every character or interval added to a set
// is added together with its simple case foldings.
struct UnicodeSetParser(Range)
{
    import std.exception : enforce;
    import std.typecons : tuple, Tuple;
    Range range;
    bool casefold_;

    @property bool empty(){ return range.empty; }
    @property dchar front(){ return range.front; }
    void popFront(){ range.popFront(); }

    //CodepointSet operations relatively in order of priority
    enum Operator:uint {
        Open = 0, Negate, Difference, SymDifference, Intersection, Union, None
    }

    //parse unit of CodepointSet spec, most notably escape sequences and char ranges
    //also fetches next set operation
    Tuple!(CodepointSet,Operator) parseCharTerm()
    {
        import std.range : drop;
        enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD';
        enum State{ Start, Char, Escape, CharDash, CharDashEscape,
            PotentialTwinSymbolOperator }
        Operator op = Operator.None;
        dchar last;
        CodepointSet set;
        State state = State.Start;

        // add a single code point, honouring casefold_
        void addWithFlags(ref CodepointSet set, uint ch)
        {
            if (casefold_)
            {
                auto range = simpleCaseFoldings(ch);
                foreach (v; range)
                    set |= v;
            }
            else
                set |= ch;
        }

        // add the inclusive interval [lo, hi], honouring casefold_;
        // shared by plain (a-z) and escaped (a-\x7a) interval ends so that
        // both spellings fold case the same way
        void addRangeWithFlags(ref CodepointSet set, uint lo, uint hi)
        {
            if (casefold_)
            {
                for (uint ch = lo; ch <= hi; ch++)
                    addWithFlags(set, ch);
            }
            else
                set.add(lo, hi + 1);
        }

        static Operator twinSymbolOperator(dchar symbol)
        {
            switch (symbol)
            {
            case '|':
                return Operator.Union;
            case '-':
                return Operator.Difference;
            case '~':
                return Operator.SymDifference;
            case '&':
                return Operator.Intersection;
            default:
                assert(false);
            }
        }

        L_CharTermLoop:
        for (;;)
        {
            final switch (state)
            {
            case State.Start:
                switch (front)
                {
                case '|':
                case '-':
                case '~':
                case '&':
                    state = State.PotentialTwinSymbolOperator;
                    last = front;
                    break;
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    break L_CharTermLoop;
                case '\\':
                    state = State.Escape;
                    break;
                default:
                    state = State.Char;
                    last = front;
                }
                break;
            case State.Char:
                // xxx last front xxx
                switch (front)
                {
                case '|':
                case '~':
                case '&':
                    // then last is treated as normal char and added as implicit union
                    state = State.PotentialTwinSymbolOperator;
                    addWithFlags(set, last);
                    last = front;
                    break;
                case '-': // still need more info
                    state = State.CharDash;
                    break;
                case '\\':
                    // flush the pending char like every other branch does,
                    // i.e. including its case foldings when casefold_ is set
                    addWithFlags(set, last);
                    state = State.Escape;
                    break;
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    addWithFlags(set, last);
                    break L_CharTermLoop;
                default:
                    state = State.Char;
                    addWithFlags(set, last);
                    last = front;
                }
                break;
            case State.PotentialTwinSymbolOperator:
                // xxx last front xxxx
                // where last = [|-&~]
                if (front == last)
                {
                    op = twinSymbolOperator(last);
                    popFront();//skip second twin char
                    break L_CharTermLoop;
                }
                goto case State.Char;
            case State.Escape:
                // xxx \ front xxx
                switch (front)
                {
                case 'f':
                    last = '\f';
                    state = State.Char;
                    break;
                case 'n':
                    last = '\n';
                    state = State.Char;
                    break;
                case 'r':
                    last = '\r';
                    state = State.Char;
                    break;
                case 't':
                    last = '\t';
                    state = State.Char;
                    break;
                case 'v':
                    last = '\v';
                    state = State.Char;
                    break;
                case 'c':
                    last = unicode.parseControlCode(this);
                    state = State.Char;
                    break;
                foreach (val; Escapables)
                {
                case val:
                }
                    last = front;
                    state = State.Char;
                    break;
                case 'p':
                    set.add(unicode.parsePropertySpec(this, false, casefold_));
                    state = State.Start;
                    continue L_CharTermLoop; //next char already fetched
                case 'P':
                    set.add(unicode.parsePropertySpec(this, true, casefold_));
                    state = State.Start;
                    continue L_CharTermLoop; //next char already fetched
                case 'x':
                    popFront();
                    last = parseUniHex(this, 2);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'u':
                    popFront();
                    last = parseUniHex(this, 4);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'U':
                    popFront();
                    last = parseUniHex(this, 8);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'd':
                    set.add(unicode.Nd);
                    state = State.Start;
                    break;
                case 'D':
                    set.add(unicode.Nd.inverted);
                    state = State.Start;
                    break;
                case 's':
                    set.add(unicode.White_Space);
                    state = State.Start;
                    break;
                case 'S':
                    set.add(unicode.White_Space.inverted);
                    state = State.Start;
                    break;
                case 'w':
                    set.add(wordCharacter);
                    state = State.Start;
                    break;
                case 'W':
                    set.add(wordCharacter.inverted);
                    state = State.Start;
                    break;
                default:
                    if (front >= privateUseStart && front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                    enforce(false, "invalid escape sequence");
                }
                break;
            case State.CharDash:
                // xxx last - front xxx
                switch (front)
                {
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    //means dash is a single char not an interval specifier
                    addWithFlags(set, last);
                    addWithFlags(set, '-');
                    break L_CharTermLoop;
                case '-'://set Difference again
                    addWithFlags(set, last);
                    op = Operator.Difference;
                    popFront();//skip '-'
                    break L_CharTermLoop;
                case '\\':
                    state = State.CharDashEscape;
                    break;
                default:
                    enforce(last <= front, "inverted range");
                    addRangeWithFlags(set, last, front);
                    state = State.Start;
                }
                break;
            case State.CharDashEscape:
            //xxx last - \ front xxx
                uint end;
                switch (front)
                {
                case 'f':
                    end = '\f';
                    break;
                case 'n':
                    end = '\n';
                    break;
                case 'r':
                    end = '\r';
                    break;
                case 't':
                    end = '\t';
                    break;
                case 'v':
                    end = '\v';
                    break;
                foreach (val; Escapables)
                {
                case val:
                }
                    end = front;
                    break;
                case 'c':
                    end = unicode.parseControlCode(this);
                    break;
                case 'x':
                    popFront();
                    end = parseUniHex(this, 2);
                    enforce(last <= end,"inverted range");
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                case 'u':
                    popFront();
                    end = parseUniHex(this, 4);
                    enforce(last <= end,"inverted range");
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                case 'U':
                    popFront();
                    end = parseUniHex(this, 8);
                    enforce(last <= end,"inverted range");
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                default:
                    if (front >= privateUseStart && front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                    enforce(false, "invalid escape sequence");
                }
                // Lookahead to check if it's a \T
                // where T is sub-pattern terminator in multi-pattern scheme
                auto lookahead = range.save.drop(1);
                if (end == '\\' && !lookahead.empty)
                {
                    if (lookahead.front >= privateUseStart && lookahead.front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                }
                enforce(last <= end,"inverted range");
                addRangeWithFlags(set, last, end);
                state = State.Start;
                break;
            }
            popFront();
            enforce(!empty, "unexpected end of CodepointSet");
        }
        return tuple(set, op);
    }

    alias ValStack = Stack!(CodepointSet);
    alias OpStack = Stack!(Operator);

    // Parses a complete, possibly nested, '[...]' expression starting at the
    // current position and returns the resulting set.
    CodepointSet parseSet()
    {
        ValStack vstack;
        OpStack opstack;
        import std.functional : unaryFun;
        enforce(!empty, "unexpected end of input");
        enforce(front == '[', "expected '[' at the start of unicode set");
        //
        // applies `op` to the top of `stack`; false for a non-applicable operator
        static bool apply(Operator op, ref ValStack stack)
        {
            switch (op)
            {
            case Operator.Negate:
                enforce(!stack.empty, "no operand for '^'");
                stack.top = stack.top.inverted;
                break;
            case Operator.Union:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '||'");
                stack.top.add(s);
                break;
            case Operator.Difference:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '--'");
                stack.top.sub(s);
                break;
            case Operator.SymDifference:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '~~'");
                stack.top ~= s;
                break;
            case Operator.Intersection:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '&&'");
                stack.top.intersect(s);
                break;
            default:
                return false;
            }
            return true;
        }
        // pops and applies operators for as long as `cond` holds for the top one
        static bool unrollWhile(alias cond)(ref ValStack vstack, ref OpStack opstack)
        {
            while (cond(opstack.top))
            {
                if (!apply(opstack.pop(),vstack))
                    return false;//syntax error
                if (opstack.empty)
                    return false;
            }
            return true;
        }

        L_CharsetLoop:
        do
        {
            switch (front)
            {
            case '[':
                opstack.push(Operator.Open);
                popFront();
                enforce(!empty, "unexpected end of character class");
                if (front == '^')
                {
                    opstack.push(Operator.Negate);
                    popFront();
                    enforce(!empty, "unexpected end of character class");
                }
                else if (front == ']') // []...] is special cased
                {
                    popFront();
                    enforce(!empty, "wrong character set");
                    auto pair = parseCharTerm();
                    pair[0].add(']', ']'+1);
                    if (pair[1] != Operator.None)
                    {
                        if (opstack.top == Operator.Union)
                            unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                        opstack.push(pair[1]);
                    }
                    vstack.push(pair[0]);
                }
                break;
            case ']':
                enforce(unrollWhile!(unaryFun!"a != a.Open")(vstack, opstack),
                    "character class syntax error");
                enforce(!opstack.empty, "unmatched ']'");
                opstack.pop();
                popFront();
                if (opstack.empty)
                    break L_CharsetLoop;
                auto pair = parseCharTerm();
                if (!pair[0].empty)//not only operator e.g. -- or ~~
                {
                    vstack.top.add(pair[0]);//apply union
                }
                if (pair[1] != Operator.None)
                {
                    if (opstack.top == Operator.Union)
                        unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                    opstack.push(pair[1]);
                }
                break;
            //
            default://yet another pair of term(op)?
                auto pair = parseCharTerm();
                if (pair[1] != Operator.None)
                {
                    if (opstack.top == Operator.Union)
                        unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                    opstack.push(pair[1]);
                }
                vstack.push(pair[0]);
            }

        }while (!empty || !opstack.empty);
        while (!opstack.empty)
            apply(opstack.pop(),vstack);
        assert(vstack.length == 1);
        return vstack.top;
    }
}

/**
    A single entry point to lookup Unicode $(CODEPOINT) sets by name or alias of
    a block, script or general category.

    It uses well defined standard rules of property name lookup.
This includes fuzzy matching of names, so that
    'White_Space', 'white-SpAce' and 'whitespace' are all considered equal
    and yield the same set of white space $(CHARACTERS).
*/
@safe public struct unicode
{
    import std.exception : enforce;
    /**
        Performs the lookup of set of $(CODEPOINTS)
        with compile-time correctness checking.
        This short-cut version combines 3 searches:
        across blocks, scripts, and common binary properties.

        Note that since scripts and blocks overlap the
        usual trick to disambiguate is used - to get a block use
        `unicode.InBlockName`, to search a script
        use `unicode.ScriptName`.

        See_Also: $(LREF block), $(LREF script)
        and (not included in this search) $(LREF hangulSyllableType).
    */

    static @property auto opDispatch(string name)() pure
    {
        static if (findAny(name))
            return loadAny(name);
        else
            static assert(false, "No unicode set by name "~name~" was found.");
    }

    ///
    @safe unittest
    {
        import std.exception : collectException;
        auto ascii = unicode.ASCII;
        assert(ascii['A']);
        assert(ascii['~']);
        assert(!ascii['\u00e0']);
        // matching is case-insensitive
        assert(ascii == unicode.ascII);
        assert(!ascii['\u00E0']); // U+00E0, latin small letter a with grave
        // underscores, '-' and whitespace in names are ignored too
        auto latin = unicode.in_latin1_Supplement;
        assert(latin['\u00E0']);
        assert(!latin['$']);
        // BTW Latin 1 Supplement is a block, hence "In" prefix
        assert(latin == unicode("In Latin 1 Supplement"));
        // run-time look up throws if no such set is found
        assert(collectException(unicode("InCyrilliac")));
    }

    /**
        The same lookup across blocks, scripts, or binary properties,
        but performed at run-time.
        This version is provided for cases where `name`
        is not known beforehand; otherwise compile-time
        checked $(LREF opDispatch) is typically a better choice.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.
    */
    static auto opCall(C)(const scope C[] name)
    if (is(C : dchar))
    {
        return loadAny(name);
    }

    /**
        Narrows down the search for sets of $(CODEPOINTS) to all Unicode blocks.

        Note:
        Here block names are unambiguous as no scripts are searched
        and thus to search use simply `unicode.block.BlockName` notation.

        See $(S_LINK Unicode properties, table of properties) for available sets.
        See_Also: $(S_LINK Unicode properties, table of properties).
    */
    struct block
    {
        import std.internal.unicode_tables : blocks; // generated file
        mixin SetSearcher!(blocks.tab, "block");
    }

    ///
    @safe unittest
    {
        // use .block for explicitness
        assert(unicode.block.Greek_and_Coptic == unicode.InGreek_and_Coptic);
    }

    /**
        Narrows down the search for sets of $(CODEPOINTS) to all Unicode scripts.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.
    */
    struct script
    {
        import std.internal.unicode_tables : scripts; // generated file
        mixin SetSearcher!(scripts.tab, "script");
    }

    ///
    @safe unittest
    {
        auto arabicScript = unicode.script.arabic;
        auto arabicBlock = unicode.block.arabic;
        // there is an intersection between script and block
        assert(arabicBlock['\u0601']);
        assert(arabicScript['\u0601']);
        // but they are different
        assert(arabicBlock != arabicScript);
        assert(arabicBlock == unicode.inArabic);
        assert(arabicScript == unicode.arabic);
    }

    /**
        Fetch a set of $(CODEPOINTS) that have the given hangul syllable type.

        Other non-binary properties (once supported) follow the same
        notation - `unicode.propertyName.propertyValue` for compile-time
        checked access and `unicode.propertyName(propertyValue)`
        for run-time checked one.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.
    */
    struct hangulSyllableType
    {
        import std.internal.unicode_tables : hangul; // generated file
        mixin SetSearcher!(hangul.tab, "hangul syllable type");
    }

    ///
    @safe unittest
    {
        // L here is syllable type not Letter as in unicode.L short-cut
        auto leadingVowel = unicode.hangulSyllableType("L");
        // check that some leading vowels are present
        foreach (vowel; '\u1110'..'\u115F')
            assert(leadingVowel[vowel]);
        assert(leadingVowel == unicode.hangulSyllableType.L);
    }

    //parse control code of form \cXXX, c assumed to be the current symbol
    //the letter itself is left as the current symbol; its low 5 bits are returned
    static package(std) dchar parseControlCode(Parser)(ref Parser p)
    {
        with(p)
        {
            popFront();
            enforce(!empty, "Unfinished escape sequence");
            enforce(('a' <= front && front <= 'z')
                || ('A' <= front && front <= 'Z'),
            "Only letters are allowed after \\c");
            return front & 0x1f;
        }
    }

    //parse and return a CodepointSet for \p{...Property...} and \P{...Property..},
    //\ - assumed to be processed, p - is current
    //throws if the spec is unterminated, too long or names an unknown/empty set
    static package(std) CodepointSet parsePropertySpec(Range)(ref Range p,
        bool negated, bool casefold)
    {
        static import std.ascii;
        with(p)
        {
            enum MAX_PROPERTY = 128;
            char[MAX_PROPERTY] result;
            uint k = 0;
            popFront();
            enforce(!empty, "eof parsing unicode property spec");
            if (front == '{')
            {
                popFront();
                while (k < MAX_PROPERTY && !empty && front !='}'
                    && front !=':')
                {
                    if (front != '-' && front != ' ' && front != '_')
                        result[k++] = cast(char) std.ascii.toLower(front);
                    popFront();
                }
                enforce(k != MAX_PROPERTY, "invalid property name");
                // the loop above also stops on exhausted input, so `front`
                // may only be inspected after checking `empty`
                enforce(!empty && front == '}', "} expected ");
            }
            else
            {//single char properties e.g.: \pL, \pN ...
                enforce(front < 0x80, "invalid property name");
                result[k++] = cast(char) front;
            }
            auto s = getUnicodeSet(result[0 .. k], negated, casefold);
            enforce(!s.empty, "unrecognized unicode property spec");
            popFront();
            return s;
        }
    }

    /**
        Parse unicode codepoint set from given `range` using standard regex
        syntax '[...]'. The range is advanced skipping over regex set definition.
        `casefold` parameter determines if the set should be casefolded - that is
        include both lower and upper case versions for any letters in the set.
    */
    static CodepointSet parseSet(Range)(ref Range range, bool casefold=false)
    if (isInputRange!Range && is(ElementType!Range : dchar))
    {
        auto usParser = UnicodeSetParser!Range(range, casefold);
        auto set = usParser.parseSet();
        range = usParser.range;
        return set;
    }

    ///
    @safe unittest
    {
        import std.uni : unicode;
        string pat = "[a-zA-Z0-9]hello";
        auto set = unicode.parseSet(pat);
        // check some of the codepoints
        assert(set['a'] && set['A'] && set['9']);
        assert(pat == "hello");
    }

private:
    alias ucmp = comparePropertyName;

    // compile-time counterpart of loadAny: is there any set called `name`?
    static bool findAny(string name)
    {
        import std.internal.unicode_tables : blocks, scripts, uniProps; // generated file
        // the length guard mirrors loadAny and keeps the "In" prefix test
        // from slicing past the end of a short name
        return isPrettyPropertyName(name)
            || findSetName!(uniProps.tab)(name) || findSetName!(scripts.tab)(name)
            || (name.length > 2 && ucmp(name[0 .. 2],"In") == 0
                && findSetName!(blocks.tab)(name[2..$]));
    }

    // tries properties, then scripts, then ("In"-prefixed) blocks; throws if all fail
    static auto loadAny(Set=CodepointSet, C)(const scope C[] name) pure
    {
        import std.conv : to;
        import std.internal.unicode_tables : blocks, scripts; // generated file
        Set set;
        immutable loaded = loadProperty(name, set) || loadUnicodeSet!(scripts.tab)(name, set)
            || (name.length > 2 && ucmp(name[0 .. 2],"In") == 0
                && loadUnicodeSet!(blocks.tab)(name[2..$], set));
        if (loaded)
            return set;
        throw new Exception("No unicode set by name "~name.to!string()~" was found.");
    }

    // FIXME: re-disable once the compiler is fixed
    // Disabled to prevent the mistake of creating instances of this pseudo-struct.
    //@disable ~this();
}

@safe unittest
{
    import std.internal.unicode_tables : blocks, uniProps; // generated file
    assert(unicode("InHebrew") == asSet(blocks.Hebrew));
    assert(unicode("separator") == (asSet(uniProps.Zs) | asSet(uniProps.Zl) | asSet(uniProps.Zp)));
    assert(unicode("In-Kharoshthi") == asSet(blocks.Kharoshthi));
}

enum EMPTY_CASE_TRIE = ushort.max;// from what gen_uni uses internally

// control - '\r'
enum controlSwitch = `
    case '\u0000':..case '\u0008':case '\u000E':..case '\u001F':case '\u007F':..
case '\u0084':case '\u0086':..case '\u009F': case '\u0009':..case '\u000C': case '\u0085':
`;
// TODO: redo the most of hangul stuff algorithmically in case of Graphemes too
// kill unrolled switches

// True for code points in the range U+1F1E6 .. U+1F1FF (inclusive).
private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow
{
    return ch >= '\U0001F1E6' && ch <= '\U0001F1FF';
}

/*
    Pops one grapheme cluster off the front of `range`.
    With `getValue` set, the consumed code points are collected into a
    Grapheme which is returned; otherwise nothing is collected and the
    function returns void (graphemeStride uses that form to measure only).
*/
template genericDecodeGrapheme(bool getValue)
{
    alias graphemeExtend = graphemeExtendTrie;
    alias spacingMark = mcTrie;
    static if (getValue)
        alias Value = Grapheme;
    else
        alias Value = void;

    Value genericDecodeGrapheme(Input)(ref Input range)
    {
        import std.internal.unicode_tables : isHangL, isHangT, isHangV; // generated file
        enum GraphemeState {
            Start,
            CR,
            RI,
            L,
            V,
            LVT
        }
        static if (getValue)
            Grapheme grapheme;
        auto state = GraphemeState.Start;
        // mixed in wherever the current code point is accepted into the cluster
        enum eat = q{
            static if (getValue)
                grapheme ~= ch;
            range.popFront();
        };

        dchar ch;
        assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof);
        while (!range.empty)
        {
            ch = range.front;
            final switch (state) with(GraphemeState)
            {
            case Start:
                // the first code point is always part of the cluster
                mixin(eat);
                if (ch == '\r')
                    state = CR;
                else if (isRegionalIndicator(ch))
                    state = RI;
                else if (isHangL(ch))
                    state = L;
                else if (hangLV[ch] || isHangV(ch))
                    state = V;
                else if (hangLVT[ch])
                    state = LVT;
                else if (isHangT(ch))
                    state = LVT;
                else
                {
                    switch (ch)
                    {
                    // control characters end the cluster without trailing marks
                    mixin(controlSwitch);
                        goto L_End;
                    default:
                        goto L_End_Extend;
                    }
                }
            break;
            case CR:
                if (ch == '\n')
                    mixin(eat);
                goto L_End_Extend;
            case RI:
                if (isRegionalIndicator(ch))
                    mixin(eat);
                else
                    goto L_End_Extend;
            break;
            case L:
                if (isHangL(ch))
                    mixin(eat);
                else if (isHangV(ch) || hangLV[ch])
                {
                    state = V;
                    mixin(eat);
                }
                else if (hangLVT[ch])
                {
                    state = LVT;
                    mixin(eat);
                }
                else
                    goto L_End_Extend;
            break;
            case V:
                if (isHangV(ch))
                    mixin(eat);
                else if (isHangT(ch))
                {
                    state = LVT;
                    mixin(eat);
                }
                else
                    goto L_End_Extend;
            break;
            case LVT:
                if (isHangT(ch))
                {
                    mixin(eat);
                }
                else
                    goto L_End_Extend;
            break;
            }
        }
    L_End_Extend:
        while (!range.empty)
        {
            ch = range.front;
            // extend & spacing marks
            if (!graphemeExtend[ch] && !spacingMark[ch])
                break;
            mixin(eat);
        }
    L_End:
        static if (getValue)
            return grapheme;
    }

}

public: // Public API continues

/++
    Computes the length of grapheme cluster starting at `index`.
    Both the resulting length and the `index` are measured
    in $(S_LINK Code unit, code units).

    Params:
        C = type that is implicitly convertible to `dchar`
        input = array of grapheme clusters
        index = starting index into `input[]`

    Returns:
        length of grapheme cluster
+/
size_t graphemeStride(C)(const scope C[] input, size_t index) @safe pure
if (is(C : dchar))
{
    auto src = input[index..$];
    auto n = src.length;
    // decoding advances `src`; the stride is how much of it was consumed
    genericDecodeGrapheme!(false)(src);
    return n - src.length;
}

///
@safe unittest
{
    assert(graphemeStride("  ", 1) == 1);
    // A + combining ring above
    string city = "A\u030Arhus";
    size_t first = graphemeStride(city, 0);
    assert(first == 3); //\u030A has 2 UTF-8 code units
    assert(city[0 .. first] == "A\u030A");
    assert(city[first..$] == "rhus");
}

@safe unittest
{
    // Ensure that graphemeStride is usable from CTFE.
enum c1 = graphemeStride("A", 0);
    static assert(c1 == 1);

    enum c2 = graphemeStride("A\u0301", 0);
    static assert(c2 == 3); // \u0301 has 2 UTF-8 code units
}

/++
    Reads one full grapheme cluster from an
    $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of dchar `inp`.

    For examples see the $(LREF Grapheme) below.

    Note:
    This function modifies `inp` and thus `inp`
    must be an L-value.
+/
Grapheme decodeGrapheme(Input)(ref Input inp)
if (isInputRange!Input && is(immutable ElementType!Input == immutable dchar))
{
    return genericDecodeGrapheme!true(inp);
}

@safe unittest
{
    import std.algorithm.comparison : equal;

    Grapheme gr;
    string s = " \u0020\u0308 ";
    gr = decodeGrapheme(s);
    assert(gr.length == 1 && gr[0] == ' ');
    gr = decodeGrapheme(s);
    assert(gr.length == 2 && equal(gr[0 .. 2], " \u0308"));
    s = "\u0300\u0308\u1100";
    assert(equal(decodeGrapheme(s)[], "\u0300\u0308"));
    assert(equal(decodeGrapheme(s)[], "\u1100"));
    s = "\u11A8\u0308\uAC01";
    assert(equal(decodeGrapheme(s)[], "\u11A8\u0308"));
    assert(equal(decodeGrapheme(s)[], "\uAC01"));
}

/++
    $(P Iterate a string by $(LREF Grapheme).)

    $(P Useful for doing string manipulation that needs to be aware
    of graphemes.)

    See_Also:
        $(LREF byCodePoint)
+/
auto byGrapheme(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
{
    // TODO: Bidirectional access
    static struct Result(R)
    {
        private R _range;
        // the cluster currently exposed by `front`; a zero-length value
        // doubles as the "exhausted" marker checked by `empty`
        private Grapheme _front;

        bool empty() @property
        {
            return _front.length == 0;
        }

        Grapheme front() @property
        {
            return _front;
        }

        void popFront()
        {
            _front = _range.empty ? Grapheme.init : _range.decodeGrapheme();
        }

        static if (isForwardRange!R)
        {
            Result save() @property
            {
                return Result(_range.save, _front);
            }
        }
    }

    auto result = Result!(Range)(range);
    // prime `_front` with the first cluster (if any)
    result.popFront();
    return result;
}

///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range.primitives : walkLength;
    import std.range : take, drop;
    auto text = "noe\u0308l"; // noël using e + combining diaeresis
    assert(text.walkLength == 5); // 5 code points

    auto gText = text.byGrapheme;
    assert(gText.walkLength == 4); // 4 graphemes

    assert(gText.take(3).equal("noe\u0308".byGrapheme));
    assert(gText.drop(3).equal("l".byGrapheme));
}

// For testing non-forward-range input ranges
version (StdUnittest)
private static @safe struct InputRangeString
{
    private string s;

    bool empty() @property { return s.empty; }
    dchar front() @property { return s.front; }
    void popFront() { s.popFront(); }
}

@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.array : array;
    import std.range : retro;
    import std.range.primitives : walkLength;
    assert("".byGrapheme.walkLength == 0);

    auto reverse = "le\u0308on";
    assert(reverse.walkLength == 5);

    auto gReverse = reverse.byGrapheme;
    assert(gReverse.walkLength == 4);

    static foreach (text; AliasSeq!("noe\u0308l"c, "noe\u0308l"w, "noe\u0308l"d))
    {{
        assert(text.walkLength == 5);
        static assert(isForwardRange!(typeof(text)));

        auto gText = text.byGrapheme;
        static assert(isForwardRange!(typeof(gText)));
        assert(gText.walkLength == 4);
        assert(gText.array.retro.equal(gReverse));
    }}

    auto nonForwardRange = InputRangeString("noe\u0308l").byGrapheme;
    static assert(!isForwardRange!(typeof(nonForwardRange)));
assert(nonForwardRange.walkLength == 4);
}

/++
    $(P Lazily transform a range of $(LREF Grapheme)s to a range of code points.)

    $(P Useful for converting the result to a string after doing operations
    on graphemes.)

    $(P If passed in a range of code points, returns a range with equivalent capabilities.)
+/
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable Grapheme))
{
    // TODO: Propagate bidirectional access
    static struct Result
    {
        private Range _range;
        // index of the current code point within `_range.front`
        private size_t i = 0;

        bool empty() @property
        {
            return _range.empty;
        }

        dchar front() @property
        {
            return _range.front[i];
        }

        void popFront()
        {
            ++i;

            // current grapheme exhausted: move on to the next one
            if (i >= _range.front.length)
            {
                _range.popFront();
                i = 0;
            }
        }

        static if (isForwardRange!Range)
        {
            Result save() @property
            {
                return Result(_range.save, i);
            }
        }
    }

    return Result(range);
}

/// Ditto
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
{
    import std.range.primitives : isBidirectionalRange, popBack;
    import std.traits : isNarrowString;
    static if (isNarrowString!Range)
    {
        // narrow strings are wrapped so that the result exposes only
        // decoded code points (no indexing, no length)
        static struct Result
        {
            private Range _range;
            @property bool empty() { return _range.empty; }
            @property dchar front(){ return _range.front; }
            void popFront(){ _range.popFront; }
            @property auto save() { return Result(_range.save); }
            @property dchar back(){ return _range.back; }
            void popBack(){ _range.popBack; }
        }
        static assert(isBidirectionalRange!(Result));
        return Result(range);
    }
    else
        return range;
}

///
@safe unittest
{
    import std.array : array;
    import std.conv : text;
    import std.range : retro;

    string s = "noe\u0308l"; // noël

    // reverse it and convert the result to a string
    string reverse = s.byGrapheme
        .array
        .retro
        .byCodePoint
        .text;

    assert(reverse == "le\u0308on"); // lëon
}

@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range.primitives : walkLength;
    import std.range : retro;
    assert("".byGrapheme.byCodePoint.equal(""));

    string text = "noe\u0308l";
    static assert(!__traits(compiles, "noe\u0308l".byCodePoint.length));

    auto gText = InputRangeString(text).byGrapheme;
    static assert(!isForwardRange!(typeof(gText)));

    auto cpText = gText.byCodePoint;
    static assert(!isForwardRange!(typeof(cpText)));

    assert(cpText.walkLength == text.walkLength);

    auto plainCp = text.byCodePoint;
    static assert(isForwardRange!(typeof(plainCp)));
    assert(equal(plainCp, text));
    assert(equal(retro(plainCp.save), retro(text.save)));
    // Check that we still have length for dstring
    // (five non-ASCII letters, spelled as escapes to stay encoding-proof)
    assert("\u0430\u0431\u0432\u0433\u0434"d.byCodePoint.length == 5);
}

/++
    $(P A structure designed to effectively pack $(CHARACTERS)
    of a $(CLUSTER).
    )

    $(P `Grapheme` has value semantics so 2 copies of a `Grapheme`
    always refer to distinct objects. In most actual scenarios a `Grapheme`
    fits on the stack and avoids memory allocation overhead for all but quite
    long clusters.
    )

    See_Also: $(LREF decodeGrapheme), $(LREF graphemeStride)
+/
@safe struct Grapheme
{
    import std.exception : enforce;
    import std.traits : isDynamicArray;

public:
    /// Ctor
    this(C)(const scope C[] chars...)
7416 if (is(C : dchar)) 7417 { 7418 this ~= chars; 7419 } 7420 7421 ///ditto 7422 this(Input)(Input seq) 7423 if (!isDynamicArray!Input 7424 && isInputRange!Input && is(ElementType!Input : dchar)) 7425 { 7426 this ~= seq; 7427 } 7428 7429 /// Gets a $(CODEPOINT) at the given index in this cluster. 7430 dchar opIndex(size_t index) const @nogc nothrow pure @trusted 7431 { 7432 assert(index < length); 7433 return read24(isBig ? ptr_ : small_.ptr, index); 7434 } 7435 7436 /++ 7437 Writes a $(CODEPOINT) `ch` at given index in this cluster. 7438 7439 Warning: 7440 Use of this facility may invalidate grapheme cluster, 7441 see also $(LREF Grapheme.valid). 7442 +/ 7443 void opIndexAssign(dchar ch, size_t index) @nogc nothrow pure @trusted 7444 { 7445 assert(index < length); 7446 write24(isBig ? ptr_ : small_.ptr, ch, index); 7447 } 7448 7449 /// 7450 @safe unittest 7451 { 7452 auto g = Grapheme("A\u0302"); 7453 assert(g[0] == 'A'); 7454 assert(g.valid); 7455 g[1] = '~'; // ASCII tilda is not a combining mark 7456 assert(g[1] == '~'); 7457 assert(!g.valid); 7458 } 7459 7460 /++ 7461 Random-access range over Grapheme's $(CHARACTERS). 7462 7463 Warning: Invalidates when this Grapheme leaves the scope, 7464 attempts to use it then would lead to memory corruption. 7465 +/ 7466 SliceOverIndexed!Grapheme opSlice(size_t a, size_t b) @nogc nothrow pure return 7467 { 7468 return sliceOverIndexed(a, b, &this); 7469 } 7470 7471 /// ditto 7472 SliceOverIndexed!Grapheme opSlice() @nogc nothrow pure return 7473 { 7474 return sliceOverIndexed(0, length, &this); 7475 } 7476 7477 /// Grapheme cluster length in $(CODEPOINTS). 7478 @property size_t length() const @nogc nothrow pure 7479 { 7480 return isBig ? len_ : slen_ & 0x7F; 7481 } 7482 7483 /++ 7484 Append $(CHARACTER) `ch` to this grapheme. 7485 Warning: 7486 Use of this facility may invalidate grapheme cluster, 7487 see also `valid`. 
7488 7489 See_Also: $(LREF Grapheme.valid) 7490 +/ 7491 ref opOpAssign(string op)(dchar ch) @trusted 7492 { 7493 static if (op == "~") 7494 { 7495 import std.internal.memory : enforceRealloc; 7496 if (!isBig) 7497 { 7498 if (slen_ == small_cap) 7499 convertToBig();// & fallthrough to "big" branch 7500 else 7501 { 7502 write24(small_.ptr, ch, smallLength); 7503 slen_++; 7504 return this; 7505 } 7506 } 7507 7508 assert(isBig); 7509 if (len_ == cap_) 7510 { 7511 import core.checkedint : addu, mulu; 7512 bool overflow; 7513 cap_ = addu(cap_, grow, overflow); 7514 auto nelems = mulu(3, addu(cap_, 1, overflow), overflow); 7515 if (overflow) assert(0); 7516 ptr_ = cast(ubyte*) enforceRealloc(ptr_, nelems); 7517 } 7518 write24(ptr_, ch, len_++); 7519 return this; 7520 } 7521 else 7522 static assert(false, "No operation "~op~" defined for Grapheme"); 7523 } 7524 7525 /// 7526 @safe unittest 7527 { 7528 import std.algorithm.comparison : equal; 7529 auto g = Grapheme("A"); 7530 assert(g.valid); 7531 g ~= '\u0301'; 7532 assert(g[].equal("A\u0301")); 7533 assert(g.valid); 7534 g ~= "B"; 7535 // not a valid grapheme cluster anymore 7536 assert(!g.valid); 7537 // still could be useful though 7538 assert(g[].equal("A\u0301B")); 7539 } 7540 7541 /// Append all $(CHARACTERS) from the input range `inp` to this Grapheme. 7542 ref opOpAssign(string op, Input)(scope Input inp) 7543 if (isInputRange!Input && is(ElementType!Input : dchar)) 7544 { 7545 static if (op == "~") 7546 { 7547 foreach (dchar ch; inp) 7548 this ~= ch; 7549 return this; 7550 } 7551 else 7552 static assert(false, "No operation "~op~" defined for Grapheme"); 7553 } 7554 7555 /++ 7556 True if this object contains valid extended grapheme cluster. 7557 Decoding primitives of this module always return a valid `Grapheme`. 7558 7559 Appending to and direct manipulation of grapheme's $(CHARACTERS) may 7560 render it no longer valid. 
Certain applications may chose to use 7561 Grapheme as a "small string" of any $(CODEPOINTS) and ignore this property 7562 entirely. 7563 +/ 7564 @property bool valid()() /*const*/ 7565 { 7566 auto r = this[]; 7567 genericDecodeGrapheme!false(r); 7568 return r.length == 0; 7569 } 7570 7571 this(this) @nogc nothrow pure @trusted 7572 { 7573 import std.internal.memory : enforceMalloc; 7574 if (isBig) 7575 {// dup it 7576 import core.checkedint : addu, mulu; 7577 bool overflow; 7578 auto raw_cap = mulu(3, addu(cap_, 1, overflow), overflow); 7579 if (overflow) assert(0); 7580 7581 auto p = cast(ubyte*) enforceMalloc(raw_cap); 7582 p[0 .. raw_cap] = ptr_[0 .. raw_cap]; 7583 ptr_ = p; 7584 } 7585 } 7586 7587 ~this() @nogc nothrow pure @trusted 7588 { 7589 import core.memory : pureFree; 7590 if (isBig) 7591 { 7592 pureFree(ptr_); 7593 } 7594 } 7595 7596 7597private: 7598 enum small_bytes = ((ubyte*).sizeof+3*size_t.sizeof-1); 7599 // "out of the blue" grow rate, needs testing 7600 // (though graphemes are typically small < 9) 7601 enum grow = 20; 7602 enum small_cap = small_bytes/3; 7603 enum small_flag = 0x80, small_mask = 0x7F; 7604 // 16 bytes in 32bits, should be enough for the majority of cases 7605 union 7606 { 7607 struct 7608 { 7609 ubyte* ptr_; 7610 size_t cap_; 7611 size_t len_; 7612 size_t padding_; 7613 } 7614 struct 7615 { 7616 ubyte[small_bytes] small_; 7617 ubyte slen_; 7618 } 7619 } 7620 7621 void convertToBig() @nogc nothrow pure @trusted 7622 { 7623 import std.internal.memory : enforceMalloc; 7624 static assert(grow.max / 3 - 1 >= grow); 7625 enum nbytes = 3 * (grow + 1); 7626 size_t k = smallLength; 7627 ubyte* p = cast(ubyte*) enforceMalloc(nbytes); 7628 for (int i=0; i<k; i++) 7629 write24(p, read24(small_.ptr, i), i); 7630 // now we can overwrite small array data 7631 ptr_ = p; 7632 len_ = slen_; 7633 assert(grow > len_); 7634 cap_ = grow; 7635 setBig(); 7636 } 7637 7638 void setBig() @nogc nothrow pure { slen_ |= small_flag; } 7639 7640 @property 
size_t smallLength() const @nogc nothrow pure 7641 { 7642 return slen_ & small_mask; 7643 } 7644 @property ubyte isBig() const @nogc nothrow pure 7645 { 7646 return slen_ & small_flag; 7647 } 7648} 7649 7650static assert(Grapheme.sizeof == size_t.sizeof*4); 7651 7652 7653@safe pure /*nothrow @nogc*/ unittest // TODO: string .front is GC and throw 7654{ 7655 import std.algorithm.comparison : equal; 7656 Grapheme[3] data = [Grapheme("��"), Grapheme("��"), Grapheme("��")]; 7657 assert(byGrapheme("������").equal(data[])); 7658} 7659 7660/// 7661@safe unittest 7662{ 7663 import std.algorithm.comparison : equal; 7664 import std.algorithm.iteration : filter; 7665 import std.range : isRandomAccessRange; 7666 7667 string bold = "ku\u0308hn"; 7668 7669 // note that decodeGrapheme takes parameter by ref 7670 auto first = decodeGrapheme(bold); 7671 7672 assert(first.length == 1); 7673 assert(first[0] == 'k'); 7674 7675 // the next grapheme is 2 characters long 7676 auto wideOne = decodeGrapheme(bold); 7677 // slicing a grapheme yields a random-access range of dchar 7678 assert(wideOne[].equal("u\u0308")); 7679 assert(wideOne.length == 2); 7680 static assert(isRandomAccessRange!(typeof(wideOne[]))); 7681 7682 // all of the usual range manipulation is possible 7683 assert(wideOne[].filter!isMark().equal("\u0308")); 7684 7685 auto g = Grapheme("A"); 7686 assert(g.valid); 7687 g ~= '\u0301'; 7688 assert(g[].equal("A\u0301")); 7689 assert(g.valid); 7690 g ~= "B"; 7691 // not a valid grapheme cluster anymore 7692 assert(!g.valid); 7693 // still could be useful though 7694 assert(g[].equal("A\u0301B")); 7695} 7696 7697@safe unittest 7698{ 7699 auto g = Grapheme("A\u0302"); 7700 assert(g[0] == 'A'); 7701 assert(g.valid); 7702 g[1] = '~'; // ASCII tilda is not a combining mark 7703 assert(g[1] == '~'); 7704 assert(!g.valid); 7705} 7706 7707@safe unittest 7708{ 7709 import std.algorithm.comparison : equal; 7710 import std.algorithm.iteration : map; 7711 import std.conv : text; 7712 
import std.range : iota; 7713 7714 // not valid clusters (but it just a test) 7715 auto g = Grapheme('a', 'b', 'c', 'd', 'e'); 7716 assert(g[0] == 'a'); 7717 assert(g[1] == 'b'); 7718 assert(g[2] == 'c'); 7719 assert(g[3] == 'd'); 7720 assert(g[4] == 'e'); 7721 g[3] = '��'; 7722 assert(g[2] == 'c'); 7723 assert(g[3] == '��', text(g[3], " vs ", '��')); 7724 assert(g[4] == 'e'); 7725 assert(!g.valid); 7726 7727 g ~= '��'; 7728 g ~= '~'; 7729 assert(g[0] == 'a'); 7730 assert(g[1] == 'b'); 7731 assert(g[2] == 'c'); 7732 assert(g[3] == '��'); 7733 assert(g[4] == 'e'); 7734 assert(g[5] == '��'); 7735 assert(g[6] == '~'); 7736 assert(!g.valid); 7737 7738 Grapheme copy = g; 7739 copy[0] = 'X'; 7740 copy[1] = '-'; 7741 assert(g[0] == 'a' && copy[0] == 'X'); 7742 assert(g[1] == 'b' && copy[1] == '-'); 7743 assert(equal(g[2 .. g.length], copy[2 .. copy.length])); 7744 copy = Grapheme("��������������������������"); 7745 assert(equal(copy[0 .. 8], "����������������"), text(copy[0 .. 8])); 7746 copy ~= "xyz"; 7747 assert(equal(copy[13 .. 15], "xy"), text(copy[13 .. 15])); 7748 assert(!copy.valid); 7749 7750 Grapheme h; 7751 foreach (dchar v; iota(cast(int)'A', cast(int)'Z'+1).map!"cast(dchar)a"()) 7752 h ~= v; 7753 assert(equal(h[], iota(cast(int)'A', cast(int)'Z'+1))); 7754} 7755 7756/++ 7757 $(P Does basic case-insensitive comparison of `r1` and `r2`. 7758 This function uses simpler comparison rule thus achieving better performance 7759 than $(LREF icmp). However keep in mind the warning below.) 
7760 7761 Params: 7762 r1 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters 7763 r2 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters 7764 7765 Returns: 7766 An `int` that is 0 if the strings match, 7767 <0 if `r1` is lexicographically "less" than `r2`, 7768 >0 if `r1` is lexicographically "greater" than `r2` 7769 7770 Warning: 7771 This function only handles 1:1 $(CODEPOINT) mapping 7772 and thus is not sufficient for certain alphabets 7773 like German, Greek and few others. 7774 7775 See_Also: 7776 $(LREF icmp) 7777 $(REF cmp, std,algorithm,comparison) 7778+/ 7779int sicmp(S1, S2)(scope S1 r1, scope S2 r2) 7780if (isInputRange!S1 && isSomeChar!(ElementEncodingType!S1) 7781 && isInputRange!S2 && isSomeChar!(ElementEncodingType!S2)) 7782{ 7783 import std.internal.unicode_tables : sTable = simpleCaseTable; // generated file 7784 import std.range.primitives : isInfinite; 7785 import std.utf : decodeFront; 7786 import std.traits : isDynamicArray; 7787 import std.typecons : Yes; 7788 static import std.ascii; 7789 7790 static if ((isDynamicArray!S1 || isRandomAccessRange!S1) 7791 && (isDynamicArray!S2 || isRandomAccessRange!S2) 7792 && !(isInfinite!S1 && isInfinite!S2) 7793 && __traits(compiles, 7794 { 7795 size_t s = size_t.sizeof / 2; 7796 r1 = r1[s .. $]; 7797 r2 = r2[s .. $]; 7798 })) 7799 {{ 7800 // ASCII optimization for dynamic arrays & similar. 7801 size_t i = 0; 7802 static if (isInfinite!S1) 7803 immutable end = r2.length; 7804 else static if (isInfinite!S2) 7805 immutable end = r1.length; 7806 else 7807 immutable end = r1.length > r2.length ? 
r2.length : r1.length; 7808 for (; i < end; ++i) 7809 { 7810 auto lhs = r1[i]; 7811 auto rhs = r2[i]; 7812 if ((lhs | rhs) >= 0x80) goto NonAsciiPath; 7813 if (lhs == rhs) continue; 7814 auto lowDiff = std.ascii.toLower(lhs) - std.ascii.toLower(rhs); 7815 if (lowDiff) return lowDiff; 7816 } 7817 static if (isInfinite!S1) 7818 return 1; 7819 else static if (isInfinite!S2) 7820 return -1; 7821 else 7822 return (r1.length > r2.length) - (r2.length > r1.length); 7823 7824 NonAsciiPath: 7825 r1 = r1[i .. $]; 7826 r2 = r2[i .. $]; 7827 // Fall through to standard case. 7828 }} 7829 7830 while (!r1.empty) 7831 { 7832 immutable lhs = decodeFront!(Yes.useReplacementDchar)(r1); 7833 if (r2.empty) 7834 return 1; 7835 immutable rhs = decodeFront!(Yes.useReplacementDchar)(r2); 7836 int diff = lhs - rhs; 7837 if (!diff) 7838 continue; 7839 if ((lhs | rhs) < 0x80) 7840 { 7841 immutable d = std.ascii.toLower(lhs) - std.ascii.toLower(rhs); 7842 if (!d) continue; 7843 return d; 7844 } 7845 size_t idx = simpleCaseTrie[lhs]; 7846 size_t idx2 = simpleCaseTrie[rhs]; 7847 // simpleCaseTrie is packed index table 7848 if (idx != EMPTY_CASE_TRIE) 7849 { 7850 if (idx2 != EMPTY_CASE_TRIE) 7851 {// both cased chars 7852 // adjust idx --> start of bucket 7853 idx = idx - sTable[idx].n; 7854 idx2 = idx2 - sTable[idx2].n; 7855 if (idx == idx2)// one bucket, equivalent chars 7856 continue; 7857 else// not the same bucket 7858 diff = sTable[idx].ch - sTable[idx2].ch; 7859 } 7860 else 7861 diff = sTable[idx - sTable[idx].n].ch - rhs; 7862 } 7863 else if (idx2 != EMPTY_CASE_TRIE) 7864 { 7865 diff = lhs - sTable[idx2 - sTable[idx2].n].ch; 7866 } 7867 // one of chars is not cased at all 7868 return diff; 7869 } 7870 return int(r2.empty) - 1; 7871} 7872 7873/// 7874@safe @nogc pure nothrow unittest 7875{ 7876 assert(sicmp("������������", "������������") == 0); 7877 // Greek also works as long as there is no 1:M mapping in sight 7878 assert(sicmp("����", "����") == 0); 7879 // things like the following 
won't get matched as equal 7880 // Greek small letter iota with dialytika and tonos 7881 assert(sicmp("��", "\u03B9\u0308\u0301") != 0); 7882 7883 // while icmp has no problem with that 7884 assert(icmp("��", "\u03B9\u0308\u0301") == 0); 7885 assert(icmp("����", "����") == 0); 7886} 7887 7888// overloads for the most common cases to reduce compile time 7889@safe @nogc pure nothrow 7890{ 7891 int sicmp(scope const(char)[] str1, scope const(char)[] str2) 7892 { return sicmp!(const(char)[], const(char)[])(str1, str2); } 7893 7894 int sicmp(scope const(wchar)[] str1, scope const(wchar)[] str2) 7895 { return sicmp!(const(wchar)[], const(wchar)[])(str1, str2); } 7896 7897 int sicmp(scope const(dchar)[] str1, scope const(dchar)[] str2) 7898 { return sicmp!(const(dchar)[], const(dchar)[])(str1, str2); } 7899} 7900 7901private int fullCasedCmp(Range)(dchar lhs, dchar rhs, ref Range rtail) 7902{ 7903 import std.algorithm.searching : skipOver; 7904 import std.internal.unicode_tables : fullCaseTable; // generated file 7905 alias fTable = fullCaseTable; 7906 size_t idx = fullCaseTrie[lhs]; 7907 // fullCaseTrie is packed index table 7908 if (idx == EMPTY_CASE_TRIE) 7909 return lhs; 7910 immutable start = idx - fTable[idx].n; 7911 immutable end = fTable[idx].size + start; 7912 assert(fTable[start].entry_len == 1); 7913 for (idx=start; idx<end; idx++) 7914 { 7915 auto entryLen = fTable[idx].entry_len; 7916 if (entryLen == 1) 7917 { 7918 if (fTable[idx].seq[0] == rhs) 7919 { 7920 return 0; 7921 } 7922 } 7923 else 7924 {// OK it's a long chunk, like 'ss' for German 7925 dstring seq = fTable[idx].seq[0 .. entryLen]; 7926 if (rhs == seq[0] 7927 && rtail.skipOver(seq[1..$])) 7928 { 7929 // note that this path modifies rtail 7930 // iff we managed to get there 7931 return 0; 7932 } 7933 } 7934 } 7935 return fTable[start].seq[0]; // new remapped character for accurate diffs 7936} 7937 7938/++ 7939 Does case insensitive comparison of `r1` and `r2`. 
7940 Follows the rules of full case-folding mapping. 7941 This includes matching as equal german �� with "ss" and 7942 other 1:M $(CODEPOINT) mappings unlike $(LREF sicmp). 7943 The cost of `icmp` being pedantically correct is 7944 slightly worse performance. 7945 7946 Params: 7947 r1 = a forward range of characters 7948 r2 = a forward range of characters 7949 7950 Returns: 7951 An `int` that is 0 if the strings match, 7952 <0 if `str1` is lexicographically "less" than `str2`, 7953 >0 if `str1` is lexicographically "greater" than `str2` 7954 7955 See_Also: 7956 $(LREF sicmp) 7957 $(REF cmp, std,algorithm,comparison) 7958+/ 7959int icmp(S1, S2)(S1 r1, S2 r2) 7960if (isForwardRange!S1 && isSomeChar!(ElementEncodingType!S1) 7961 && isForwardRange!S2 && isSomeChar!(ElementEncodingType!S2)) 7962{ 7963 import std.range.primitives : isInfinite; 7964 import std.traits : isDynamicArray; 7965 import std.utf : byDchar; 7966 static import std.ascii; 7967 7968 static if ((isDynamicArray!S1 || isRandomAccessRange!S1) 7969 && (isDynamicArray!S2 || isRandomAccessRange!S2) 7970 && !(isInfinite!S1 && isInfinite!S2) 7971 && __traits(compiles, 7972 { 7973 size_t s = size_t.max / 2; 7974 r1 = r1[s .. $]; 7975 r2 = r2[s .. $]; 7976 })) 7977 {{ 7978 // ASCII optimization for dynamic arrays & similar. 7979 size_t i = 0; 7980 static if (isInfinite!S1) 7981 immutable end = r2.length; 7982 else static if (isInfinite!S2) 7983 immutable end = r1.length; 7984 else 7985 immutable end = r1.length > r2.length ? 
r2.length : r1.length; 7986 for (; i < end; ++i) 7987 { 7988 auto lhs = r1[i]; 7989 auto rhs = r2[i]; 7990 if ((lhs | rhs) >= 0x80) goto NonAsciiPath; 7991 if (lhs == rhs) continue; 7992 auto lowDiff = std.ascii.toLower(lhs) - std.ascii.toLower(rhs); 7993 if (lowDiff) return lowDiff; 7994 } 7995 static if (isInfinite!S1) 7996 return 1; 7997 else static if (isInfinite!S2) 7998 return -1; 7999 else 8000 return (r1.length > r2.length) - (r2.length > r1.length); 8001 8002 NonAsciiPath: 8003 r1 = r1[i .. $]; 8004 r2 = r2[i .. $]; 8005 // Fall through to standard case. 8006 }} 8007 8008 auto str1 = r1.byDchar; 8009 auto str2 = r2.byDchar; 8010 8011 for (;;) 8012 { 8013 if (str1.empty) 8014 return str2.empty ? 0 : -1; 8015 immutable lhs = str1.front; 8016 if (str2.empty) 8017 return 1; 8018 immutable rhs = str2.front; 8019 str1.popFront(); 8020 str2.popFront(); 8021 if (!(lhs - rhs)) 8022 continue; 8023 // first try to match lhs to <rhs,right-tail> sequence 8024 immutable cmpLR = fullCasedCmp(lhs, rhs, str2); 8025 if (!cmpLR) 8026 continue; 8027 // then rhs to <lhs,left-tail> sequence 8028 immutable cmpRL = fullCasedCmp(rhs, lhs, str1); 8029 if (!cmpRL) 8030 continue; 8031 // cmpXX contain remapped codepoints 8032 // to obtain stable ordering of icmp 8033 return cmpLR - cmpRL; 8034 } 8035} 8036 8037/// 8038@safe @nogc pure nothrow unittest 8039{ 8040 assert(icmp("Ru��land", "Russland") == 0); 8041 assert(icmp("��� -> \u1F70\u03B9", "\u1F61\u03B9 -> ���") == 0); 8042} 8043 8044/** 8045 * By using $(REF byUTF, std,utf) and its aliases, GC allocations via auto-decoding 8046 * and thrown exceptions can be avoided, making `icmp` `@safe @nogc nothrow pure`. 
8047 */ 8048@safe @nogc nothrow pure unittest 8049{ 8050 import std.utf : byDchar; 8051 8052 assert(icmp("Ru��land".byDchar, "Russland".byDchar) == 0); 8053 assert(icmp("��� -> \u1F70\u03B9".byDchar, "\u1F61\u03B9 -> ���".byDchar) == 0); 8054} 8055 8056// test different character types 8057@safe unittest 8058{ 8059 assert(icmp("Ru��land", "Russland") == 0); 8060 assert(icmp("Ru��land"w, "Russland") == 0); 8061 assert(icmp("Ru��land", "Russland"w) == 0); 8062 assert(icmp("Ru��land"w, "Russland"w) == 0); 8063 assert(icmp("Ru��land"d, "Russland"w) == 0); 8064 assert(icmp("Ru��land"w, "Russland"d) == 0); 8065} 8066 8067// overloads for the most common cases to reduce compile time 8068@safe @nogc pure nothrow 8069{ 8070 int icmp(const(char)[] str1, const(char)[] str2) 8071 { return icmp!(const(char)[], const(char)[])(str1, str2); } 8072 int icmp(const(wchar)[] str1, const(wchar)[] str2) 8073 { return icmp!(const(wchar)[], const(wchar)[])(str1, str2); } 8074 int icmp(const(dchar)[] str1, const(dchar)[] str2) 8075 { return icmp!(const(dchar)[], const(dchar)[])(str1, str2); } 8076} 8077 8078@safe unittest 8079{ 8080 import std.algorithm.sorting : sort; 8081 import std.conv : to; 8082 import std.exception : assertCTFEable; 8083 assertCTFEable!( 8084 { 8085 static foreach (cfunc; AliasSeq!(icmp, sicmp)) 8086 {{ 8087 static foreach (S1; AliasSeq!(string, wstring, dstring)) 8088 static foreach (S2; AliasSeq!(string, wstring, dstring)) 8089 { 8090 assert(cfunc("".to!S1(), "".to!S2()) == 0); 8091 assert(cfunc("A".to!S1(), "".to!S2()) > 0); 8092 assert(cfunc("".to!S1(), "0".to!S2()) < 0); 8093 assert(cfunc("abc".to!S1(), "abc".to!S2()) == 0); 8094 assert(cfunc("abcd".to!S1(), "abc".to!S2()) > 0); 8095 assert(cfunc("abc".to!S1(), "abcd".to!S2()) < 0); 8096 assert(cfunc("Abc".to!S1(), "aBc".to!S2()) == 0); 8097 assert(cfunc("������������".to!S1(), "������������".to!S2()) == 0); 8098 // Check example: 8099 assert(cfunc("������������".to!S1(), "������������".to!S2()) == 0); 8100 
assert(cfunc("����".to!S1(), "����".to!S2()) == 0); 8101 } 8102 // check that the order is properly agnostic to the case 8103 auto strs = [ "Apple", "ORANGE", "orAcle", "amp", "banana"]; 8104 sort!((a,b) => cfunc(a,b) < 0)(strs); 8105 assert(strs == ["amp", "Apple", "banana", "orAcle", "ORANGE"]); 8106 }} 8107 assert(icmp("��b", "ssa") > 0); 8108 // Check example: 8109 assert(icmp("Russland", "Ru��land") == 0); 8110 assert(icmp("��� -> \u1F70\u03B9", "\u1F61\u03B9 -> ���") == 0); 8111 assert(icmp("��"w, "\u03B9\u0308\u0301") == 0); 8112 assert(sicmp("��", "\u03B9\u0308\u0301") != 0); 8113 // https://issues.dlang.org/show_bug.cgi?id=11057 8114 assert( icmp("K", "L") < 0 ); 8115 }); 8116} 8117 8118// https://issues.dlang.org/show_bug.cgi?id=17372 8119@safe pure unittest 8120{ 8121 import std.algorithm.iteration : joiner, map; 8122 import std.algorithm.sorting : sort; 8123 import std.array : array; 8124 auto a = [["foo", "bar"], ["baz"]].map!(line => line.joiner(" ")).array.sort!((a, b) => icmp(a, b) < 0); 8125} 8126 8127// This is package(std) for the moment to be used as a support tool for std.regex 8128// It needs a better API 8129/* 8130 Return a range of all $(CODEPOINTS) that casefold to 8131 and from this `ch`. 8132*/ 8133package(std) auto simpleCaseFoldings(dchar ch) @safe 8134{ 8135 import std.internal.unicode_tables : simpleCaseTable; // generated file 8136 alias sTable = simpleCaseTable; 8137 static struct Range 8138 { 8139 @safe pure nothrow: 8140 uint idx; //if == uint.max, then read c. 
8141 union 8142 { 8143 dchar c; // == 0 - empty range 8144 uint len; 8145 } 8146 @property bool isSmall() const { return idx == uint.max; } 8147 8148 this(dchar ch) 8149 { 8150 idx = uint.max; 8151 c = ch; 8152 } 8153 8154 this(uint start, uint size) 8155 { 8156 idx = start; 8157 len = size; 8158 } 8159 8160 @property dchar front() const 8161 { 8162 assert(!empty); 8163 if (isSmall) 8164 { 8165 return c; 8166 } 8167 auto ch = sTable[idx].ch; 8168 return ch; 8169 } 8170 8171 @property bool empty() const 8172 { 8173 if (isSmall) 8174 { 8175 return c == 0; 8176 } 8177 return len == 0; 8178 } 8179 8180 @property size_t length() const 8181 { 8182 if (isSmall) 8183 { 8184 return c == 0 ? 0 : 1; 8185 } 8186 return len; 8187 } 8188 8189 void popFront() 8190 { 8191 if (isSmall) 8192 c = 0; 8193 else 8194 { 8195 idx++; 8196 len--; 8197 } 8198 } 8199 } 8200 immutable idx = simpleCaseTrie[ch]; 8201 if (idx == EMPTY_CASE_TRIE) 8202 return Range(ch); 8203 auto entry = sTable[idx]; 8204 immutable start = idx - entry.n; 8205 return Range(start, entry.size); 8206} 8207 8208@safe unittest 8209{ 8210 import std.algorithm.comparison : equal; 8211 import std.algorithm.searching : canFind; 8212 import std.array : array; 8213 import std.exception : assertCTFEable; 8214 assertCTFEable!((){ 8215 auto r = simpleCaseFoldings('��').array; 8216 assert(r.length == 2); 8217 assert(r.canFind('��') && r.canFind('��')); 8218 auto sr = simpleCaseFoldings('~'); 8219 assert(sr.equal("~")); 8220 //A with ring above - casefolds to the same bucket as Angstrom sign 8221 sr = simpleCaseFoldings('��'); 8222 assert(sr.length == 3); 8223 assert(sr.canFind('��') && sr.canFind('��') && sr.canFind('\u212B')); 8224 }); 8225} 8226 8227/++ 8228 $(P Returns the $(S_LINK Combining class, combining class) of `ch`.) 
+/
ubyte combiningClass(dchar ch) @safe pure nothrow @nogc
{
    return combiningClassTrie[ch];
}

///
@safe unittest
{
    // shorten the code
    alias CC = combiningClass;

    // combining tilde
    assert(CC('\u0303') == 230);
    // combining ring below
    assert(CC('\u0325') == 220);
    // the simple consequence is that "tilde" should be
    // placed after a "ring below" in a sequence
}

@safe pure nothrow @nogc unittest
{
    foreach (ch; 0 .. 0x80)
        assert(combiningClass(ch) == 0);
    assert(combiningClass('\u05BD') == 22);
    assert(combiningClass('\u0300') == 230);
    assert(combiningClass('\u0317') == 220);
    assert(combiningClass('\u1939') == 222);
}

/// Unicode character decomposition type.
enum UnicodeDecomposition {
    /// Canonical decomposition. The result is canonically equivalent sequence.
    Canonical,
    /**
         Compatibility decomposition. The result is compatibility equivalent sequence.
         Note: Compatibility decomposition is a $(B lossy) conversion,
         typically suitable only for fuzzy matching and internal processing.
    */
    Compatibility
}

/**
    Shorthand aliases for character decomposition type, passed as a
    template parameter to $(LREF decompose).
*/
enum {
    Canonical = UnicodeDecomposition.Canonical,
    Compatibility = UnicodeDecomposition.Compatibility
}

/++
    Try to canonically compose 2 $(CHARACTERS).
    Returns the composed $(CHARACTER) if they do compose and dchar.init otherwise.

    The assumption is that `first` comes before `second` in the original text,
    usually meaning that the first is a starter.

    Note: Hangul syllables are not covered by this function.
    See `composeJamo` below.
+/
public dchar compose(dchar first, dchar second) pure nothrow @safe
{
    import std.algorithm.iteration : map;
    import std.internal.unicode_comp : compositionTable, composeCntShift, composeIdxMask;
    import std.range : assumeSorted;

    immutable packed = compositionJumpTrie[first];
    // ushort.max: the trie has no composition bucket for `first`
    if (packed == ushort.max)
        return dchar.init;

    // the packed value carries the bucket offset (low bits) and its length (high bits)
    immutable offset = packed & composeIdxMask;
    immutable count = packed >> composeCntShift;

    // TODO: optimize this micro binary search (no more than 4-5 steps)
    // the bucket is searched by the `rhs` field of its entries
    auto keys = compositionTable[offset .. offset + count]
        .map!(entry => entry.rhs)
        .assumeSorted();
    // number of keys strictly below `second` == position of the candidate
    immutable pos = keys.lowerBound(second).length;
    if (pos == count)
        return dchar.init;

    immutable candidate = compositionTable[offset + pos];
    if (candidate.rhs == second)
        return candidate.composed;
    return dchar.init;
}

///
@safe unittest
{
    assert(compose('A','\u0308') == '\u00C4');
    assert(compose('A', 'B') == dchar.init);
    assert(compose('C', '\u0301') == '\u0106');
    // note that the starter is the first one
    // thus the following doesn't compose
    assert(compose('\u0308', 'A') == dchar.init);
}

/++
    Returns a full $(S_LINK Canonical decomposition, Canonical)
    (by default) or $(S_LINK Compatibility decomposition, Compatibility)
    decomposition of $(CHARACTER) `ch`.
    If no decomposition is available returns a $(LREF Grapheme)
    with the `ch` itself.

    Note:
    This function also decomposes hangul syllables
    as prescribed by the standard.

    See_Also: $(LREF decomposeHangul) for a restricted version
    that takes into account only hangul syllables but
    no other decompositions.
8336+/ 8337public Grapheme decompose(UnicodeDecomposition decompType=Canonical)(dchar ch) @safe 8338{ 8339 import std.algorithm.searching : until; 8340 import std.internal.unicode_decomp : decompCompatTable, decompCanonTable; 8341 static if (decompType == Canonical) 8342 { 8343 alias table = decompCanonTable; 8344 alias mapping = canonMappingTrie; 8345 } 8346 else static if (decompType == Compatibility) 8347 { 8348 alias table = decompCompatTable; 8349 alias mapping = compatMappingTrie; 8350 } 8351 immutable idx = mapping[ch]; 8352 if (!idx) // not found, check hangul arithmetic decomposition 8353 return decomposeHangul(ch); 8354 auto decomp = table[idx..$].until(0); 8355 return Grapheme(decomp); 8356} 8357 8358/// 8359@safe unittest 8360{ 8361 import std.algorithm.comparison : equal; 8362 8363 assert(compose('A','\u0308') == '\u00C4'); 8364 assert(compose('A', 'B') == dchar.init); 8365 assert(compose('C', '\u0301') == '\u0106'); 8366 // note that the starter is the first one 8367 // thus the following doesn't compose 8368 assert(compose('\u0308', 'A') == dchar.init); 8369 8370 assert(decompose('��')[].equal("C\u0302")); 8371 assert(decompose('D')[].equal("D")); 8372 assert(decompose('\uD4DC')[].equal("\u1111\u1171\u11B7")); 8373 assert(decompose!Compatibility('��')[].equal("1")); 8374} 8375 8376//---------------------------------------------------------------------------- 8377// Hangul specific composition/decomposition 8378enum jamoSBase = 0xAC00; 8379enum jamoLBase = 0x1100; 8380enum jamoVBase = 0x1161; 8381enum jamoTBase = 0x11A7; 8382enum jamoLCount = 19, jamoVCount = 21, jamoTCount = 28; 8383enum jamoNCount = jamoVCount * jamoTCount; 8384enum jamoSCount = jamoLCount * jamoNCount; 8385 8386// Tests if `ch` is a Hangul leading consonant jamo. 
bool isJamoL(dchar ch) pure nothrow @nogc @safe
{
    // first cmp rejects ~ 1M code points above leading jamo range
    return ch < jamoLBase+jamoLCount && ch >= jamoLBase;
}

// Tests if `ch` is a Hangul trailing consonant jamo.
bool isJamoT(dchar ch) pure nothrow @nogc @safe
{
    // first cmp rejects ~ 1M code points above trailing jamo range
    // Note: ch == jamoTBase doesn't indicate trailing jamo (TIndex must be > 0)
    return ch < jamoTBase+jamoTCount && ch > jamoTBase;
}

// Tests if `ch` is a Hangul vowel jamo.
bool isJamoV(dchar ch) pure nothrow @nogc @safe
{
    // first cmp rejects ~ 1M code points above vowel range
    return ch < jamoVBase+jamoVCount && ch >= jamoVBase;
}

// Returns the offset of `ch` from jamoSBase when it falls within the
// jamoSCount code points starting there, and -1 otherwise.
int hangulSyllableIndex(dchar ch) pure nothrow @nogc @safe
{
    int idxS = cast(int) ch - jamoSBase;
    return idxS >= 0 && idxS < jamoSCount ? idxS : -1;
}

// internal helper: compose hangul syllables leaving dchar.init in holes
//
// Scans `seq` left to right. Every <L, V> pair, optionally followed by a T,
// is replaced in place: the composed syllable goes into the slot of L and
// the slots of the consumed V (and T) are overwritten with dchar.init.
void hangulRecompose(scope dchar[] seq) pure nothrow @nogc @safe
{
    // the loop needs at least two remaining elements to find an <L, V> pair
    for (size_t idx = 0; idx + 1 < seq.length; )
    {
        if (isJamoL(seq[idx]) && isJamoV(seq[idx+1]))
        {
            immutable int indexL = seq[idx] - jamoLBase;
            immutable int indexV = seq[idx+1] - jamoVBase;
            immutable int indexLV = indexL * jamoNCount + indexV * jamoTCount;
            if (idx + 2 < seq.length && isJamoT(seq[idx+2]))
            {
                // <L, V, T>: three inputs collapse into one syllable
                seq[idx] = jamoSBase + indexLV + seq[idx+2] - jamoTBase;
                seq[idx+1] = dchar.init;
                seq[idx+2] = dchar.init;
                idx += 3;
            }
            else
            {
                // <L, V>: two inputs collapse into one syllable
                seq[idx] = jamoSBase + indexLV;
                seq[idx+1] = dchar.init;
                idx += 2;
            }
        }
        else
            idx++;
    }
}

//----------------------------------------------------------------------------
public:

/**
    Decomposes a Hangul syllable. If `ch` is not a composed syllable
    then this function returns $(LREF Grapheme) containing only `ch` as is.
*/
Grapheme decomposeHangul(dchar ch) @safe
{
    immutable idxS = cast(int) ch - jamoSBase;
    // outside of the precomposed syllable range: return `ch` unchanged
    if (idxS < 0 || idxS >= jamoSCount) return Grapheme(ch);
    immutable idxL = idxS / jamoNCount;
    immutable idxV = (idxS % jamoNCount) / jamoTCount;
    immutable idxT = idxS % jamoTCount;

    immutable partL = jamoLBase + idxL;
    immutable partV = jamoVBase + idxV;
    if (idxT > 0) // there is a trailing consonant (T); <L,V,T> decomposition
        return Grapheme(partL, partV, jamoTBase + idxT);
    else // <L, V> decomposition
        return Grapheme(partL, partV);
}

///
@safe unittest
{
    import std.algorithm.comparison : equal;
    assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6"));
}

/++
    Try to compose hangul syllable out of a leading consonant (`lead`),
    a `vowel` and optional `trailing` consonant jamos.

    On success returns the composed LV or LVT hangul syllable.

    If any of `lead` and `vowel` are not a valid hangul jamo
    of the respective $(CHARACTER) class returns dchar.init.
+/
dchar composeJamo(dchar lead, dchar vowel, dchar trailing=dchar.init) pure nothrow @nogc @safe
{
    if (!isJamoL(lead))
        return dchar.init;
    immutable indexL = lead - jamoLBase;
    if (!isJamoV(vowel))
        return dchar.init;
    immutable indexV = vowel - jamoVBase;
    immutable indexLV = indexL * jamoNCount + indexV * jamoTCount;
    immutable dchar syllable = jamoSBase + indexLV;
    // a `trailing` that is not a T jamo is ignored and the LV syllable is returned
    return isJamoT(trailing) ? syllable + (trailing - jamoTBase) : syllable;
}

///
@safe unittest
{
    assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
    // leaving out the trailing consonant (T), or passing any codepoint
    // that is not trailing consonant composes an LV-syllable
    assert(composeJamo('\u1111', '\u1171') == '\uD4CC');
    assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
    assert(composeJamo('\u1111', 'A') == dchar.init);
    assert(composeJamo('A', '\u1171') == dchar.init);
}

@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.conv : text;

    static void testDecomp(UnicodeDecomposition T)(dchar ch, string r)
    {
        Grapheme g = decompose!T(ch);
        assert(equal(g[], r), text(g[], " vs ", r));
    }
    testDecomp!Canonical('\u1FF4', "\u03C9\u0301\u0345");
    testDecomp!Canonical('\uF907', "\u9F9C");
    testDecomp!Compatibility('\u33FF', "\u0067\u0061\u006C");
    testDecomp!Compatibility('\uA7F9', "\u0153");

    // check examples
    assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6"));
    assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
    assert(composeJamo('\u1111', '\u1171') == '\uD4CC'); // leave out the trailing consonant (T)
    assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
    assert(composeJamo('\u1111', 'A') == dchar.init);
    assert(composeJamo('A', '\u1171') == dchar.init);
}

/**
    Enumeration type for normalization forms,
    passed as template parameter for functions like $(LREF normalize).
*/
enum NormalizationForm {
    NFC,
    NFD,
    NFKC,
    NFKD
}


enum {
    /**
        Shorthand aliases for values indicating normalization forms.
8546 */ 8547 NFC = NormalizationForm.NFC, 8548 ///ditto 8549 NFD = NormalizationForm.NFD, 8550 ///ditto 8551 NFKC = NormalizationForm.NFKC, 8552 ///ditto 8553 NFKD = NormalizationForm.NFKD 8554} 8555 8556/++ 8557 Returns `input` string normalized to the chosen form. 8558 Form C is used by default. 8559 8560 For more information on normalization forms see 8561 the $(S_LINK Normalization, normalization section). 8562 8563 Note: 8564 In cases where the string in question is already normalized, 8565 it is returned unmodified and no memory allocation happens. 8566+/ 8567inout(C)[] normalize(NormalizationForm norm=NFC, C)(return scope inout(C)[] input) 8568{ 8569 import std.algorithm.mutation : SwapStrategy; 8570 import std.algorithm.sorting : sort; 8571 import std.array : appender; 8572 import std.range : zip; 8573 8574 auto anchors = splitNormalized!norm(input); 8575 if (anchors[0] == input.length && anchors[1] == input.length) 8576 return input; 8577 dchar[] decomposed; 8578 decomposed.reserve(31); 8579 ubyte[] ccc; 8580 ccc.reserve(31); 8581 auto app = appender!(C[])(); 8582 do 8583 { 8584 app.put(input[0 .. anchors[0]]); 8585 foreach (dchar ch; input[anchors[0]..anchors[1]]) 8586 static if (norm == NFD || norm == NFC) 8587 { 8588 foreach (dchar c; decompose!Canonical(ch)[]) 8589 decomposed ~= c; 8590 } 8591 else // NFKD & NFKC 8592 { 8593 foreach (dchar c; decompose!Compatibility(ch)[]) 8594 decomposed ~= c; 8595 } 8596 ccc.length = decomposed.length; 8597 size_t firstNonStable = 0; 8598 ubyte lastClazz = 0; 8599 8600 foreach (idx, dchar ch; decomposed) 8601 { 8602 immutable clazz = combiningClass(ch); 8603 ccc[idx] = clazz; 8604 if (clazz == 0 && lastClazz != 0) 8605 { 8606 // found a stable code point after unstable ones 8607 sort!("a[0] < b[0]", SwapStrategy.stable) 8608 (zip(ccc[firstNonStable .. idx], decomposed[firstNonStable .. 
idx])); 8609 firstNonStable = decomposed.length; 8610 } 8611 else if (clazz != 0 && lastClazz == 0) 8612 { 8613 // found first unstable code point after stable ones 8614 firstNonStable = idx; 8615 } 8616 lastClazz = clazz; 8617 } 8618 sort!("a[0] < b[0]", SwapStrategy.stable) 8619 (zip(ccc[firstNonStable..$], decomposed[firstNonStable..$])); 8620 static if (norm == NFC || norm == NFKC) 8621 { 8622 import std.algorithm.searching : countUntil; 8623 auto first = countUntil(ccc, 0); 8624 if (first >= 0) // no starters?? no recomposition 8625 { 8626 for (;;) 8627 { 8628 immutable second = recompose(first, decomposed, ccc); 8629 if (second == decomposed.length) 8630 break; 8631 first = second; 8632 } 8633 // 2nd pass for hangul syllables 8634 hangulRecompose(decomposed); 8635 } 8636 } 8637 static if (norm == NFD || norm == NFKD) 8638 app.put(decomposed); 8639 else 8640 { 8641 import std.algorithm.mutation : remove; 8642 auto clean = remove!("a == dchar.init", SwapStrategy.stable)(decomposed); 8643 app.put(decomposed[0 .. clean.length]); 8644 } 8645 // reset variables 8646 decomposed.length = 0; 8647 () @trusted { 8648 decomposed.assumeSafeAppend(); 8649 ccc.length = 0; 8650 ccc.assumeSafeAppend(); 8651 } (); 8652 input = input[anchors[1]..$]; 8653 // and move on 8654 anchors = splitNormalized!norm(input); 8655 }while (anchors[0] != input.length); 8656 app.put(input[0 .. 
anchors[0]]); 8657 return () @trusted inout { return cast(inout(C)[]) app.data; } (); 8658} 8659 8660/// 8661@safe unittest 8662{ 8663 // any encoding works 8664 wstring greet = "Hello world"; 8665 assert(normalize(greet) is greet); // the same exact slice 8666 8667 // An example of a character with all 4 forms being different: 8668 // Greek upsilon with acute and hook symbol (code point 0x03D3) 8669 assert(normalize!NFC("��") == "\u03D3"); 8670 assert(normalize!NFD("��") == "\u03D2\u0301"); 8671 assert(normalize!NFKC("��") == "\u038E"); 8672 assert(normalize!NFKD("��") == "\u03A5\u0301"); 8673} 8674 8675@safe unittest 8676{ 8677 import std.conv : text; 8678 8679 assert(normalize!NFD("abc\uF904def") == "abc\u6ED1def", text(normalize!NFD("abc\uF904def"))); 8680 assert(normalize!NFKD("2�����") == "210", normalize!NFKD("2�����")); 8681 assert(normalize!NFD("��ffin") == "A\u0308ffin"); 8682 8683 // check example 8684 8685 // any encoding works 8686 wstring greet = "Hello world"; 8687 assert(normalize(greet) is greet); // the same exact slice 8688 8689 // An example of a character with all 4 forms being different: 8690 // Greek upsilon with acute and hook symbol (code point 0x03D3) 8691 assert(normalize!NFC("��") == "\u03D3"); 8692 assert(normalize!NFD("��") == "\u03D2\u0301"); 8693 assert(normalize!NFKC("��") == "\u038E"); 8694 assert(normalize!NFKD("��") == "\u03A5\u0301"); 8695} 8696 8697// canonically recompose given slice of code points, works in-place and mutates data 8698private size_t recompose(size_t start, scope dchar[] input, scope ubyte[] ccc) pure nothrow @safe 8699{ 8700 assert(input.length == ccc.length); 8701 int accumCC = -1;// so that it's out of 0 .. 
255 range 8702 // writefln("recomposing %( %04x %)", input); 8703 // first one is always a starter thus we start at i == 1 8704 size_t i = start+1; 8705 for (; ; ) 8706 { 8707 if (i == input.length) 8708 break; 8709 immutable curCC = ccc[i]; 8710 // In any character sequence beginning with a starter S 8711 // a character C is blocked from S if and only if there 8712 // is some character B between S and C, and either B 8713 // is a starter or it has the same or higher combining class as C. 8714 //------------------------ 8715 // Applying to our case: 8716 // S is input[0] 8717 // accumCC is the maximum CCC of characters between C and S, 8718 // as ccc are sorted 8719 // C is input[i] 8720 8721 if (curCC > accumCC) 8722 { 8723 immutable comp = compose(input[start], input[i]); 8724 if (comp != dchar.init) 8725 { 8726 input[start] = comp; 8727 input[i] = dchar.init;// put a sentinel 8728 // current was merged so its CCC shouldn't affect 8729 // composing with the next one 8730 } 8731 else 8732 { 8733 // if it was a starter then accumCC is now 0, end of loop 8734 accumCC = curCC; 8735 if (accumCC == 0) 8736 break; 8737 } 8738 } 8739 else 8740 { 8741 // ditto here 8742 accumCC = curCC; 8743 if (accumCC == 0) 8744 break; 8745 } 8746 i++; 8747 } 8748 return i; 8749} 8750 8751// returns tuple of 2 indexes that delimit: 8752// normalized text, piece that needs normalization and 8753// the rest of input starting with stable code point 8754private auto splitNormalized(NormalizationForm norm, C)(scope const(C)[] input) 8755{ 8756 import std.typecons : tuple; 8757 ubyte lastCC = 0; 8758 8759 foreach (idx, dchar ch; input) 8760 { 8761 static if (norm == NFC) 8762 if (ch < 0x0300) 8763 { 8764 lastCC = 0; 8765 continue; 8766 } 8767 immutable ubyte CC = combiningClass(ch); 8768 if (lastCC > CC && CC != 0) 8769 { 8770 return seekStable!norm(idx, input); 8771 } 8772 8773 if (notAllowedIn!norm(ch)) 8774 { 8775 return seekStable!norm(idx, input); 8776 } 8777 lastCC = CC; 8778 } 8779 
return tuple(input.length, input.length);
}

// Given the index `idx` of an offending code point, returns the tuple
// (region_start, region_end) of the slice of `input` to normalize:
// it starts at the nearest preceding stable code point allowed in `norm`
// (or at 0) and ends before the next such code point at or after `idx`
// (or at input.length).
private auto seekStable(NormalizationForm norm, C)(size_t idx, const scope C[] input)
{
    import std.range.primitives : popBack;
    import std.typecons : tuple;
    import std.utf : codeLength;

    auto br = input[0 .. idx];
    size_t region_start = 0;// default
    for (;;)
    {
        if (br.empty)// start is 0
            break;
        dchar ch = br.back;
        if (combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            region_start = br.length - codeLength!C(ch);
            break;
        }
        // Walk backwards. The previous `popFront` left `br.back` unchanged,
        // so any miss on the first probe drained `br` and always gave 0.
        br.popBack();
    }
    ///@@@BUG@@@ can't use find: " find is a nested function and can't be used..."
    size_t region_end=input.length;// end is $ by default
    foreach (i, dchar ch; input[idx..$])
    {
        if (combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            region_end = i+idx;
            break;
        }
    }
    // writeln("Region to normalize: ", input[region_start .. region_end]);
    return tuple(region_start, region_end);
}

/**
    Tests if dchar `ch` is always allowed (Quick_Check=YES) in normalization
    form `norm`.
*/
public bool allowedIn(NormalizationForm norm)(dchar ch)
{
    return !notAllowedIn!norm(ch);
}

///
@safe unittest
{
    // e.g. Cyrillic is always allowed, so is ASCII
    // (U+044F CYRILLIC SMALL LETTER YA; the literal glyph was lost to mis-encoding)
    assert(allowedIn!NFC('\u044F'));
    assert(allowedIn!NFD('\u044F'));
    assert(allowedIn!NFKC('\u044F'));
    assert(allowedIn!NFKD('\u044F'));
    assert(allowedIn!NFC('Z'));
}

// not user friendly name but more direct
private bool notAllowedIn(NormalizationForm norm)(dchar ch)
{
    static if (norm == NFC)
        alias qcTrie = nfcQCTrie;
    else static if (norm == NFD)
        alias qcTrie = nfdQCTrie;
    else static if (norm == NFKC)
        alias qcTrie = nfkcQCTrie;
    else static if (norm == NFKD)
        alias qcTrie = nfkdQCTrie;
    else
        // `static assert("...")` tests a non-empty string, which is always
        // true, so it could never fire; assert on 0 with a message instead
        static assert(0, "Unknown normalization form " ~ norm.stringof);
    return qcTrie[ch];
}

@safe unittest
{
    assert(allowedIn!NFC('\u044F'));
    assert(allowedIn!NFD('\u044F'));
    assert(allowedIn!NFKC('\u044F'));
    assert(allowedIn!NFKD('\u044F'));
    assert(allowedIn!NFC('Z'));
}

}

version (std_uni_bootstrap)
{
    // old version used for bootstrapping of gen_uni.d that generates
    // up to date optimal versions of all of isXXX functions
    @safe pure nothrow @nogc public bool isWhite(dchar c)
    {
        import std.ascii : isWhite;
        return isWhite(c) ||
               c == lineSep || c == paraSep ||
               c == '\u0085' || c == '\u00A0' || c == '\u1680' || c == '\u180E' ||
               (c >= '\u2000' && c <= '\u200A') ||
               c == '\u202F' || c == '\u205F' || c == '\u3000';
    }
}
else
{

// trusted -> avoid bounds check
@trusted pure nothrow @nogc private
{
    import std.internal.unicode_tables; // : toLowerTable, toTitleTable, toUpperTable; // generated file

    // hide template instances behind functions
    // https://issues.dlang.org/show_bug.cgi?id=13232
    ushort toLowerIndex(dchar c) { return toLowerIndexTrie[c]; }
    ushort toLowerSimpleIndex(dchar c) { return toLowerSimpleIndexTrie[c]; }
    dchar toLowerTab(size_t idx) { return toLowerTable[idx]; }

    ushort toTitleIndex(dchar c) {
return toTitleIndexTrie[c]; }
    ushort toTitleSimpleIndex(dchar c) { return toTitleSimpleIndexTrie[c]; }
    dchar toTitleTab(size_t idx) { return toTitleTable[idx]; }

    ushort toUpperIndex(dchar c) { return toUpperIndexTrie[c]; }
    ushort toUpperSimpleIndex(dchar c) { return toUpperSimpleIndexTrie[c]; }
    dchar toUpperTab(size_t idx) { return toUpperTable[idx]; }
}

public:

/++
    Whether or not `c` is a Unicode whitespace $(CHARACTER).
    (general Unicode category: Part of C0(tab, vertical tab, form feed,
    carriage return, and linefeed characters), Zs, Zl, Zp, and NEL(U+0085))
+/
@safe pure nothrow @nogc
public bool isWhite(dchar c)
{
    import std.internal.unicode_tables : isWhiteGen; // generated file
    return isWhiteGen(c); // call pregenerated binary search
}

/++
    Return whether `c` is a Unicode lowercase $(CHARACTER).
+/
@safe pure nothrow @nogc
bool isLower(dchar c)
{
    import std.ascii : isLower, isASCII;
    // ASCII is answered by std.ascii without a trie lookup
    if (isASCII(c))
        return isLower(c);
    return lowerCaseTrie[c];
}

@safe unittest
{
    import std.ascii : isLower;
    // `isLower` is the std.ascii one here, `.isLower` the one from this module
    foreach (v; 0 .. 0x80)
        assert(isLower(v) == .isLower(v));
    // Cyrillic samples; the literal glyphs were lost to mis-encoding, these
    // escapes are two lowercase letters (ya, short i) and one uppercase (zhe)
    assert(.isLower('\u044F'));
    assert(.isLower('\u0439'));
    assert(!.isLower('\u0416'));
    // Greek HETA
    assert(!.isLower('\u0370'));
    assert(.isLower('\u0371'));
    assert(!.isLower('\u039C')); // capital MU
    assert(.isLower('\u03B2')); // beta
    // from extended Greek
    assert(!.isLower('\u1F18'));
    assert(.isLower('\u1F00'));
    foreach (v; unicode.lowerCase.byCodepoint)
        assert(.isLower(v) && !isUpper(v));
}


/++
    Return whether `c` is a Unicode uppercase $(CHARACTER).
+/
@safe pure nothrow @nogc
bool isUpper(dchar c)
{
    import std.ascii : isUpper, isASCII;
    // ASCII is answered by std.ascii without a trie lookup
    if (isASCII(c))
        return isUpper(c);
    return upperCaseTrie[c];
}

@safe unittest
{
    static import std.ascii;
    // The ASCII range must agree with std.ascii.isUpper. This loop used to
    // re-check isLower, a copy-paste from the isLower test.
    foreach (v; 0 .. 0x80)
        assert(std.ascii.isUpper(v) == isUpper(v));
    // Cyrillic samples; the literal glyphs were lost to mis-encoding, these
    // escapes are a lowercase letter (short i) and an uppercase one (zhe)
    assert(!isUpper('\u0439'));
    assert(isUpper('\u0416'));
    // Greek HETA
    assert(isUpper('\u0370'));
    assert(!isUpper('\u0371'));
    assert(isUpper('\u039C')); // capital MU
    assert(!isUpper('\u03B2')); // beta
    // from extended Greek
    assert(!isUpper('\u1F00'));
    assert(isUpper('\u1F18'));
    foreach (v; unicode.upperCase.byCodepoint)
        assert(isUpper(v) && !.isLower(v));
}


//TODO: Hidden for now, needs better API.
//Other transforms could use better API as well, but this one is a new primitive.
@safe pure nothrow @nogc
private dchar toTitlecase(dchar c)
{
    // optimize ASCII case
    if (c < 0xAA)
    {
        if (c < 'a')
            return c;
        if (c <= 'z')
            return c - 32;
        return c;
    }
    size_t idx = toTitleSimpleIndex(c);
    if (idx != ushort.max)
    {
        return toTitleTab(idx);
    }
    return c;
}

private alias UpperTriple = AliasSeq!(toUpperIndex, MAX_SIMPLE_UPPER, toUpperTab);
private alias LowerTriple = AliasSeq!(toLowerIndex, MAX_SIMPLE_LOWER, toLowerTab);

// generic toUpper/toLower on whole string, creates new or returns as is
private ElementEncodingType!S[] toCase(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, S)(S s)
if (isSomeString!S || (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    import std.array : appender, array;
    import std.ascii : isASCII;
    import std.utf : byDchar, codeLength;

    alias C = ElementEncodingType!S;

    // scan for the first code point that has a case mapping; `i` tracks its
    // offset in code units so that the untouched prefix can be copied as is
    auto r = s.byDchar;
    for (size_t i; !r.empty; i += r.front.codeLength!C , r.popFront())
    {
        auto cOuter = r.front;
        ushort idx = indexFn(cOuter);
        if (idx == ushort.max)
            continue;
        auto result = appender!(C[])();
        result.reserve(s.length);
        result.put(s[0 .. i]);
        foreach (dchar c; s[i .. $].byDchar)
        {
            if (c.isASCII)
            {
                result.put(asciiConvert(c));
            }
            else
            {
                idx = indexFn(c);
                if (idx == ushort.max)
                    result.put(c);
                else if (idx < maxIdx)
                {
                    c = tableFn(idx);
                    result.put(c);
                }
                else
                {
                    auto val = tableFn(idx);
                    // unpack length + codepoint
                    immutable uint len = val >> 24;
                    result.put(cast(dchar)(val & 0xFF_FFFF));
                    foreach (j; idx+1 .. idx+len)
                        result.put(tableFn(j));
                }
            }
        }
        return result.data;
    }

    // nothing to convert
    static if (isSomeString!S)
        return s;
    else
        return s.array;
}

// https://issues.dlang.org/show_bug.cgi?id=12428
@safe unittest
{
    import std.array : replicate;
    auto s = "abcdefghij".replicate(300);
    s = s[0 ..
10];

    toUpper(s);

    assert(s == "abcdefghij");
}

// https://issues.dlang.org/show_bug.cgi?id=18993
@safe unittest
{
    // Three 3-byte code points followed by "/A"; the literal glyphs were lost
    // to mis-encoding, Hangul syllables are used as stand-ins.
    static assert("\uBAAC\uC2A4\uD130/A".toLower.length == "\uBAAC\uC2A4\uD130/a".toLower.length);
}


// generic toUpper/toLower on whole range, returns range
private auto toCaser(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, Range)(Range str)
    // Accept range of dchar's
if (isInputRange!Range &&
    isSomeChar!(ElementEncodingType!Range) &&
    ElementEncodingType!Range.sizeof == dchar.sizeof)
{
    static struct ToCaserImpl
    {
        @property bool empty()
        {
            return !nLeft && r.empty;
        }

        @property auto front()
        {
            import std.ascii : isASCII;

            // `buf` holds the mapping of r.front in reverse order and
            // `nLeft` counts the code points of it not yet consumed
            if (!nLeft)
            {
                dchar c = r.front;
                if (c.isASCII)
                {
                    buf[0] = asciiConvert(c);
                    nLeft = 1;
                }
                else
                {
                    const idx = indexFn(c);
                    if (idx == ushort.max)
                    {
                        buf[0] = c;
                        nLeft = 1;
                    }
                    else if (idx < maxIdx)
                    {
                        buf[0] = tableFn(idx);
                        nLeft = 1;
                    }
                    else
                    {
                        immutable val = tableFn(idx);
                        // unpack length + codepoint
                        nLeft = val >> 24;
                        if (nLeft == 0)
                            nLeft = 1;
                        assert(nLeft <= buf.length);
                        buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF);
                        foreach (j; 1 .. nLeft)
                            buf[nLeft - j - 1] = tableFn(idx + j);
                    }
                }
            }
            return buf[nLeft - 1];
        }

        void popFront()
        {
            // make sure `buf`/`nLeft` are populated even if front was never called
            if (!nLeft)
                front;
            assert(nLeft);
            --nLeft;
            if (!nLeft)
                r.popFront();
        }

        static if (isForwardRange!Range)
        {
            @property auto save()
            {
                auto ret = this;
                ret.r = r.save;
                return ret;
            }
        }

    private:
        Range r;
        uint nLeft;
        dchar[3] buf = void;
    }

    return ToCaserImpl(str);
}

/*********************
 * Convert an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 * or a string to upper or lower case.
 *
 * Does not allocate memory.
 * Characters in UTF-8 or UTF-16 format that cannot be decoded
 * are treated as $(REF replacementDchar, std,utf).
 *
 * Params:
 *      str = string or range of characters
 *
 * Returns:
 *      an input range of `dchar`s
 *
 * See_Also:
 *      $(LREF toUpper), $(LREF toLower)
 */

auto asLowerCase(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof < dchar.sizeof)
    {
        import std.utf : byDchar;

        // Decode first
        return asLowerCase(str.byDchar);
    }
    else
    {
        static import std.ascii;
        return toCaser!(LowerTriple, std.ascii.toLower)(str);
    }
}

/// ditto
auto asUpperCase(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof < dchar.sizeof)
    {
        import std.utf : byDchar;

        // Decode first
        return asUpperCase(str.byDchar);
    }
    else
    {
        static import std.ascii;
        return toCaser!(UpperTriple, std.ascii.toUpper)(str);
    }
}

///
@safe pure unittest
{
import std.algorithm.comparison : equal;

    assert("hEllo".asUpperCase.equal("HELLO"));
}

// explicitly undocumented
// Overload for types convertible to a string: forwards to the range
// overload instantiated with the underlying string type.
auto asLowerCase(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;
    return asLowerCase!(StringTypeOf!Range)(str);
}

// explicitly undocumented
// Overload for types convertible to a string: forwards to the range
// overload instantiated with the underlying string type.
auto asUpperCase(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;
    return asUpperCase!(StringTypeOf!Range)(str);
}

@safe unittest
{
    // non-copyable wrapper that converts to string through `alias this`
    static struct TestAliasedString
    {
        string get() @safe @nogc pure nothrow { return _s; }
        alias get this;
        @disable this(this);
        string _s;
    }

    // calls `func` with the wrapper and with the plain string and
    // reports whether both calls produce the same result
    static bool testAliasedString(alias func, Args...)(string s, Args args)
    {
        import std.algorithm.comparison : equal;
        auto a = func(TestAliasedString(s), args);
        auto b = func(s, args);
        static if (is(typeof(equal(a, b))))
        {
            // For ranges, compare contents instead of object identity.
return equal(a, b);
        }
        else
        {
            return a == b;
        }
    }
    assert(testAliasedString!asLowerCase("hEllo"));
    assert(testAliasedString!asUpperCase("hEllo"));
    assert(testAliasedString!asCapitalized("hEllo"));
}

@safe unittest
{
    import std.array : array;

    auto a = "HELLo".asLowerCase;
    auto savea = a.save;
    auto s = a.array;
    assert(s == "hello");
    s = savea.array;
    assert(s == "hello");

    // The Cyrillic tails of the second entries were lost to mis-encoding;
    // three lowercase / uppercase letters are used as stand-ins.
    string[] lower = ["123", "abc\u0444\u0435\u0436", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
    string[] upper = ["123", "ABC\u0424\u0415\u0416", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

    foreach (i, slwr; lower)
    {
        import std.utf : byChar;

        auto sx = slwr.asUpperCase.byChar.array;
        assert(sx == toUpper(slwr));
        auto sy = upper[i].asLowerCase.byChar.array;
        assert(sy == toLower(upper[i]));
    }

    // Not necessary to call r.front
    for (auto r = lower[3].asUpperCase; !r.empty; r.popFront())
    {
    }

    import std.algorithm.comparison : equal;

    // these comparisons used to be evaluated and silently discarded
    assert("HELLo"w.asLowerCase.equal("hello"d));
    assert("HELLo"w.asUpperCase.equal("HELLO"d));
    assert("HELLo"d.asLowerCase.equal("hello"d));
    assert("HELLo"d.asUpperCase.equal("HELLO"d));

    import std.utf : byChar;
    assert(toLower("\u1Fe2") == asLowerCase("\u1Fe2").byChar.array);
}

// generic capitalizer on whole range, returns range
private auto toCapitalizer(alias indexFnUpper, uint maxIdxUpper, alias tableFnUpper,
                           Range)(Range str)
    // Accept range of dchar's
if (isInputRange!Range &&
    isSomeChar!(ElementEncodingType!Range) &&
    ElementEncodingType!Range.sizeof == dchar.sizeof)
{
    static struct ToCapitalizerImpl
    {
        @property bool empty()
        {
            return lower ? lwr.empty : !nLeft && r.empty;
        }

        @property auto front()
        {
            if (lower)
                return lwr.front;

            // first character: `buf` holds its upper case mapping in reverse
            // order and `nLeft` counts the code points not yet consumed
            if (!nLeft)
            {
                immutable dchar c = r.front;
                const idx = indexFnUpper(c);
                if (idx == ushort.max)
                {
                    buf[0] = c;
                    nLeft = 1;
                }
                else if (idx < maxIdxUpper)
                {
                    buf[0] = tableFnUpper(idx);
                    nLeft = 1;
                }
                else
                {
                    immutable val = tableFnUpper(idx);
                    // unpack length + codepoint
                    nLeft = val >> 24;
                    if (nLeft == 0)
                        nLeft = 1;
                    assert(nLeft <= buf.length);
                    buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF);
                    foreach (j; 1 .. nLeft)
                        buf[nLeft - j - 1] = tableFnUpper(idx + j);
                }
            }
            return buf[nLeft - 1];
        }

        void popFront()
        {
            if (lower)
                lwr.popFront();
            else
            {
                if (!nLeft)
                    front;
                assert(nLeft);
                --nLeft;
                if (!nLeft)
                {
                    // first character fully consumed: hand the rest of the
                    // input over to the lower casing range
                    r.popFront();
                    lwr = r.asLowerCase();
                    lower = true;
                }
            }
        }

        static if (isForwardRange!Range)
        {
            @property auto save()
            {
                auto ret = this;
                ret.r = r.save;
                ret.lwr = lwr.save;
                return ret;
            }
        }

    private:
        Range r;
        typeof(r.asLowerCase) lwr; // range representing the lower case rest of string
        bool lower = false;     // false for first character, true for rest of string
        dchar[3] buf = void;
        uint nLeft = 0;
    }

    return ToCapitalizerImpl(str);
}

/*********************
 * Capitalize an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 * or string, meaning convert the first
 * character to upper case and subsequent characters to lower case.
 *
 * Does not allocate memory.
 * Characters in UTF-8 or UTF-16 format that cannot be decoded
 * are treated as $(REF replacementDchar, std,utf).
9411 * 9412 * Params: 9413 * str = string or range of characters 9414 * 9415 * Returns: 9416 * an InputRange of dchars 9417 * 9418 * See_Also: 9419 * $(LREF toUpper), $(LREF toLower) 9420 * $(LREF asUpperCase), $(LREF asLowerCase) 9421 */ 9422 9423auto asCapitalized(Range)(Range str) 9424if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) && 9425 !isConvertibleToString!Range) 9426{ 9427 static if (ElementEncodingType!Range.sizeof < dchar.sizeof) 9428 { 9429 import std.utf : byDchar; 9430 9431 // Decode first 9432 return toCapitalizer!UpperTriple(str.byDchar); 9433 } 9434 else 9435 { 9436 return toCapitalizer!UpperTriple(str); 9437 } 9438} 9439 9440/// 9441@safe pure unittest 9442{ 9443 import std.algorithm.comparison : equal; 9444 9445 assert("hEllo".asCapitalized.equal("Hello")); 9446} 9447 9448auto asCapitalized(Range)(auto ref Range str) 9449if (isConvertibleToString!Range) 9450{ 9451 import std.traits : StringTypeOf; 9452 return asCapitalized!(StringTypeOf!Range)(str); 9453} 9454 9455@safe pure nothrow @nogc unittest 9456{ 9457 auto r = "hEllo".asCapitalized(); 9458 assert(r.front == 'H'); 9459} 9460 9461@safe unittest 9462{ 9463 import std.array : array; 9464 9465 auto a = "hELLo".asCapitalized; 9466 auto savea = a.save; 9467 auto s = a.array; 9468 assert(s == "Hello"); 9469 s = savea.array; 9470 assert(s == "Hello"); 9471 9472 string[2][] cases = 9473 [ 9474 ["", ""], 9475 ["h", "H"], 9476 ["H", "H"], 9477 ["3", "3"], 9478 ["123", "123"], 9479 ["h123A", "H123a"], 9480 ["������", "������"], 9481 ["\u1Fe2", "\u03a5\u0308\u0300"], 9482 ]; 9483 9484 foreach (i; 0 .. 
cases.length) 9485 { 9486 import std.utf : byChar; 9487 9488 auto r = cases[i][0].asCapitalized.byChar.array; 9489 auto result = cases[i][1]; 9490 assert(r == result); 9491 } 9492 9493 // Don't call r.front 9494 for (auto r = "\u1Fe2".asCapitalized; !r.empty; r.popFront()) 9495 { 9496 } 9497 9498 import std.algorithm.comparison : equal; 9499 9500 "HELLo"w.asCapitalized.equal("Hello"d); 9501 "hElLO"w.asCapitalized.equal("Hello"d); 9502 "hello"d.asCapitalized.equal("Hello"d); 9503 "HELLO"d.asCapitalized.equal("Hello"d); 9504 9505 import std.utf : byChar; 9506 assert(asCapitalized("\u0130").byChar.array == asUpperCase("\u0130").byChar.array); 9507} 9508 9509// TODO: helper, I wish std.utf was more flexible (and stright) 9510private size_t encodeTo(scope char[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc 9511{ 9512 if (c <= 0x7F) 9513 { 9514 buf[idx] = cast(char) c; 9515 idx++; 9516 } 9517 else if (c <= 0x7FF) 9518 { 9519 buf[idx] = cast(char)(0xC0 | (c >> 6)); 9520 buf[idx+1] = cast(char)(0x80 | (c & 0x3F)); 9521 idx += 2; 9522 } 9523 else if (c <= 0xFFFF) 9524 { 9525 buf[idx] = cast(char)(0xE0 | (c >> 12)); 9526 buf[idx+1] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 9527 buf[idx+2] = cast(char)(0x80 | (c & 0x3F)); 9528 idx += 3; 9529 } 9530 else if (c <= 0x10FFFF) 9531 { 9532 buf[idx] = cast(char)(0xF0 | (c >> 18)); 9533 buf[idx+1] = cast(char)(0x80 | ((c >> 12) & 0x3F)); 9534 buf[idx+2] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 9535 buf[idx+3] = cast(char)(0x80 | (c & 0x3F)); 9536 idx += 4; 9537 } 9538 else 9539 assert(0); 9540 return idx; 9541} 9542 9543@safe unittest 9544{ 9545 char[] s = "abcd".dup; 9546 size_t i = 0; 9547 i = encodeTo(s, i, 'X'); 9548 assert(s == "Xbcd"); 9549 9550 i = encodeTo(s, i, cast(dchar)'\u00A9'); 9551 assert(s == "X\xC2\xA9d"); 9552} 9553 9554// TODO: helper, I wish std.utf was more flexible (and stright) 9555private size_t encodeTo(scope wchar[] buf, size_t idx, dchar c) @trusted pure 9556{ 9557 import std.utf : UTFException; 9558 
if (c <= 0xFFFF)
    {
        // a lone surrogate has no UTF-16 encoding
        if (0xD800 <= c && c <= 0xDFFF)
            throw (new UTFException("Encoding an isolated surrogate code point in UTF-16")).setSequence(c);
        buf[idx] = cast(wchar) c;
        idx++;
    }
    else if (c <= 0x10FFFF)
    {
        // surrogate pair
        buf[idx] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
        buf[idx+1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00);
        idx += 2;
    }
    else
        assert(0);
    return idx;
}

// Stores `c` at buf[idx] and returns idx + 1.
private size_t encodeTo(scope dchar[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc
{
    buf[idx] = c;
    idx++;
    return idx;
}

// Case-converts `s` inside its own buffer for as long as the converted text
// fits in the space already consumed; otherwise hands over to
// toCaseInPlaceAlloc, which allocates a new array. `s` is re-sliced (or
// replaced) to the converted text.
private void toCaseInPlace(alias indexFn, uint maxIdx, alias tableFn, C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    import std.utf : decode, codeLength;
    size_t curIdx = 0;      // read position
    size_t destIdx = 0;     // write position, never ahead of curIdx
    alias slowToCase = toCaseInPlaceAlloc!(indexFn, maxIdx, tableFn);
    size_t lastUnchanged = 0;   // start of the pending run of untouched code units
    // in-buffer move of bytes to a new start index
    // the trick is that it may not need to copy at all
    static size_t moveTo(C[] str, size_t dest, size_t from, size_t to)
    {
        // Interestingly we may just bump pointer for a while
        // then have to copy if a re-cased char was smaller than the original
        // later we may regain pace with char that got bigger
        // In the end it sometimes flip-flops between the 2 cases below
        if (dest == from)
            return to;
        // got to copy
        foreach (C c; str[from .. to])
            str[dest++] = c;
        return dest;
    }
    while (curIdx != s.length)
    {
        size_t startIdx = curIdx;
        immutable ch = decode(s, curIdx);
        // TODO: special case for ASCII
        immutable caseIndex = indexFn(ch);
        if (caseIndex == ushort.max) // unchanged, skip over
        {
            continue;
        }
        else if (caseIndex < maxIdx)  // 1:1 codepoint mapping
        {
            // previous cased chars had the same length as uncased ones
            // thus can just adjust pointer
            destIdx = moveTo(s, destIdx, lastUnchanged, startIdx);
            lastUnchanged = curIdx;
            immutable cased = tableFn(caseIndex);
            immutable casedLen = codeLength!C(cased);
            if (casedLen + destIdx > curIdx) // no place to fit cased char
            {
                // switch to slow codepath, where we allocate
                return slowToCase(s, startIdx, destIdx);
            }
            else
            {
                destIdx = encodeTo(s, destIdx, cased);
            }
        }
        else  // 1:m codepoint mapping, slow codepath
        {
            destIdx = moveTo(s, destIdx, lastUnchanged, startIdx);
            lastUnchanged = curIdx;
            return slowToCase(s, startIdx, destIdx);
        }
        assert(destIdx <= curIdx);
    }
    // flush the trailing run of untouched code units
    if (lastUnchanged != s.length)
    {
        destIdx = moveTo(s, destIdx, lastUnchanged, s.length);
    }
    s = s[0 ..
destIdx]; 9647} 9648 9649// helper to precalculate size of case-converted string 9650private template toCaseLength(alias indexFn, uint maxIdx, alias tableFn) 9651{ 9652 size_t toCaseLength(C)(const scope C[] str) 9653 { 9654 import std.utf : decode, codeLength; 9655 size_t codeLen = 0; 9656 size_t lastNonTrivial = 0; 9657 size_t curIdx = 0; 9658 while (curIdx != str.length) 9659 { 9660 immutable startIdx = curIdx; 9661 immutable ch = decode(str, curIdx); 9662 immutable ushort caseIndex = indexFn(ch); 9663 if (caseIndex == ushort.max) 9664 continue; 9665 else if (caseIndex < maxIdx) 9666 { 9667 codeLen += startIdx - lastNonTrivial; 9668 lastNonTrivial = curIdx; 9669 immutable cased = tableFn(caseIndex); 9670 codeLen += codeLength!C(cased); 9671 } 9672 else 9673 { 9674 codeLen += startIdx - lastNonTrivial; 9675 lastNonTrivial = curIdx; 9676 immutable val = tableFn(caseIndex); 9677 immutable len = val >> 24; 9678 immutable dchar cased = val & 0xFF_FFFF; 9679 codeLen += codeLength!C(cased); 9680 foreach (j; caseIndex+1 .. caseIndex+len) 9681 codeLen += codeLength!C(tableFn(j)); 9682 } 9683 } 9684 if (lastNonTrivial != str.length) 9685 codeLen += str.length - lastNonTrivial; 9686 return codeLen; 9687 } 9688} 9689 9690@safe unittest 9691{ 9692 alias toLowerLength = toCaseLength!(LowerTriple); 9693 assert(toLowerLength("abcd") == 4); 9694 assert(toLowerLength("����������456") == 10+3); 9695} 9696 9697// slower code path that preallocates and then copies 9698// case-converted stuf to the new string 9699private template toCaseInPlaceAlloc(alias indexFn, uint maxIdx, alias tableFn) 9700{ 9701 void toCaseInPlaceAlloc(C)(ref C[] s, size_t curIdx, 9702 size_t destIdx) @trusted pure 9703 if (is(C == char) || is(C == wchar) || is(C == dchar)) 9704 { 9705 import std.utf : decode; 9706 alias caseLength = toCaseLength!(indexFn, maxIdx, tableFn); 9707 auto trueLength = destIdx + caseLength(s[curIdx..$]); 9708 C[] ns = new C[trueLength]; 9709 ns[0 .. destIdx] = s[0 .. 
destIdx]; 9710 size_t lastUnchanged = curIdx; 9711 while (curIdx != s.length) 9712 { 9713 immutable startIdx = curIdx; // start of current codepoint 9714 immutable ch = decode(s, curIdx); 9715 immutable caseIndex = indexFn(ch); 9716 if (caseIndex == ushort.max) // skip over 9717 { 9718 continue; 9719 } 9720 else if (caseIndex < maxIdx) // 1:1 codepoint mapping 9721 { 9722 immutable cased = tableFn(caseIndex); 9723 auto toCopy = startIdx - lastUnchanged; 9724 ns[destIdx .. destIdx+toCopy] = s[lastUnchanged .. startIdx]; 9725 lastUnchanged = curIdx; 9726 destIdx += toCopy; 9727 destIdx = encodeTo(ns, destIdx, cased); 9728 } 9729 else // 1:m codepoint mapping, slow codepath 9730 { 9731 auto toCopy = startIdx - lastUnchanged; 9732 ns[destIdx .. destIdx+toCopy] = s[lastUnchanged .. startIdx]; 9733 lastUnchanged = curIdx; 9734 destIdx += toCopy; 9735 auto val = tableFn(caseIndex); 9736 // unpack length + codepoint 9737 immutable uint len = val >> 24; 9738 destIdx = encodeTo(ns, destIdx, cast(dchar)(val & 0xFF_FFFF)); 9739 foreach (j; caseIndex+1 .. caseIndex+len) 9740 destIdx = encodeTo(ns, destIdx, tableFn(j)); 9741 } 9742 } 9743 if (lastUnchanged != s.length) 9744 { 9745 auto toCopy = s.length - lastUnchanged; 9746 ns[destIdx .. destIdx+toCopy] = s[lastUnchanged..$]; 9747 destIdx += toCopy; 9748 } 9749 assert(ns.length == destIdx); 9750 s = ns; 9751 } 9752} 9753 9754/++ 9755 Converts `s` to lowercase (by performing Unicode lowercase mapping) in place. 9756 For a few characters string length may increase after the transformation, 9757 in such a case the function reallocates exactly once. 9758 If `s` does not have any uppercase characters, then `s` is unaltered. 
+/
void toLowerInPlace(C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    // all the work is done by the generic in-place converter,
    // parameterized with the lowercase mapping triple
    toCaseInPlace!(LowerTriple)(s);
}
// overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    // each overload forwards to the template instance for its code unit type
    void toLowerInPlace(ref char[] s)
    { toLowerInPlace!char(s); }
    void toLowerInPlace(ref wchar[] s)
    { toLowerInPlace!wchar(s); }
    void toLowerInPlace(ref dchar[] s)
    { toLowerInPlace!dchar(s); }
}

/++
    Converts `s` to uppercase (by performing Unicode uppercase mapping) in place.
    For a few characters string length may increase after the transformation,
    in such a case the function reallocates exactly once.
    If `s` does not have any lowercase characters, then `s` is unaltered.

    Params:
        s = a mutable array of `char`, `wchar` or `dchar`, passed by reference;
            it is re-assigned to the converted text
+/
void toUpperInPlace(C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    // all the work is done by the generic in-place converter,
    // parameterized with the uppercase mapping triple
    toCaseInPlace!(UpperTriple)(s);
}
// overloads for the most common cases to reduce compile time/code size
@safe pure /*TODO nothrow*/
{
    // each overload forwards to the template instance for its code unit type
    void toUpperInPlace(ref char[] s)
    { toUpperInPlace!char(s); }
    void toUpperInPlace(ref wchar[] s)
    { toUpperInPlace!wchar(s); }
    void toUpperInPlace(ref dchar[] s)
    { toUpperInPlace!dchar(s); }
}

/++
    If `c` is a Unicode uppercase $(CHARACTER), then its lowercase equivalent
    is returned. Otherwise `c` is returned.

    Warning: certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use overload of toLower which takes full string instead.
+/
@safe pure nothrow @nogc
dchar toLower(dchar c)
{
    // optimize ASCII case: below U+00AA only 'A' .. 'Z' are re-cased here
    if (c < 0xAA)
    {
        if (c < 'A')
            return c;
        if (c <= 'Z')
            return c + 32;
        return c;
    }
    // everything else goes through the simple (1:1) lowercase table;
    // ushort.max is the "no mapping" marker
    size_t idx = toLowerSimpleIndex(c);
    if (idx != ushort.max)
    {
        return toLowerTab(idx);
    }
    return c;
}

/++
    Creates a new array which is identical to `s` except that all of its
    characters are converted to lowercase (by performing Unicode lowercase mapping).
    If none of `s` characters were affected, then `s` itself is returned if `s` is a
    `string`-like type.

    Params:
        s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives)
        of characters
    Returns:
        An array with the same element type as `s`.
+/
ElementEncodingType!S[] toLower(S)(return scope S s) @trusted
if (isSomeString!S)
{
    static import std.ascii;
    return toCase!(LowerTriple, std.ascii.toLower)(s);
}

/// ditto
ElementEncodingType!S[] toLower(S)(S s)
if (!isSomeString!S && (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    static import std.ascii;
    return toCase!(LowerTriple, std.ascii.toLower)(s);
}

// overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    string toLower(return scope string s)
    { return toLower!string(s); }
    wstring toLower(return scope wstring s)
    { return toLower!wstring(s); }
    dstring toLower(return scope dstring s)
    { return toLower!dstring(s); }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663
        // compile-only check: a struct with `alias this` to string
        // must still be accepted by toLower

        static struct String
        {
            string data;
            alias data this;
        }

        void foo()
        {
            auto u = toLower(String(""));
        }
    }
}


@safe unittest
{
    static import std.ascii;
    import std.format : format;
    // the ASCII range must agree with std.ascii
    foreach (ch; 0 .. 0x80)
        assert(std.ascii.toLower(ch) == toLower(ch));
    // NOTE(review): the non-ASCII literals in the next two asserts and in the
    // string assert further down consist of U+FFFD replacement characters,
    // i.e. they were damaged by a lossy re-encoding of this file. The original
    // characters cannot be recovered from here - restore from a pristine copy.
    assert(toLower('��') == '��');
    assert(toLower('��') == '��');
    foreach (ch; unicode.upperCase.byCodepoint)
    {
        dchar low = ch.toLower();
        assert(low == ch || isLower(low), format("%s -> %s", ch, low));
    }
    assert(toLower("����") == "����");

    assert("\u1E9E".toLower == "\u00df");
    assert("\u00df".toUpper == "SS");
}

// https://issues.dlang.org/show_bug.cgi?id=9629
@safe unittest
{
    // NOTE(review): both literals below contain U+FFFD replacement characters
    // (damaged by re-encoding); restore the original cased letters from a
    // pristine copy, otherwise this test does not exercise any case mapping.
    wchar[] test = "hello �� world"w.dup;
    auto piece = test[6 .. 7];
    toUpperInPlace(piece);
    assert(test == "hello �� world");
}


@safe unittest
{
    import std.algorithm.comparison : cmp;
    string s1 = "FoL";
    string s2 = toLower(s1);
    assert(cmp(s2, "fol") == 0, s2);
    assert(s2 != s1);

    char[] s3 = s1.dup;
    toLowerInPlace(s3);
    assert(s3 == s2);

    s1 = "A\u0100B\u0101d";
    s2 = toLower(s1);
    s3 = s1.dup;
    assert(cmp(s2, "a\u0101b\u0101d") == 0);
    assert(s2 !is s1);
    toLowerInPlace(s3);
    assert(s3 == s2);

    s1 = "A\u0460B\u0461d";
    s2 = toLower(s1);
    s3 = s1.dup;
    assert(cmp(s2, "a\u0461b\u0461d") == 0);
    assert(s2 !is s1);
    toLowerInPlace(s3);
    assert(s3 == s2);

    // U+0130 lowercases to two code points, so the string grows
    s1 = "\u0130";
    s2 = toLower(s1);
    s3 = s1.dup;
    assert(s2 == "i\u0307");
    assert(s2 !is s1);
    toLowerInPlace(s3);
    assert(s3 == s2);

    // Test on wchar and dchar strings.
    assert(toLower("Some String"w) == "some string"w);
    assert(toLower("Some String"d) == "some string"d);

    // https://issues.dlang.org/show_bug.cgi?id=12455
    // spelled as an escape so the literal survives any re-encoding of this file
    dchar c = '\u0130'; // LATIN CAPITAL LETTER I WITH DOT ABOVE
    assert(isUpper(c));
    assert(toLower(c) == 'i');
    // extends on https://issues.dlang.org/show_bug.cgi?id=12455 report
    // check simple-case toUpper too
    c = '\u1f87';
    assert(isLower(c));
    assert(toUpper(c) == '\u1F8F');
}

// random access ranges of code units are accepted as well
@safe pure unittest
{
    import std.algorithm.comparison : cmp, equal;
    import std.utf : byCodeUnit;
    auto r1 = "FoL".byCodeUnit;
    assert(r1.toLower.cmp("fol") == 0);
    auto r2 = "A\u0460B\u0461d".byCodeUnit;
    assert(r2.toLower.cmp("a\u0461b\u0461d") == 0);
}

/++
    If `c` is a Unicode lowercase $(CHARACTER), then its uppercase equivalent
    is returned. Otherwise `c` is returned.

    Warning:
        Certain alphabets like German and Greek have no 1:1
        upper-lower mapping. Use overload of toUpper which takes full string instead.

    toUpper can be used as an argument to $(REF map, std,algorithm,iteration)
    to produce an algorithm that can convert a range of characters to upper case
    without allocating memory.
    A string can then be produced by using $(REF copy, std,algorithm,mutation)
    to send it to an $(REF appender, std,array).
+/
@safe pure nothrow @nogc
dchar toUpper(dchar c)
{
    // Code points from U+00AA upwards are resolved through the simple
    // (1:1) uppercase table; ushort.max marks "no mapping".
    if (c >= 0xAA)
    {
        immutable size_t tabIdx = toUpperSimpleIndex(c);
        if (tabIdx == ushort.max)
            return c;
        return toUpperTab(tabIdx);
    }
    // Fast path without any table: only 'a' .. 'z' change in this range.
    if (c >= 'a' && c <= 'z')
        return c - 32;
    return c;
}

///
@safe unittest
{
    import std.algorithm.iteration : map;
    import std.algorithm.mutation : copy;
    import std.array : appender;

    auto sink = appender!(char[])();
    "hello".map!toUpper.copy(sink);
    assert(sink.data == "HELLO");
}

@safe unittest
{
    static import std.ascii;
    import std.format : format;

    // the ASCII range must agree with std.ascii
    foreach (ascii; 0 .. 0x80)
        assert(std.ascii.toUpper(ascii) == toUpper(ascii));

    assert(toUpper('��') == '��');
    assert(toUpper('��') == '��');

    // every lowercase code point either stays put or maps to an
    // uppercase or titlecase letter
    auto titlecase = unicode.Titlecase_Letter;
    foreach (lowerCp; unicode.lowerCase.byCodepoint)
    {
        dchar mapped = lowerCp.toUpper();
        assert(mapped == lowerCp || isUpper(mapped) || titlecase[mapped],
            format("%x -> %x", lowerCp, mapped));
    }
}

/++
    Allocates a new array which is identical to `s` except that all of its
    characters are converted to uppercase (by performing Unicode uppercase mapping).
    If none of `s` characters were affected, then `s` itself is returned if `s`
    is a `string`-like type.

    Params:
        s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives)
        of characters
    Returns:
        A new array with the same element type as `s`.
+/
ElementEncodingType!S[] toUpper(S)(return scope S s) @trusted
if (isSomeString!S)
{
    static import std.ascii;
    return toCase!(UpperTriple, std.ascii.toUpper)(s);
}

/// ditto
ElementEncodingType!S[] toUpper(S)(S s)
if (!isSomeString!S && (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    static import std.ascii;
    return toCase!(UpperTriple, std.ascii.toUpper)(s);
}

// overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    string toUpper(return scope string s)
    { return toUpper!string(s); }
    wstring toUpper(return scope wstring s)
    { return toUpper!wstring(s); }
    dstring toUpper(return scope dstring s)
    { return toUpper!dstring(s); }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663
        // compile-only check: a struct with `alias this` to string
        // must still be accepted by toUpper

        static struct String
        {
            string data;
            alias data this;
        }

        void foo()
        {
            auto u = toUpper(String(""));
        }
    }
}

@safe unittest
{
    import std.algorithm.comparison : cmp;

    string s1 = "FoL";
    string s2;
    char[] s3;

    s2 = toUpper(s1);
    s3 = s1.dup; toUpperInPlace(s3);
    assert(s3 == s2, s3);
    assert(cmp(s2, "FOL") == 0);
    assert(s2 !is s1);

    s1 = "a\u0100B\u0101d";
    s2 = toUpper(s1);
    s3 = s1.dup; toUpperInPlace(s3);
    assert(s3 == s2);
    assert(cmp(s2, "A\u0100B\u0100D") == 0);
    assert(s2 !is s1);

    s1 = "a\u0460B\u0461d";
    s2 = toUpper(s1);
    s3 = s1.dup; toUpperInPlace(s3);
    assert(s3 == s2);
    assert(cmp(s2, "A\u0460B\u0460D") == 0);
    assert(s2 !is s1);
}

@safe unittest
{
    // Checks the allocating and the in-place conversions of `s`
    // against the expected upper/lower case results.
    static void doTest(C)(const(C)[] s, const(C)[] trueUp, const(C)[] trueLow)
    {
        import std.format : format;
        // three specifiers: source, actual result, expected result
        string diff = "src: %( %x %)\nres: %( %x %)\ntru: %( %x %)";
        auto low = s.toLower() , up = s.toUpper();
        auto lowInp = s.dup, upInp = s.dup;
        lowInp.toLowerInPlace();
        upInp.toUpperInPlace();
        // `diff` needs all three arguments; with only two, format() throws
        // a FormatException and the real failure message is lost
        assert(low == trueLow, format(diff, s, low, trueLow));
        assert(up == trueUp,  format(diff, s, up, trueUp));
        assert(lowInp == trueLow,
            format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) lowInp, cast(const(ubyte)[]) trueLow));
        assert(upInp == trueUp,
            format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) upInp, cast(const(ubyte)[]) trueUp));
    }
    static foreach (S; AliasSeq!(dstring, wstring, string))
    {{

        S easy = "123";
        // NOTE(review): `good` and its expected results below contain U+FFFD
        // replacement characters (damaged by a lossy re-encoding of this
        // file); restore the original letters from a pristine copy.
        S good = "abC������";
        S awful = "\u0131\u023f\u2126";
        S wicked = "\u0130\u1FE2";
        auto options = [easy, good, awful, wicked];
        S[] lower = ["123", "abc������", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
        S[] upper = ["123", "ABC������", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

        // for these inputs the in-place conversions must keep the same array
        foreach (val; [easy, good])
        {
            auto e = val.dup;
            auto g = e;
            e.toUpperInPlace();
            assert(e is g);
            e.toLowerInPlace();
            assert(e is g);
        }
        foreach (i, v; options)
        {
            doTest(v, upper[i], lower[i]);
        }

        // a few combinatorial runs
        foreach (i; 0 .. options.length)
        foreach (j; i .. options.length)
        foreach (k; j .. options.length)
        {
            auto sample = options[i] ~ options[j] ~ options[k];
            auto sample2 = options[k] ~ options[j] ~ options[i];
            doTest(sample, upper[i] ~ upper[j] ~ upper[k],
                lower[i] ~ lower[j] ~ lower[k]);
            doTest(sample2, upper[k] ~ upper[j] ~ upper[i],
                lower[k] ~ lower[j] ~ lower[i]);
        }
    }}
}

// test random access ranges
@safe pure unittest
{
    import std.algorithm.comparison : cmp;
    import std.utf : byCodeUnit;
    auto s1 = "FoL".byCodeUnit;
    assert(s1.toUpper.cmp("FOL") == 0);
    auto s2 = "a\u0460B\u0461d".byCodeUnit;
    assert(s2.toUpper.cmp("A\u0460B\u0460D") == 0);
}

/++
    Returns whether `c` is a Unicode alphabetic $(CHARACTER)
    (general Unicode category: Alphabetic).
+/
@safe pure nothrow @nogc
bool isAlpha(dchar c)
{
    // optimization: below U+00AA only 'A' .. 'Z' and 'a' .. 'z' are accepted
    if (c < 0xAA)
    {
        // unsigned subtraction: anything below 'A' wraps around to a huge
        // value, so one comparison checks both ends of the range
        size_t x = c - 'A';
        if (x <= 'Z' - 'A')
            return true;
        else
        {
            x = c - 'a';
            if (x <= 'z'-'a')
                return true;
        }
        return false;
    }

    return alphaTrie[c];
}

@safe unittest
{
    auto alpha = unicode("Alphabetic");
    foreach (ch; alpha.byCodepoint)
        assert(isAlpha(ch));
    foreach (ch; 0 .. 0x4000)
        assert((ch in alpha) == isAlpha(ch));
}


/++
    Returns whether `c` is a Unicode mark
    (general Unicode category: Mn, Me, Mc).
+/
@safe pure nothrow @nogc
bool isMark(dchar c)
{
    return markTrie[c];
}

@safe unittest
{
    auto mark = unicode("Mark");
    foreach (ch; mark.byCodepoint)
        assert(isMark(ch));
    foreach (ch; 0 .. 0x4000)
        assert((ch in mark) == isMark(ch));
}

/++
    Returns whether `c` is a Unicode numerical $(CHARACTER)
    (general Unicode category: Nd, Nl, No).
+/
@safe pure nothrow @nogc
bool isNumber(dchar c)
{
    // anything outside ASCII is answered by the number trie
    if (c > 0x7F)
        return numberTrie[c];
    // ASCII shortcut: only the decimal digits qualify
    return '0' <= c && c <= '9';
}

@safe unittest
{
    auto numbers = unicode("N");
    // every member of the set is reported as a number ...
    foreach (cp; numbers.byCodepoint)
        assert(isNumber(cp));
    // ... and set membership matches isNumber over the first 0x4000 code points
    foreach (cp; 0 .. 0x4000)
        assert((cp in numbers) == isNumber(cp));
}

/++
    Returns whether `c` is a Unicode alphabetic $(CHARACTER) or number.
    (general Unicode category: Alphabetic, Nd, Nl, No).

    Params:
        c = any Unicode character
    Returns:
        `true` if the character is in the Alphabetic, Nd, Nl, or No Unicode
        categories
+/
@safe pure nothrow @nogc
bool isAlphaNum(dchar c)
{
    static import std.ascii;

    // non-ASCII input: combine the two Unicode predicates
    if (!std.ascii.isASCII(c))
        return isAlpha(c) || isNumber(c);
    // ASCII shortcut
    return std.ascii.isAlphaNum(c);
}

@safe unittest
{
    auto numbers = unicode("N");
    auto letters = unicode("Alphabetic");

    foreach (cp; numbers.byCodepoint)
        assert(isAlphaNum(cp));

    foreach (cp; letters.byCodepoint)
        assert(isAlphaNum(cp));

    // membership in either set matches isAlphaNum over the first 0x4000 code points
    foreach (cp; 0 .. 0x4000)
        assert(((cp in numbers) || (cp in letters)) == isAlphaNum(cp));
}

/++
    Returns whether `c` is a Unicode punctuation $(CHARACTER)
    (general Unicode category: Pd, Ps, Pe, Pc, Po, Pi, Pf).
+/
@safe pure nothrow @nogc
bool isPunctuation(dchar c)
{
    static import std.ascii;

    // optimization for ascii case
    if (c <= 0x7F)
    {
        return std.ascii.isPunctuation(c);
    }
    else
    {
        // everything else is looked up in the punctuation trie
        return punctuationTrie[c];
    }
}

@safe unittest
{
    assert(isPunctuation('\u0021'));
    assert(isPunctuation('\u0028'));
    assert(isPunctuation('\u0029'));
    assert(isPunctuation('\u002D'));
    assert(isPunctuation('\u005F'));
    assert(isPunctuation('\u00AB'));
    assert(isPunctuation('\u00BB'));
    foreach (ch; unicode("P").byCodepoint)
        assert(isPunctuation(ch));
}

/++
    Returns whether `c` is a Unicode symbol $(CHARACTER)
    (general Unicode category: Sm, Sc, Sk, So).
+/
@safe pure nothrow @nogc
bool isSymbol(dchar c)
{
    return symbolTrie[c];
}

@safe unittest
{
    import std.format : format;
    assert(isSymbol('\u0024'));
    assert(isSymbol('\u002B'));
    assert(isSymbol('\u005E'));
    assert(isSymbol('\u00A6'));
    foreach (ch; unicode("S").byCodepoint)
        assert(isSymbol(ch), format("%04x", ch));
}

/++
    Returns whether `c` is a Unicode space $(CHARACTER)
    (general Unicode category: Zs)
    Note: This doesn't include '\n', '\r', '\t' and other non-space $(CHARACTER).
    For commonly used less strict semantics see $(LREF isWhite).
+/
@safe pure nothrow @nogc
bool isSpace(dchar c)
{
    import std.internal.unicode_tables : isSpaceGen; // generated file
    return isSpaceGen(c);
}

@safe unittest
{
    assert(isSpace('\u0020'));
    auto space = unicode.Zs;
    foreach (ch; space.byCodepoint)
        assert(isSpace(ch));
    // must agree with the Zs set over the first 0x1000 code points
    foreach (ch; 0 .. 0x1000)
        assert(isSpace(ch) == space[ch]);
}


/++
    Returns whether `c` is a Unicode graphical $(CHARACTER)
    (general Unicode category: L, M, N, P, S, Zs).

+/
@safe pure nothrow @nogc
bool isGraphical(dchar c)
{
    return graphicalTrie[c];
}


@safe unittest
{
    auto set = unicode("Graphical");
    import std.format : format;
    foreach (ch; set.byCodepoint)
        assert(isGraphical(ch), format("%4x", ch));
    foreach (ch; 0 .. 0x4000)
        assert((ch in set) == isGraphical(ch));
}


/++
    Returns whether `c` is a Unicode control $(CHARACTER)
    (general Unicode category: Cc).
+/
@safe pure nothrow @nogc
bool isControl(dchar c)
{
    import std.internal.unicode_tables : isControlGen; // generated file
    return isControlGen(c);
}

@safe unittest
{
    assert(isControl('\u0000'));
    assert(isControl('\u0081'));
    assert(!isControl('\u0100'));
    auto cc = unicode.Cc;
    foreach (ch; cc.byCodepoint)
        assert(isControl(ch));
    // must agree with the Cc set over the first 0x1000 code points
    foreach (ch; 0 .. 0x1000)
        assert(isControl(ch) == cc[ch]);
}


/++
    Returns whether `c` is a Unicode formatting $(CHARACTER)
    (general Unicode category: Cf).
+/
@safe pure nothrow @nogc
bool isFormat(dchar c)
{
    import std.internal.unicode_tables : isFormatGen; // generated file
    return isFormatGen(c);
}


@safe unittest
{
    assert(isFormat('\u00AD'));
    foreach (ch; unicode("Format").byCodepoint)
        assert(isFormat(ch));
}

// code points for private use, surrogates are not likely to change in the near future
// if need be they can be generated from unicode data as well

/++
    Returns whether `c` is a Unicode Private Use $(CODEPOINT)
    (general Unicode category: Co).
+/
@safe pure nothrow @nogc
bool isPrivateUse(dchar c)
{
    // U+E000 .. U+F8FF
    if (c >= 0x00_E000 && c <= 0x00_F8FF)
        return true;
    // U+F0000 .. U+FFFFD
    if (c >= 0x0F_0000 && c <= 0x0F_FFFD)
        return true;
    // U+100000 .. U+10FFFD
    return c >= 0x10_0000 && c <= 0x10_FFFD;
}

/++
    Returns whether `c` is a Unicode surrogate $(CODEPOINT)
    (general Unicode category: Cs).
+/
@safe pure nothrow @nogc
bool isSurrogate(dchar c)
{
    enum dchar first = 0xD800;
    enum dchar last = 0xDFFF;
    return c >= first && c <= last;
}

/++
    Returns whether `c` is a Unicode high surrogate (lead surrogate).
+/
@safe pure nothrow @nogc
bool isSurrogateHi(dchar c)
{
    enum dchar first = 0xD800;
    enum dchar last = 0xDBFF;
    return c >= first && c <= last;
}

/++
    Returns whether `c` is a Unicode low surrogate (trail surrogate).
+/
@safe pure nothrow @nogc
bool isSurrogateLo(dchar c)
{
    enum dchar first = 0xDC00;
    enum dchar last = 0xDFFF;
    return c >= first && c <= last;
}

/++
    Returns whether `c` is a Unicode non-character i.e.
    a $(CODEPOINT) with no assigned abstract character.
    (general Unicode category: Cn)
+/
@safe pure nothrow @nogc
bool isNonCharacter(dchar c)
{
    return nonCharacterTrie[c];
}

@safe unittest
{
    auto set = unicode("Cn");
    foreach (ch; set.byCodepoint)
        assert(isNonCharacter(ch));
}

private:
// load static data from pre-generated tables into usable datastructures


// Builds a CodepointSet from `compressed`: the bytes are expanded with
// decompressIntervals and the result is handed to CodepointSet.fromIntervals.
@safe auto asSet(const (ubyte)[] compressed) pure
{
    return CodepointSet.fromIntervals(decompressIntervals(compressed));
}

// Wraps the offsets, sizes and data of a TrieEntry in a const CodepointTrie.
@safe pure nothrow auto asTrie(T...)(const scope TrieEntry!T e)
{
    return const(CodepointTrie!T)(e.offsets, e.sizes, e.data);
}

// Accessors for the tries. Every accessor keeps its trie in a function-local
// `static immutable` built by asTrie from the matching `*TrieEntries` table
// and returns that value.
@safe pure nothrow @nogc @property
{
    import std.internal.unicode_tables; // generated file

    // It's important to use auto return here, so that the compiler
    // only runs semantic on the return type if the function gets
    // used. Also these are functions rather than templates to not
    // increase the object size of the caller.
    auto lowerCaseTrie() { static immutable res = asTrie(lowerCaseTrieEntries); return res; }
    auto upperCaseTrie() { static immutable res = asTrie(upperCaseTrieEntries); return res; }
    auto simpleCaseTrie() { static immutable res = asTrie(simpleCaseTrieEntries); return res; }
    auto fullCaseTrie() { static immutable res = asTrie(fullCaseTrieEntries); return res; }
    auto alphaTrie() { static immutable res = asTrie(alphaTrieEntries); return res; }
    auto markTrie() { static immutable res = asTrie(markTrieEntries); return res; }
    auto numberTrie() { static immutable res = asTrie(numberTrieEntries); return res; }
    auto punctuationTrie() { static immutable res = asTrie(punctuationTrieEntries); return res; }
    auto symbolTrie() { static immutable res = asTrie(symbolTrieEntries); return res; }
    auto graphicalTrie() { static immutable res = asTrie(graphicalTrieEntries); return res; }
    auto nonCharacterTrie() { static immutable res = asTrie(nonCharacterTrieEntries); return res; }

    //normalization quick-check tables
    // (their entries are imported locally from std.internal.unicode_norm)
    auto nfcQCTrie()
    {
        import std.internal.unicode_norm : nfcQCTrieEntries;
        static immutable res = asTrie(nfcQCTrieEntries);
        return res;
    }

    auto nfdQCTrie()
    {
        import std.internal.unicode_norm : nfdQCTrieEntries;
        static immutable res = asTrie(nfdQCTrieEntries);
        return res;
    }

    auto nfkcQCTrie()
    {
        import std.internal.unicode_norm : nfkcQCTrieEntries;
        static immutable res = asTrie(nfkcQCTrieEntries);
        return res;
    }

    auto nfkdQCTrie()
    {
        import std.internal.unicode_norm : nfkdQCTrieEntries;
        static immutable res = asTrie(nfkdQCTrieEntries);
        return res;
    }

    //grapheme breaking algorithm tables
    // (their entries are imported locally from std.internal.unicode_grapheme)
    auto mcTrie()
    {
        import std.internal.unicode_grapheme : mcTrieEntries;
        static immutable res = asTrie(mcTrieEntries);
        return res;
    }

    auto graphemeExtendTrie()
    {
        import std.internal.unicode_grapheme : graphemeExtendTrieEntries;
        static immutable res = asTrie(graphemeExtendTrieEntries);
        return res;
    }

    auto hangLV()
    {
        import std.internal.unicode_grapheme : hangulLVTrieEntries;
        static immutable res = asTrie(hangulLVTrieEntries);
        return res;
    }

    auto hangLVT()
    {
        import std.internal.unicode_grapheme : hangulLVTTrieEntries;
        static immutable res = asTrie(hangulLVTTrieEntries);
        return res;
    }

    // tables below are used for composition/decomposition
    // (entries come from std.internal.unicode_comp and std.internal.unicode_decomp)
    auto combiningClassTrie()
    {
        import std.internal.unicode_comp : combiningClassTrieEntries;
        static immutable res = asTrie(combiningClassTrieEntries);
        return res;
    }

    auto compatMappingTrie()
    {
        import std.internal.unicode_decomp : compatMappingTrieEntries;
        static immutable res = asTrie(compatMappingTrieEntries);
        return res;
    }

    auto canonMappingTrie()
    {
        import std.internal.unicode_decomp : canonMappingTrieEntries;
        static immutable res = asTrie(canonMappingTrieEntries);
        return res;
    }

    auto compositionJumpTrie()
    {
        import std.internal.unicode_comp : compositionJumpTrieEntries;
        static immutable res = asTrie(compositionJumpTrieEntries);
        return res;
    }

    //case conversion tables
    auto toUpperIndexTrie() { static immutable res = asTrie(toUpperIndexTrieEntries); return res; }
    auto toLowerIndexTrie() { static immutable res = asTrie(toLowerIndexTrieEntries); return res; }
    auto toTitleIndexTrie() { static immutable res = asTrie(toTitleIndexTrieEntries); return res; }
    //simple case conversion tables
    auto toUpperSimpleIndexTrie() { static immutable res = asTrie(toUpperSimpleIndexTrieEntries); return res; }
    auto toLowerSimpleIndexTrie() { static immutable res = asTrie(toLowerSimpleIndexTrieEntries); return res; }
    auto toTitleSimpleIndexTrie() { static immutable res = asTrie(toTitleSimpleIndexTrieEntries); return res; }

}

}// version (!std_uni_bootstrap)