1303275Sdelphij/* 2303275Sdelphij * divsufsort.c for libdivsufsort 3303275Sdelphij * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. 4303275Sdelphij * 5303275Sdelphij * Permission is hereby granted, free of charge, to any person 6303275Sdelphij * obtaining a copy of this software and associated documentation 7303275Sdelphij * files (the "Software"), to deal in the Software without 8303275Sdelphij * restriction, including without limitation the rights to use, 9303275Sdelphij * copy, modify, merge, publish, distribute, sublicense, and/or sell 10303275Sdelphij * copies of the Software, and to permit persons to whom the 11303275Sdelphij * Software is furnished to do so, subject to the following 12303275Sdelphij * conditions: 13303275Sdelphij * 14303275Sdelphij * The above copyright notice and this permission notice shall be 15303275Sdelphij * included in all copies or substantial portions of the Software. 16303275Sdelphij * 17303275Sdelphij * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18303275Sdelphij * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 19303275Sdelphij * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20303275Sdelphij * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 21303275Sdelphij * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 22303275Sdelphij * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23303275Sdelphij * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 24303275Sdelphij * OTHER DEALINGS IN THE SOFTWARE. 25303275Sdelphij */ 26303275Sdelphij 27303275Sdelphij#include "divsufsort_private.h" 28303275Sdelphij#ifdef _OPENMP 29303275Sdelphij# include <omp.h> 30303275Sdelphij#endif 31303275Sdelphij 32303275Sdelphij 33303275Sdelphij/*- Private Functions -*/ 34303275Sdelphij 35303275Sdelphij/* Sorts suffixes of type B*. */ 36303275Sdelphijstatic 37303275Sdelphijsaidx_t 38303275Sdelphijsort_typeBstar(const sauchar_t *T, saidx_t *SA, 39303275Sdelphij saidx_t *bucket_A, saidx_t *bucket_B, 40303275Sdelphij saidx_t n) { 41303275Sdelphij saidx_t *PAb, *ISAb, *buf; 42303275Sdelphij#ifdef _OPENMP 43303275Sdelphij saidx_t *curbuf; 44303275Sdelphij saidx_t l; 45303275Sdelphij#endif 46303275Sdelphij saidx_t i, j, k, t, m, bufsize; 47303275Sdelphij saint_t c0, c1; 48303275Sdelphij#ifdef _OPENMP 49303275Sdelphij saint_t d0, d1; 50303275Sdelphij int tmp; 51303275Sdelphij#endif 52303275Sdelphij 53303275Sdelphij /* Initialize bucket arrays. */ 54303275Sdelphij for(i = 0; i < BUCKET_A_SIZE; ++i) { bucket_A[i] = 0; } 55303275Sdelphij for(i = 0; i < BUCKET_B_SIZE; ++i) { bucket_B[i] = 0; } 56303275Sdelphij 57303275Sdelphij /* Count the number of occurrences of the first one or two characters of each 58303275Sdelphij type A, B and B* suffix. Moreover, store the beginning position of all 59303275Sdelphij type B* suffixes into the array SA. */ 60303275Sdelphij for(i = n - 1, m = n, c0 = T[n - 1]; 0 <= i;) { 61303275Sdelphij /* type A suffix. */ 62303275Sdelphij do { ++BUCKET_A(c1 = c0); } while((0 <= --i) && ((c0 = T[i]) >= c1)); 63303275Sdelphij if(0 <= i) { 64303275Sdelphij /* type B* suffix. */ 65303275Sdelphij ++BUCKET_BSTAR(c0, c1); 66303275Sdelphij SA[--m] = i; 67303275Sdelphij /* type B suffix. */ 68303275Sdelphij for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { 69303275Sdelphij ++BUCKET_B(c0, c1); 70303275Sdelphij } 71303275Sdelphij } 72303275Sdelphij } 73303275Sdelphij m = n - m; 74303275Sdelphij/* 75303275Sdelphijnote: 76303275Sdelphij A type B* suffix is lexicographically smaller than a type B suffix that 77303275Sdelphij begins with the same first two characters. 78303275Sdelphij*/ 79303275Sdelphij 80303275Sdelphij /* Calculate the index of start/end point of each bucket. */ 81303275Sdelphij for(c0 = 0, i = 0, j = 0; c0 < ALPHABET_SIZE; ++c0) { 82303275Sdelphij t = i + BUCKET_A(c0); 83303275Sdelphij BUCKET_A(c0) = i + j; /* start point */ 84303275Sdelphij i = t + BUCKET_B(c0, c0); 85303275Sdelphij for(c1 = c0 + 1; c1 < ALPHABET_SIZE; ++c1) { 86303275Sdelphij j += BUCKET_BSTAR(c0, c1); 87303275Sdelphij BUCKET_BSTAR(c0, c1) = j; /* end point */ 88303275Sdelphij i += BUCKET_B(c0, c1); 89303275Sdelphij } 90303275Sdelphij } 91303275Sdelphij 92303275Sdelphij if(0 < m) { 93303275Sdelphij /* Sort the type B* suffixes by their first two characters. */ 94303275Sdelphij PAb = SA + n - m; ISAb = SA + m; 95303275Sdelphij for(i = m - 2; 0 <= i; --i) { 96303275Sdelphij t = PAb[i], c0 = T[t], c1 = T[t + 1]; 97303275Sdelphij SA[--BUCKET_BSTAR(c0, c1)] = i; 98303275Sdelphij } 99303275Sdelphij t = PAb[m - 1], c0 = T[t], c1 = T[t + 1]; 100303275Sdelphij SA[--BUCKET_BSTAR(c0, c1)] = m - 1; 101303275Sdelphij 102303275Sdelphij /* Sort the type B* substrings using sssort. */ 103303275Sdelphij#ifdef _OPENMP 104303275Sdelphij tmp = omp_get_max_threads(); 105303275Sdelphij buf = SA + m, bufsize = (n - (2 * m)) / tmp; 106303275Sdelphij c0 = ALPHABET_SIZE - 2, c1 = ALPHABET_SIZE - 1, j = m; 107303275Sdelphij#pragma omp parallel default(shared) private(curbuf, k, l, d0, d1, tmp) 108303275Sdelphij { 109303275Sdelphij tmp = omp_get_thread_num(); 110303275Sdelphij curbuf = buf + tmp * bufsize; 111303275Sdelphij k = 0; 112303275Sdelphij for(;;) { 113303275Sdelphij #pragma omp critical(sssort_lock) 114303275Sdelphij { 115303275Sdelphij if(0 < (l = j)) { 116303275Sdelphij d0 = c0, d1 = c1; 117303275Sdelphij do { 118303275Sdelphij k = BUCKET_BSTAR(d0, d1); 119303275Sdelphij if(--d1 <= d0) { 120303275Sdelphij d1 = ALPHABET_SIZE - 1; 121303275Sdelphij if(--d0 < 0) { break; } 122303275Sdelphij } 123303275Sdelphij } while(((l - k) <= 1) && (0 < (l = k))); 124303275Sdelphij c0 = d0, c1 = d1, j = k; 125303275Sdelphij } 126303275Sdelphij } 127303275Sdelphij if(l == 0) { break; } 128303275Sdelphij sssort(T, PAb, SA + k, SA + l, 129303275Sdelphij curbuf, bufsize, 2, n, *(SA + k) == (m - 1)); 130303275Sdelphij } 131303275Sdelphij } 132303275Sdelphij#else 133303275Sdelphij buf = SA + m, bufsize = n - (2 * m); 134303275Sdelphij for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) { 135303275Sdelphij for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) { 136303275Sdelphij i = BUCKET_BSTAR(c0, c1); 137303275Sdelphij if(1 < (j - i)) { 138303275Sdelphij sssort(T, PAb, SA + i, SA + j, 139303275Sdelphij buf, bufsize, 2, n, *(SA + i) == (m - 1)); 140303275Sdelphij } 141303275Sdelphij } 142303275Sdelphij } 143303275Sdelphij#endif 144303275Sdelphij 145303275Sdelphij /* Compute ranks of type B* substrings. */ 146303275Sdelphij for(i = m - 1; 0 <= i; --i) { 147303275Sdelphij if(0 <= SA[i]) { 148303275Sdelphij j = i; 149303275Sdelphij do { ISAb[SA[i]] = i; } while((0 <= --i) && (0 <= SA[i])); 150303275Sdelphij SA[i + 1] = i - j; 151303275Sdelphij if(i <= 0) { break; } 152303275Sdelphij } 153303275Sdelphij j = i; 154303275Sdelphij do { ISAb[SA[i] = ~SA[i]] = j; } while(SA[--i] < 0); 155303275Sdelphij ISAb[SA[i]] = j; 156303275Sdelphij } 157303275Sdelphij 158303275Sdelphij /* Construct the inverse suffix array of type B* suffixes using trsort. */ 159303275Sdelphij trsort(ISAb, SA, m, 1); 160303275Sdelphij 161303275Sdelphij /* Set the sorted order of tyoe B* suffixes. */ 162303275Sdelphij for(i = n - 1, j = m, c0 = T[n - 1]; 0 <= i;) { 163303275Sdelphij for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) >= c1); --i, c1 = c0) { } 164303275Sdelphij if(0 <= i) { 165303275Sdelphij t = i; 166303275Sdelphij for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { } 167303275Sdelphij SA[ISAb[--j]] = ((t == 0) || (1 < (t - i))) ? t : ~t; 168303275Sdelphij } 169303275Sdelphij } 170303275Sdelphij 171303275Sdelphij /* Calculate the index of start/end point of each bucket. */ 172303275Sdelphij BUCKET_B(ALPHABET_SIZE - 1, ALPHABET_SIZE - 1) = n; /* end point */ 173303275Sdelphij for(c0 = ALPHABET_SIZE - 2, k = m - 1; 0 <= c0; --c0) { 174303275Sdelphij i = BUCKET_A(c0 + 1) - 1; 175303275Sdelphij for(c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) { 176303275Sdelphij t = i - BUCKET_B(c0, c1); 177303275Sdelphij BUCKET_B(c0, c1) = i; /* end point */ 178303275Sdelphij 179303275Sdelphij /* Move all type B* suffixes to the correct position. */ 180303275Sdelphij for(i = t, j = BUCKET_BSTAR(c0, c1); 181303275Sdelphij j <= k; 182303275Sdelphij --i, --k) { SA[i] = SA[k]; } 183303275Sdelphij } 184303275Sdelphij BUCKET_BSTAR(c0, c0 + 1) = i - BUCKET_B(c0, c0) + 1; /* start point */ 185303275Sdelphij BUCKET_B(c0, c0) = i; /* end point */ 186303275Sdelphij } 187303275Sdelphij } 188303275Sdelphij 189303275Sdelphij return m; 190303275Sdelphij} 191303275Sdelphij 192303275Sdelphij/* Constructs the suffix array by using the sorted order of type B* suffixes. */ 193303275Sdelphijstatic 194303275Sdelphijvoid 195303275Sdelphijconstruct_SA(const sauchar_t *T, saidx_t *SA, 196303275Sdelphij saidx_t *bucket_A, saidx_t *bucket_B, 197303275Sdelphij saidx_t n, saidx_t m) { 198303275Sdelphij saidx_t *i, *j, *k; 199303275Sdelphij saidx_t s; 200303275Sdelphij saint_t c0, c1, c2; 201303275Sdelphij 202303275Sdelphij if(0 < m) { 203303275Sdelphij /* Construct the sorted order of type B suffixes by using 204303275Sdelphij the sorted order of type B* suffixes. */ 205303275Sdelphij for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) { 206303275Sdelphij /* Scan the suffix array from right to left. */ 207303275Sdelphij for(i = SA + BUCKET_BSTAR(c1, c1 + 1), 208303275Sdelphij j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1; 209303275Sdelphij i <= j; 210303275Sdelphij --j) { 211303275Sdelphij if(0 < (s = *j)) { 212303275Sdelphij assert(T[s] == c1); 213303275Sdelphij assert(((s + 1) < n) && (T[s] <= T[s + 1])); 214303275Sdelphij assert(T[s - 1] <= T[s]); 215303275Sdelphij *j = ~s; 216303275Sdelphij c0 = T[--s]; 217303275Sdelphij if((0 < s) && (T[s - 1] > c0)) { s = ~s; } 218303275Sdelphij if(c0 != c2) { 219303275Sdelphij if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; } 220303275Sdelphij k = SA + BUCKET_B(c2 = c0, c1); 221303275Sdelphij } 222303275Sdelphij assert(k < j); 223303275Sdelphij *k-- = s; 224303275Sdelphij } else { 225303275Sdelphij assert(((s == 0) && (T[s] == c1)) || (s < 0)); 226303275Sdelphij *j = ~s; 227303275Sdelphij } 228303275Sdelphij } 229303275Sdelphij } 230303275Sdelphij } 231303275Sdelphij 232303275Sdelphij /* Construct the suffix array by using 233303275Sdelphij the sorted order of type B suffixes. */ 234303275Sdelphij k = SA + BUCKET_A(c2 = T[n - 1]); 235303275Sdelphij *k++ = (T[n - 2] < c2) ? ~(n - 1) : (n - 1); 236303275Sdelphij /* Scan the suffix array from left to right. */ 237303275Sdelphij for(i = SA, j = SA + n; i < j; ++i) { 238303275Sdelphij if(0 < (s = *i)) { 239303275Sdelphij assert(T[s - 1] >= T[s]); 240303275Sdelphij c0 = T[--s]; 241303275Sdelphij if((s == 0) || (T[s - 1] < c0)) { s = ~s; } 242303275Sdelphij if(c0 != c2) { 243303275Sdelphij BUCKET_A(c2) = k - SA; 244303275Sdelphij k = SA + BUCKET_A(c2 = c0); 245303275Sdelphij } 246303275Sdelphij assert(i < k); 247303275Sdelphij *k++ = s; 248303275Sdelphij } else { 249303275Sdelphij assert(s < 0); 250303275Sdelphij *i = ~s; 251303275Sdelphij } 252303275Sdelphij } 253303275Sdelphij} 254303275Sdelphij 255303275Sdelphij/* Constructs the burrows-wheeler transformed string directly 256303275Sdelphij by using the sorted order of type B* suffixes. */ 257303275Sdelphijstatic 258303275Sdelphijsaidx_t 259303275Sdelphijconstruct_BWT(const sauchar_t *T, saidx_t *SA, 260303275Sdelphij saidx_t *bucket_A, saidx_t *bucket_B, 261303275Sdelphij saidx_t n, saidx_t m) { 262303275Sdelphij saidx_t *i, *j, *k, *orig; 263303275Sdelphij saidx_t s; 264303275Sdelphij saint_t c0, c1, c2; 265303275Sdelphij 266303275Sdelphij if(0 < m) { 267303275Sdelphij /* Construct the sorted order of type B suffixes by using 268303275Sdelphij the sorted order of type B* suffixes. */ 269303275Sdelphij for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) { 270303275Sdelphij /* Scan the suffix array from right to left. */ 271303275Sdelphij for(i = SA + BUCKET_BSTAR(c1, c1 + 1), 272303275Sdelphij j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1; 273303275Sdelphij i <= j; 274303275Sdelphij --j) { 275303275Sdelphij if(0 < (s = *j)) { 276303275Sdelphij assert(T[s] == c1); 277303275Sdelphij assert(((s + 1) < n) && (T[s] <= T[s + 1])); 278303275Sdelphij assert(T[s - 1] <= T[s]); 279303275Sdelphij c0 = T[--s]; 280303275Sdelphij *j = ~((saidx_t)c0); 281303275Sdelphij if((0 < s) && (T[s - 1] > c0)) { s = ~s; } 282303275Sdelphij if(c0 != c2) { 283303275Sdelphij if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; } 284303275Sdelphij k = SA + BUCKET_B(c2 = c0, c1); 285303275Sdelphij } 286303275Sdelphij assert(k < j); 287303275Sdelphij *k-- = s; 288303275Sdelphij } else if(s != 0) { 289303275Sdelphij *j = ~s; 290303275Sdelphij#ifndef NDEBUG 291303275Sdelphij } else { 292303275Sdelphij assert(T[s] == c1); 293303275Sdelphij#endif 294303275Sdelphij } 295303275Sdelphij } 296303275Sdelphij } 297303275Sdelphij } 298303275Sdelphij 299303275Sdelphij /* Construct the BWTed string by using 300303275Sdelphij the sorted order of type B suffixes. */ 301303275Sdelphij k = SA + BUCKET_A(c2 = T[n - 1]); 302303275Sdelphij *k++ = (T[n - 2] < c2) ? ~((saidx_t)T[n - 2]) : (n - 1); 303303275Sdelphij /* Scan the suffix array from left to right. */ 304303275Sdelphij for(i = SA, j = SA + n, orig = SA; i < j; ++i) { 305303275Sdelphij if(0 < (s = *i)) { 306303275Sdelphij assert(T[s - 1] >= T[s]); 307303275Sdelphij c0 = T[--s]; 308303275Sdelphij *i = c0; 309303275Sdelphij if((0 < s) && (T[s - 1] < c0)) { s = ~((saidx_t)T[s - 1]); } 310303275Sdelphij if(c0 != c2) { 311303275Sdelphij BUCKET_A(c2) = k - SA; 312303275Sdelphij k = SA + BUCKET_A(c2 = c0); 313303275Sdelphij } 314303275Sdelphij assert(i < k); 315303275Sdelphij *k++ = s; 316303275Sdelphij } else if(s != 0) { 317303275Sdelphij *i = ~s; 318303275Sdelphij } else { 319303275Sdelphij orig = i; 320303275Sdelphij } 321303275Sdelphij } 322303275Sdelphij 323303275Sdelphij return orig - SA; 324303275Sdelphij} 325303275Sdelphij 326303275Sdelphij 327303275Sdelphij/*---------------------------------------------------------------------------*/ 328303275Sdelphij 329303275Sdelphij/*- Function -*/ 330303275Sdelphij 331303275Sdelphijsaint_t 332303275Sdelphijdivsufsort(const sauchar_t *T, saidx_t *SA, saidx_t n) { 333303275Sdelphij saidx_t *bucket_A, *bucket_B; 334303275Sdelphij saidx_t m; 335303275Sdelphij saint_t err = 0; 336303275Sdelphij 337303275Sdelphij /* Check arguments. */ 338303275Sdelphij if((T == NULL) || (SA == NULL) || (n < 0)) { return -1; } 339303275Sdelphij else if(n == 0) { return 0; } 340303275Sdelphij else if(n == 1) { SA[0] = 0; return 0; } 341303275Sdelphij else if(n == 2) { m = (T[0] < T[1]); SA[m ^ 1] = 0, SA[m] = 1; return 0; } 342303275Sdelphij 343303275Sdelphij bucket_A = (saidx_t *)malloc(BUCKET_A_SIZE * sizeof(saidx_t)); 344303275Sdelphij bucket_B = (saidx_t *)malloc(BUCKET_B_SIZE * sizeof(saidx_t)); 345303275Sdelphij 346303275Sdelphij /* Suffixsort. */ 347303275Sdelphij if((bucket_A != NULL) && (bucket_B != NULL)) { 348303275Sdelphij m = sort_typeBstar(T, SA, bucket_A, bucket_B, n); 349303275Sdelphij construct_SA(T, SA, bucket_A, bucket_B, n, m); 350303275Sdelphij } else { 351303275Sdelphij err = -2; 352303275Sdelphij } 353303275Sdelphij 354303275Sdelphij free(bucket_B); 355303275Sdelphij free(bucket_A); 356303275Sdelphij 357303275Sdelphij return err; 358303275Sdelphij} 359303275Sdelphij 360303275Sdelphijsaidx_t 361303275Sdelphijdivbwt(const sauchar_t *T, sauchar_t *U, saidx_t *A, saidx_t n) { 362303275Sdelphij saidx_t *B; 363303275Sdelphij saidx_t *bucket_A, *bucket_B; 364303275Sdelphij saidx_t m, pidx, i; 365303275Sdelphij 366303275Sdelphij /* Check arguments. */ 367303275Sdelphij if((T == NULL) || (U == NULL) || (n < 0)) { return -1; } 368303275Sdelphij else if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; } 369303275Sdelphij 370303275Sdelphij if((B = A) == NULL) { B = (saidx_t *)malloc((size_t)(n + 1) * sizeof(saidx_t)); } 371303275Sdelphij bucket_A = (saidx_t *)malloc(BUCKET_A_SIZE * sizeof(saidx_t)); 372303275Sdelphij bucket_B = (saidx_t *)malloc(BUCKET_B_SIZE * sizeof(saidx_t)); 373303275Sdelphij 374303275Sdelphij /* Burrows-Wheeler Transform. */ 375303275Sdelphij if((B != NULL) && (bucket_A != NULL) && (bucket_B != NULL)) { 376303275Sdelphij m = sort_typeBstar(T, B, bucket_A, bucket_B, n); 377303275Sdelphij pidx = construct_BWT(T, B, bucket_A, bucket_B, n, m); 378303275Sdelphij 379303275Sdelphij /* Copy to output string. */ 380303275Sdelphij U[0] = T[n - 1]; 381303275Sdelphij for(i = 0; i < pidx; ++i) { U[i + 1] = (sauchar_t)B[i]; } 382303275Sdelphij for(i += 1; i < n; ++i) { U[i] = (sauchar_t)B[i]; } 383303275Sdelphij pidx += 1; 384303275Sdelphij } else { 385303275Sdelphij pidx = -2; 386303275Sdelphij } 387303275Sdelphij 388303275Sdelphij free(bucket_B); 389303275Sdelphij free(bucket_A); 390303275Sdelphij if(A == NULL) { free(B); } 391303275Sdelphij 392303275Sdelphij return pidx; 393303275Sdelphij} 394303275Sdelphij 395303275Sdelphijconst char * 396303275Sdelphijdivsufsort_version(void) { 397303275Sdelphij return PROJECT_VERSION_FULL; 398303275Sdelphij} 399