file.c revision 281535
1170530Ssam/*- 2178354Ssam * Copyright (C) 2009 Gabor Kovesdan <gabor@FreeBSD.org> 3170530Ssam * Copyright (C) 2012 Oleg Moskalenko <mom040267@gmail.com> 4170530Ssam * All rights reserved. 5170530Ssam * 6170530Ssam * Redistribution and use in source and binary forms, with or without 7170530Ssam * modification, are permitted provided that the following conditions 8170530Ssam * are met: 9170530Ssam * 1. Redistributions of source code must retain the above copyright 10170530Ssam * notice, this list of conditions and the following disclaimer. 11170530Ssam * 2. Redistributions in binary form must reproduce the above copyright 12170530Ssam * notice, this list of conditions and the following disclaimer in the 13170530Ssam * documentation and/or other materials provided with the distribution. 14170530Ssam * 15170530Ssam * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16170530Ssam * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17170530Ssam * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18170530Ssam * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19170530Ssam * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20170530Ssam * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21170530Ssam * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22170530Ssam * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23170530Ssam * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24170530Ssam * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25170530Ssam * SUCH DAMAGE. 26170530Ssam */ 27170530Ssam 28170530Ssam#include <sys/cdefs.h> 29170530Ssam__FBSDID("$FreeBSD: stable/10/usr.bin/sort/file.c 281535 2015-04-14 18:57:50Z pfg $"); 30170530Ssam 31170530Ssam#include <sys/mman.h> 32170530Ssam#include <sys/stat.h> 33170530Ssam#include <sys/types.h> 34170530Ssam#include <sys/queue.h> 35170530Ssam 36178354Ssam#include <err.h> 37170530Ssam#include <fcntl.h> 38170530Ssam#if defined(SORT_THREADS) 39170530Ssam#include <pthread.h> 40170530Ssam#endif 41170530Ssam#include <semaphore.h> 42170530Ssam#include <stdio.h> 43170530Ssam#include <stdlib.h> 44170530Ssam#include <string.h> 45170530Ssam#include <unistd.h> 46170530Ssam#include <wchar.h> 47170530Ssam#include <wctype.h> 48170530Ssam 49170530Ssam#include "coll.h" 50178354Ssam#include "file.h" 51170530Ssam#include "radixsort.h" 52170530Ssam 53170530Ssamunsigned long long free_memory = 1000000; 54170530Ssamunsigned long long available_free_memory = 1000000; 55170530Ssam 56178354Ssambool use_mmap; 57178354Ssam 58178354Ssamconst char *tmpdir = "/var/tmp"; 59178354Ssamconst char *compress_program; 60178354Ssam 61178354Ssamsize_t max_open_files = 16; 62178354Ssam 63178354Ssam/* 64178354Ssam * How much space we read from file at once 65178354Ssam */ 66178354Ssam#define READ_CHUNK (4096) 67178354Ssam 68178354Ssam/* 69178354Ssam * File reader structure 70178354Ssam */ 71178354Ssamstruct file_reader 72178354Ssam{ 73170530Ssam struct reader_buffer rb; 74170530Ssam FILE *file; 75170530Ssam char *fname; 76170530Ssam unsigned char *buffer; 77170530Ssam unsigned char *mmapaddr; 78170530Ssam unsigned char *mmapptr; 79170530Ssam size_t bsz; 80170530Ssam size_t cbsz; 81173273Ssam size_t mmapsize; 82173273Ssam size_t strbeg; 83173273Ssam int fd; 84173273Ssam char elsymb; 85173273Ssam}; 86178354Ssam 87178354Ssam/* 88178354Ssam * Structure to be used in file merge process. 89184280Ssam */ 90184280Ssamstruct file_header 91173273Ssam{ 92178354Ssam struct file_reader *fr; 93178354Ssam struct sort_list_item *si; /* current top line */ 94178354Ssam size_t file_pos; 95178354Ssam}; 96178354Ssam 97178354Ssam/* 98178354Ssam * List elements of "cleanable" files list. 99178354Ssam */ 100178354Ssamstruct CLEANABLE_FILE 101178354Ssam{ 102178354Ssam char *fn; 103184280Ssam LIST_ENTRY(CLEANABLE_FILE) files; 104178354Ssam}; 105178354Ssam 106170530Ssam/* 107178354Ssam * List header of "cleanable" files list. 108178354Ssam */ 109170530Ssamstatic LIST_HEAD(CLEANABLE_FILES,CLEANABLE_FILE) tmp_files; 110170530Ssam 111170530Ssam/* 112170530Ssam * Semaphore to protect the tmp file list. 113170530Ssam * We use semaphore here because it is signal-safe, according to POSIX. 114170530Ssam * And semaphore does not require pthread library. 115170530Ssam */ 116170530Ssamstatic sem_t tmp_files_sem; 117170530Ssam 118170530Ssamstatic void mt_sort(struct sort_list *list, 119184280Ssam int (*sort_func)(void *, size_t, size_t, 120184280Ssam int (*)(const void *, const void *)), const char* fn); 121184280Ssam 122184280Ssam/* 123191552Ssam * Init tmp files list 124191552Ssam */ 125191552Ssamvoid 126170530Ssaminit_tmp_files(void) 127170530Ssam{ 128170530Ssam 129170530Ssam LIST_INIT(&tmp_files); 130170530Ssam sem_init(&tmp_files_sem, 0, 1); 131170530Ssam} 132170530Ssam 133178354Ssam/* 134170530Ssam * Save name of a tmp file for signal cleanup 135170530Ssam */ 136170530Ssamvoid 137184280Ssamtmp_file_atexit(const char *tmp_file) 138191552Ssam{ 139191552Ssam 140170530Ssam if (tmp_file) { 141173273Ssam sem_wait(&tmp_files_sem); 142173273Ssam struct CLEANABLE_FILE *item = 143178354Ssam sort_malloc(sizeof(struct CLEANABLE_FILE)); 144173273Ssam item->fn = sort_strdup(tmp_file); 145178354Ssam LIST_INSERT_HEAD(&tmp_files, item, files); 146178354Ssam sem_post(&tmp_files_sem); 147178354Ssam } 148178354Ssam} 149173273Ssam 150178354Ssam/* 151178354Ssam * Clear tmp files 152178354Ssam */ 153178354Ssamvoid 154178354Ssamclear_tmp_files(void) 155178354Ssam{ 156178354Ssam struct CLEANABLE_FILE *item; 157178354Ssam 158178354Ssam sem_wait(&tmp_files_sem); 159178354Ssam LIST_FOREACH(item,&tmp_files,files) { 160178354Ssam if ((item) && (item->fn)) 161178354Ssam unlink(item->fn); 162178354Ssam } 163178354Ssam sem_post(&tmp_files_sem); 164178354Ssam} 165178354Ssam 166170530Ssam/* 167173273Ssam * Check whether a file is a temporary file 168173273Ssam */ 169170530Ssamstatic bool 170170530Ssamfile_is_tmp(const char* fn) 171178354Ssam{ 172178354Ssam struct CLEANABLE_FILE *item; 173178354Ssam bool ret = false; 174178354Ssam 175178354Ssam if (fn) { 176173273Ssam sem_wait(&tmp_files_sem); 177178354Ssam LIST_FOREACH(item,&tmp_files,files) { 178178354Ssam if ((item) && (item->fn)) 179178354Ssam if (strcmp(item->fn, fn) == 0) { 180178354Ssam ret = true; 181170530Ssam break; 182183256Ssam } 183183256Ssam } 184183256Ssam sem_post(&tmp_files_sem); 185183256Ssam } 186170530Ssam 187178354Ssam return (ret); 188178354Ssam} 189178354Ssam 190178354Ssam/* 191178354Ssam * Generate new temporary file name 192178354Ssam */ 193170530Ssamchar * 194178354Ssamnew_tmp_file_name(void) 195178354Ssam{ 196178354Ssam static size_t tfcounter = 0; 197170530Ssam static const char *fn = ".bsdsort."; 198170530Ssam char *ret; 199170530Ssam size_t sz; 200178354Ssam 201170530Ssam sz = strlen(tmpdir) + 1 + strlen(fn) + 32 + 1; 202170530Ssam ret = sort_malloc(sz); 203170530Ssam 204170530Ssam sprintf(ret, "%s/%s%d.%lu", tmpdir, fn, (int) getpid(), (unsigned long)(tfcounter++)); 205170530Ssam tmp_file_atexit(ret); 206170530Ssam return (ret); 207170530Ssam} 208170530Ssam 209170530Ssam/* 210170530Ssam * Initialize file list 211170530Ssam */ 212170530Ssamvoid 213172226Ssamfile_list_init(struct file_list *fl, bool tmp) 214172226Ssam{ 215170530Ssam 216170530Ssam if (fl) { 217178354Ssam fl->count = 0; 218170530Ssam fl->sz = 0; 219170530Ssam fl->fns = NULL; 220170530Ssam fl->tmp = tmp; 221170530Ssam } 222170530Ssam} 223170530Ssam 224170530Ssam/* 225170530Ssam * Add a file name to the list 226170530Ssam */ 227170530Ssamvoid 228170530Ssamfile_list_add(struct file_list *fl, char *fn, bool allocate) 229170530Ssam{ 230170530Ssam 231170530Ssam if (fl && fn) { 232170530Ssam if (fl->count >= fl->sz || (fl->fns == NULL)) { 233170530Ssam fl->sz = (fl->sz) * 2 + 1; 234170530Ssam fl->fns = sort_realloc(fl->fns, fl->sz * 235170530Ssam sizeof(char *)); 236170530Ssam } 237173273Ssam fl->fns[fl->count] = allocate ? sort_strdup(fn) : fn; 238170530Ssam fl->count += 1; 239170530Ssam } 240170530Ssam} 241170530Ssam 242170530Ssam/* 243170530Ssam * Populate file list from array of file names 244170530Ssam */ 245170530Ssamvoid 246170530Ssamfile_list_populate(struct file_list *fl, int argc, char **argv, bool allocate) 247170530Ssam{ 248170530Ssam 249170530Ssam if (fl && argv) { 250170530Ssam int i; 251170530Ssam 252178354Ssam for (i = 0; i < argc; i++) 253173462Ssam file_list_add(fl, argv[i], allocate); 254170530Ssam } 255170530Ssam} 256170530Ssam 257170530Ssam/* 258170530Ssam * Clean file list data and delete the files, 259178354Ssam * if this is a list of temporary files 260170530Ssam */ 261170530Ssamvoid 262170530Ssamfile_list_clean(struct file_list *fl) 263170530Ssam{ 264170530Ssam 265170530Ssam if (fl) { 266170530Ssam if (fl->fns) { 267170530Ssam size_t i; 268170530Ssam 269170530Ssam for (i = 0; i < fl->count; i++) { 270178354Ssam if (fl->fns[i]) { 271173462Ssam if (fl->tmp) 272178354Ssam unlink(fl->fns[i]); 273170530Ssam sort_free(fl->fns[i]); 274170530Ssam fl->fns[i] = 0; 275173462Ssam } 276170530Ssam } 277170530Ssam sort_free(fl->fns); 278170530Ssam fl->fns = NULL; 279178354Ssam } 280170530Ssam fl->sz = 0; 281170530Ssam fl->count = 0; 282178354Ssam fl->tmp = false; 283170530Ssam } 284170530Ssam} 285170530Ssam 286178354Ssam/* 287170530Ssam * Init sort list 288170530Ssam */ 289170530Ssamvoid 290170530Ssamsort_list_init(struct sort_list *l) 291170530Ssam{ 292170530Ssam 293170530Ssam if (l) { 294170530Ssam l->count = 0; 295170530Ssam l->size = 0; 296170530Ssam l->memsize = sizeof(struct sort_list); 297170530Ssam l->list = NULL; 298170530Ssam } 299170530Ssam} 300170530Ssam 301170530Ssam/* 302170530Ssam * Add string to sort list 303170530Ssam */ 304170530Ssamvoid 305170530Ssamsort_list_add(struct sort_list *l, struct bwstring *str) 306170530Ssam{ 307170530Ssam 308170530Ssam if (l && str) { 309170530Ssam size_t indx = l->count; 310170530Ssam 311170530Ssam if ((l->list == NULL) || (indx >= l->size)) { 312170530Ssam size_t newsize = (l->size + 1) + 1024; 313170530Ssam 314170530Ssam l->list = sort_realloc(l->list, 315170530Ssam sizeof(struct sort_list_item*) * newsize); 316170530Ssam l->memsize += (newsize - l->size) * 317170530Ssam sizeof(struct sort_list_item*); 318170530Ssam l->size = newsize; 319170530Ssam } 320170530Ssam l->list[indx] = sort_list_item_alloc(); 321170530Ssam sort_list_item_set(l->list[indx], str); 322170530Ssam l->memsize += sort_list_item_size(l->list[indx]); 323178354Ssam l->count += 1; 324178354Ssam } 325191552Ssam} 326191552Ssam 327191552Ssam/* 328178354Ssam * Clean sort list data 329191552Ssam */ 330191552Ssamvoid 331178354Ssamsort_list_clean(struct sort_list *l) 332178354Ssam{ 333178354Ssam 334178354Ssam if (l) { 335178354Ssam if (l->list) { 336178354Ssam size_t i; 337178354Ssam 338178354Ssam for (i = 0; i < l->count; i++) { 339178354Ssam struct sort_list_item *item; 340178354Ssam 341191552Ssam item = l->list[i]; 342178354Ssam 343191552Ssam if (item) { 344191552Ssam sort_list_item_clean(item); 345178354Ssam sort_free(item); 346178354Ssam l->list[i] = NULL; 347178354Ssam } 348170530Ssam } 349170530Ssam sort_free(l->list); 350170530Ssam l->list = NULL; 351191552Ssam } 352170530Ssam l->count = 0; 353170530Ssam l->size = 0; 354178354Ssam l->memsize = sizeof(struct sort_list); 355170530Ssam } 356170530Ssam} 357170530Ssam 358170530Ssam/* 359170530Ssam * Write sort list to file 360183247Ssam */ 361170530Ssamvoid 362170530Ssamsort_list_dump(struct sort_list *l, const char *fn) 363170530Ssam{ 364170530Ssam 365170530Ssam if (l && fn) { 366183247Ssam FILE *f; 367183247Ssam 368178354Ssam f = openfile(fn, "w"); 369170530Ssam if (f == NULL) 370170530Ssam err(2, NULL); 371170530Ssam 372170530Ssam if (l->list) { 373170530Ssam size_t i; 374170530Ssam if (!(sort_opts_vals.uflag)) { 375170530Ssam for (i = 0; i < l->count; ++i) 376170530Ssam bwsfwrite(l->list[i]->str, f, 377170530Ssam sort_opts_vals.zflag); 378170530Ssam } else { 379170530Ssam struct sort_list_item *last_printed_item = NULL; 380170530Ssam struct sort_list_item *item; 381170530Ssam for (i = 0; i < l->count; ++i) { 382178354Ssam item = l->list[i]; 383170530Ssam if ((last_printed_item == NULL) || 384170530Ssam list_coll(&last_printed_item, &item)) { 385170530Ssam bwsfwrite(item->str, f, sort_opts_vals.zflag); 386170530Ssam last_printed_item = item; 387170530Ssam } 388170530Ssam } 389170530Ssam } 390170530Ssam } 391170530Ssam 392170530Ssam closefile(f, fn); 393170530Ssam } 394170530Ssam} 395170530Ssam 396170530Ssam/* 397170530Ssam * Checks if the given file is sorted. Stops at the first disorder, 398170530Ssam * prints the disordered line and returns 1. 399170530Ssam */ 400170530Ssamint 401170530Ssamcheck(const char *fn) 402170530Ssam{ 403170530Ssam struct bwstring *s1, *s2, *s1disorder, *s2disorder; 404170530Ssam struct file_reader *fr; 405170530Ssam struct keys_array *ka1, *ka2; 406170530Ssam int res; 407170530Ssam size_t pos, posdisorder; 408170530Ssam 409170530Ssam s1 = s2 = s1disorder = s2disorder = NULL; 410170530Ssam ka1 = ka2 = NULL; 411170530Ssam 412178354Ssam fr = file_reader_init(fn); 413170530Ssam 414173273Ssam res = 0; 415173273Ssam pos = 1; 416173273Ssam posdisorder = 1; 417173273Ssam 418173273Ssam if (fr == NULL) { 419178354Ssam err(2, NULL); 420170530Ssam goto end; 421170530Ssam } 422173273Ssam 423170530Ssam s1 = file_reader_readline(fr); 424173273Ssam if (s1 == NULL) 425170530Ssam goto end; 426170530Ssam 427173273Ssam ka1 = keys_array_alloc(); 428170530Ssam preproc(s1, ka1); 429178354Ssam 430170530Ssam s2 = file_reader_readline(fr); 431170530Ssam if (s2 == NULL) 432170530Ssam goto end; 433173273Ssam 434170530Ssam ka2 = keys_array_alloc(); 435170530Ssam preproc(s2, ka2); 436170530Ssam 437170530Ssam for (;;) { 438170530Ssam 439173273Ssam if (debug_sort) { 440178354Ssam bwsprintf(stdout, s2, "s1=<", ">"); 441173273Ssam bwsprintf(stdout, s1, "s2=<", ">"); 442170530Ssam } 443173273Ssam int cmp = key_coll(ka2, ka1, 0); 444170530Ssam if (debug_sort) 445170530Ssam printf("; cmp1=%d", cmp); 446170530Ssam 447173273Ssam if (!cmp && sort_opts_vals.complex_sort && 448170530Ssam !(sort_opts_vals.uflag) && !(sort_opts_vals.sflag)) { 449170530Ssam cmp = top_level_str_coll(s2, s1); 450173273Ssam if (debug_sort) 451173273Ssam printf("; cmp2=%d", cmp); 452173273Ssam } 453173273Ssam if (debug_sort) 454173273Ssam printf("\n"); 455173273Ssam 456173273Ssam if ((sort_opts_vals.uflag && (cmp <= 0)) || (cmp < 0)) { 457173273Ssam if (!(sort_opts_vals.csilentflag)) { 458178354Ssam s2disorder = bwsdup(s2); 459173273Ssam posdisorder = pos; 460173273Ssam if (debug_sort) 461173273Ssam s1disorder = bwsdup(s1); 462173273Ssam } 463173273Ssam res = 1; 464173273Ssam goto end; 465173273Ssam } 466173273Ssam 467173273Ssam pos++; 468173273Ssam 469173273Ssam clean_keys_array(s1, ka1); 470173273Ssam sort_free(ka1); 471173273Ssam ka1 = ka2; 472173273Ssam ka2 = NULL; 473173273Ssam 474173273Ssam bwsfree(s1); 475173273Ssam s1 = s2; 476173273Ssam 477178354Ssam s2 = file_reader_readline(fr); 478173273Ssam if (s2 == NULL) 479173273Ssam goto end; 480173273Ssam 481173273Ssam ka2 = keys_array_alloc(); 482173273Ssam preproc(s2, ka2); 483173273Ssam } 484173273Ssam 485173273Ssamend: 486173273Ssam if (ka1) { 487173273Ssam clean_keys_array(s1, ka1); 488173273Ssam sort_free(ka1); 489173273Ssam } 490173273Ssam 491173273Ssam if (s1) 492178354Ssam bwsfree(s1); 493178354Ssam 494178354Ssam if (ka2) { 495178354Ssam clean_keys_array(s2, ka2); 496173273Ssam sort_free(ka2); 497173273Ssam } 498173273Ssam 499173273Ssam if (s2) 500173273Ssam bwsfree(s2); 501173273Ssam 502173273Ssam if ((fn == NULL) || (*fn == 0) || (strcmp(fn, "-") == 0)) { 503173273Ssam for (;;) { 504173273Ssam s2 = file_reader_readline(fr); 505173273Ssam if (s2 == NULL) 506173273Ssam break; 507173273Ssam bwsfree(s2); 508173273Ssam } 509178354Ssam } 510173273Ssam 511173273Ssam file_reader_free(fr); 512173273Ssam 513173273Ssam if (s2disorder) { 514173273Ssam bws_disorder_warnx(s2disorder, fn, posdisorder); 515173273Ssam if (s1disorder) { 516173273Ssam bws_disorder_warnx(s1disorder, fn, posdisorder); 517173273Ssam if (s1disorder != s2disorder) 518173273Ssam bwsfree(s1disorder); 519173273Ssam } 520173273Ssam bwsfree(s2disorder); 521170530Ssam s1disorder = NULL; 522170530Ssam s2disorder = NULL; 523170530Ssam } 524173273Ssam 525170530Ssam if (res) 526170530Ssam exit(res); 527170530Ssam 528170530Ssam return (0); 529170530Ssam} 530170530Ssam 531170530Ssam/* 532170530Ssam * Opens a file. If the given filename is "-", stdout will be 533173273Ssam * opened. 534173273Ssam */ 535178354SsamFILE * 536170530Ssamopenfile(const char *fn, const char *mode) 537170530Ssam{ 538170530Ssam FILE *file; 539170530Ssam 540170530Ssam if (strcmp(fn, "-") == 0) { 541170530Ssam return ((mode && mode[0] == 'r') ? stdin : stdout); 542183247Ssam } else { 543183247Ssam mode_t orig_file_mask = 0; 544170530Ssam int is_tmp = file_is_tmp(fn); 545170530Ssam 546170530Ssam if (is_tmp && (mode[0] == 'w')) 547170530Ssam orig_file_mask = umask(S_IWGRP | S_IWOTH | 548183247Ssam S_IRGRP | S_IROTH); 549183247Ssam 550183247Ssam if (is_tmp && (compress_program != NULL)) { 551183247Ssam char *cmd; 552183247Ssam size_t cmdsz; 553183247Ssam 554183247Ssam cmdsz = strlen(fn) + 128; 555173273Ssam cmd = sort_malloc(cmdsz); 556173273Ssam 557173273Ssam fflush(stdout); 558173273Ssam 559173273Ssam if (mode[0] == 'r') 560170530Ssam snprintf(cmd, cmdsz - 1, "cat %s | %s -d", 561170530Ssam fn, compress_program); 562170530Ssam else if (mode[0] == 'w') 563170530Ssam snprintf(cmd, cmdsz - 1, "%s > %s", 564170530Ssam compress_program, fn); 565173273Ssam else 566170530Ssam err(2, "%s", getstr(7)); 567182827Ssam 568182827Ssam if ((file = popen(cmd, mode)) == NULL) 569182827Ssam err(2, NULL); 570182827Ssam 571182827Ssam sort_free(cmd); 572182827Ssam 573182827Ssam } else 574182827Ssam if ((file = fopen(fn, mode)) == NULL) 575182827Ssam err(2, NULL); 576182827Ssam 577182827Ssam if (is_tmp && (mode[0] == 'w')) 578182827Ssam umask(orig_file_mask); 579182827Ssam } 580182827Ssam 581182827Ssam return (file); 582173273Ssam} 583173273Ssam 584170530Ssam/* 585170530Ssam * Close file 586170530Ssam */ 587170530Ssamvoid 588170530Ssamclosefile(FILE *f, const char *fn) 589170530Ssam{ 590170530Ssam if (f == NULL) { 591170530Ssam ; 592170530Ssam } else if (f == stdin) { 593170530Ssam ; 594170530Ssam } else if (f == stdout) { 595173273Ssam fflush(f); 596170530Ssam } else { 597170530Ssam if (file_is_tmp(fn) && compress_program != NULL) { 598170530Ssam if(pclose(f)<0) 599170530Ssam err(2,NULL); 600170530Ssam } else 601170530Ssam fclose(f); 602173273Ssam } 603170530Ssam} 604170530Ssam 605170530Ssam/* 606173273Ssam * Reads a file into the internal buffer. 607170530Ssam */ 608170530Ssamstruct file_reader * 609170530Ssamfile_reader_init(const char *fsrc) 610173273Ssam{ 611170530Ssam struct file_reader *ret; 612173273Ssam 613173273Ssam if (fsrc == NULL) 614173273Ssam fsrc = "-"; 615173273Ssam 616173273Ssam ret = sort_malloc(sizeof(struct file_reader)); 617173273Ssam memset(ret, 0, sizeof(struct file_reader)); 618173273Ssam 619173273Ssam ret->elsymb = '\n'; 620173273Ssam if (sort_opts_vals.zflag) 621173273Ssam ret->elsymb = 0; 622173273Ssam 623173273Ssam ret->fname = sort_strdup(fsrc); 624173273Ssam 625173273Ssam if (strcmp(fsrc, "-") && (compress_program == NULL) && use_mmap) { 626170530Ssam 627173273Ssam do { 628173273Ssam struct stat stat_buf; 629173273Ssam void *addr; 630173273Ssam size_t sz = 0; 631173273Ssam int fd, flags; 632170530Ssam 633173273Ssam flags = MAP_NOCORE | MAP_NOSYNC; 634173273Ssam addr = MAP_FAILED; 635173273Ssam 636173273Ssam fd = open(fsrc, O_RDONLY); 637173273Ssam if (fd < 0) 638173273Ssam err(2, NULL); 639173273Ssam 640173273Ssam if (fstat(fd, &stat_buf) < 0) { 641178354Ssam close(fd); 642173273Ssam break; 643173273Ssam } 644173273Ssam 645173273Ssam sz = stat_buf.st_size; 646173273Ssam 647173273Ssam#if defined(MAP_PREFAULT_READ) 648173273Ssam flags |= MAP_PREFAULT_READ; 649173273Ssam#endif 650173273Ssam 651173273Ssam addr = mmap(NULL, sz, PROT_READ, flags, fd, 0); 652173273Ssam if (addr == MAP_FAILED) { 653173273Ssam close(fd); 654173273Ssam break; 655173273Ssam } 656173273Ssam 657173273Ssam ret->fd = fd; 658173273Ssam ret->mmapaddr = addr; 659173273Ssam ret->mmapsize = sz; 660178354Ssam ret->mmapptr = ret->mmapaddr; 661173273Ssam 662178354Ssam } while (0); 663173273Ssam } 664173273Ssam 665173273Ssam if (ret->mmapaddr == NULL) { 666173273Ssam ret->file = openfile(fsrc, "r"); 667173273Ssam if (ret->file == NULL) 668178354Ssam err(2, NULL); 669173273Ssam 670173273Ssam if (strcmp(fsrc, "-")) { 671173273Ssam ret->cbsz = READ_CHUNK; 672173273Ssam ret->buffer = sort_malloc(ret->cbsz); 673173273Ssam ret->bsz = 0; 674173273Ssam ret->strbeg = 0; 675173273Ssam 676173273Ssam ret->bsz = fread(ret->buffer, 1, ret->cbsz, ret->file); 677173273Ssam if (ret->bsz == 0) { 678173273Ssam if (ferror(ret->file)) 679173273Ssam err(2, NULL); 680178354Ssam } 681173273Ssam } 682170530Ssam } 683173273Ssam 684170530Ssam return (ret); 685178354Ssam} 686170530Ssam 687173273Ssamstruct bwstring * 688173273Ssamfile_reader_readline(struct file_reader *fr) 689173273Ssam{ 690173273Ssam struct bwstring *ret = NULL; 691173273Ssam 692173273Ssam if (fr->mmapaddr) { 693173273Ssam unsigned char *mmapend; 694173273Ssam 695173273Ssam mmapend = fr->mmapaddr + fr->mmapsize; 696173273Ssam if (fr->mmapptr >= mmapend) 697173273Ssam return (NULL); 698173273Ssam else { 699170530Ssam unsigned char *strend; 700170530Ssam size_t sz; 701173273Ssam 702173273Ssam sz = mmapend - fr->mmapptr; 703170530Ssam strend = memchr(fr->mmapptr, fr->elsymb, sz); 704178354Ssam 705173273Ssam if (strend == NULL) { 706178354Ssam ret = bwscsbdup(fr->mmapptr, sz); 707173273Ssam fr->mmapptr = mmapend; 708173273Ssam } else { 709173273Ssam ret = bwscsbdup(fr->mmapptr, strend - 710173273Ssam fr->mmapptr); 711178354Ssam fr->mmapptr = strend + 1; 712173273Ssam } 713170530Ssam } 714173273Ssam 715170530Ssam } else if (fr->file != stdin) { 716173273Ssam unsigned char *strend; 717173273Ssam size_t bsz1, remsz, search_start; 718170530Ssam 719170530Ssam search_start = 0; 720170530Ssam remsz = 0; 721170530Ssam strend = NULL; 722170530Ssam 723170530Ssam if (fr->bsz > fr->strbeg) 724173273Ssam remsz = fr->bsz - fr->strbeg; 725170530Ssam 726170530Ssam /* line read cycle */ 727170530Ssam for (;;) { 728170530Ssam if (remsz > search_start) 729178354Ssam strend = memchr(fr->buffer + fr->strbeg + 730170530Ssam search_start, fr->elsymb, remsz - 731170530Ssam search_start); 732170530Ssam else 733170530Ssam strend = NULL; 734170530Ssam 735173273Ssam if (strend) 736173273Ssam break; 737178354Ssam if (feof(fr->file)) 738173273Ssam break; 739173273Ssam 740178354Ssam if (fr->bsz != fr->cbsz) 741173273Ssam /* NOTREACHED */ 742173273Ssam err(2, "File read software error 1"); 743170530Ssam 744170530Ssam if (remsz > (READ_CHUNK >> 1)) { 745170530Ssam search_start = fr->cbsz - fr->strbeg; 746170530Ssam fr->cbsz += READ_CHUNK; 747170530Ssam fr->buffer = sort_realloc(fr->buffer, 748170530Ssam fr->cbsz); 749170530Ssam bsz1 = fread(fr->buffer + fr->bsz, 1, 750170530Ssam READ_CHUNK, fr->file); 751178354Ssam if (bsz1 == 0) { 752170530Ssam if (ferror(fr->file)) 753170530Ssam err(2, NULL); 754178354Ssam break; 755170530Ssam } 756170530Ssam fr->bsz += bsz1; 757178354Ssam remsz += bsz1; 758170530Ssam } else { 759173273Ssam if (remsz > 0 && fr->strbeg>0) 760173273Ssam bcopy(fr->buffer + fr->strbeg, 761170530Ssam fr->buffer, remsz); 762170530Ssam 763173273Ssam fr->strbeg = 0; 764170530Ssam search_start = remsz; 765173273Ssam bsz1 = fread(fr->buffer + remsz, 1, 766173273Ssam fr->cbsz - remsz, fr->file); 767170530Ssam if (bsz1 == 0) { 768178354Ssam if (ferror(fr->file)) 769173273Ssam err(2, NULL); 770170530Ssam break; 771173273Ssam } 772173273Ssam fr->bsz = remsz + bsz1; 773178354Ssam remsz = fr->bsz; 774173273Ssam } 775173273Ssam } 776173273Ssam 777173273Ssam if (strend == NULL) 778173273Ssam strend = fr->buffer + fr->bsz; 779173273Ssam 780173273Ssam if ((fr->buffer + fr->strbeg <= strend) && 781173273Ssam (fr->strbeg < fr->bsz) && (remsz>0)) 782173273Ssam ret = bwscsbdup(fr->buffer + fr->strbeg, strend - 783170530Ssam fr->buffer - fr->strbeg); 784173273Ssam 785170530Ssam fr->strbeg = (strend - fr->buffer) + 1; 786173273Ssam 787173273Ssam } else { 788170530Ssam size_t len = 0; 789178354Ssam 790173273Ssam ret = bwsfgetln(fr->file, &len, sort_opts_vals.zflag, 791173273Ssam &(fr->rb)); 792173273Ssam } 793173273Ssam 794173273Ssam return (ret); 795173273Ssam} 796178354Ssam 797173273Ssamstatic void 798170530Ssamfile_reader_clean(struct file_reader *fr) 799170530Ssam{ 800170530Ssam 801170530Ssam if (fr) { 802170530Ssam if (fr->mmapaddr) 803170530Ssam munmap(fr->mmapaddr, fr->mmapsize); 804170530Ssam 805170530Ssam if (fr->fd) 806170530Ssam close(fr->fd); 807183254Ssam 808170530Ssam if (fr->buffer) 809170530Ssam sort_free(fr->buffer); 810170530Ssam 811170530Ssam if (fr->file) 812173273Ssam if (fr->file != stdin) 813173273Ssam closefile(fr->file, fr->fname); 814173273Ssam 815173273Ssam if(fr->fname) 816173273Ssam sort_free(fr->fname); 817173273Ssam 818173273Ssam memset(fr, 0, sizeof(struct file_reader)); 819173273Ssam } 820170530Ssam} 821170530Ssam 822170530Ssamvoid 823184280Ssamfile_reader_free(struct file_reader *fr) 824173273Ssam{ 825170530Ssam 826173273Ssam if (fr) { 827170530Ssam file_reader_clean(fr); 828170530Ssam sort_free(fr); 829170530Ssam } 830170530Ssam} 831170530Ssam 832170530Ssamint 833170530Ssamprocfile(const char *fsrc, struct sort_list *list, struct file_list *fl) 834170530Ssam{ 835170530Ssam struct file_reader *fr; 836191552Ssam 837170530Ssam fr = file_reader_init(fsrc); 838170530Ssam if (fr == NULL) 839170530Ssam err(2, NULL); 840170530Ssam 841170530Ssam /* file browse cycle */ 842170530Ssam for (;;) { 843170530Ssam struct bwstring *bws; 844184280Ssam 845184280Ssam bws = file_reader_readline(fr); 846170530Ssam 847170530Ssam if (bws == NULL) 848191552Ssam break; 849170530Ssam 850170530Ssam sort_list_add(list, bws); 851182828Ssam 852170530Ssam if (list->memsize >= available_free_memory) { 853170530Ssam char *fn; 854178354Ssam 855178354Ssam fn = new_tmp_file_name(); 856178354Ssam sort_list_to_file(list, fn); 857178354Ssam file_list_add(fl, fn, false); 858178354Ssam sort_list_clean(list); 859178354Ssam } 860178354Ssam } 861178354Ssam 862178354Ssam file_reader_free(fr); 863178354Ssam 864178354Ssam return (0); 865178354Ssam} 866178354Ssam 867178354Ssam/* 868178354Ssam * Compare file headers. Files with EOF always go to the end of the list. 869178354Ssam */ 870178354Ssamstatic int 871178354Ssamfile_header_cmp(struct file_header *f1, struct file_header *f2) 872178354Ssam{ 873178354Ssam 874178354Ssam if (f1 == f2) 875178354Ssam return (0); 876178354Ssam else { 877178354Ssam if (f1->fr == NULL) { 878178354Ssam return ((f2->fr == NULL) ? 0 : +1); 879178354Ssam } else if (f2->fr == NULL) 880178354Ssam return (-1); 881178354Ssam else { 882178354Ssam int ret; 883178354Ssam 884178354Ssam ret = list_coll(&(f1->si), &(f2->si)); 885178354Ssam if (!ret) 886178354Ssam return ((f1->file_pos < f2->file_pos) ? -1 : +1); 887178354Ssam return (ret); 888178354Ssam } 889178354Ssam } 890178354Ssam} 891178354Ssam 892178354Ssam/* 893173273Ssam * Allocate and init file header structure 894173273Ssam */ 895173273Ssamstatic void 896173273Ssamfile_header_init(struct file_header **fh, const char *fn, size_t file_pos) 897173273Ssam{ 898173273Ssam 899173273Ssam if (fh && fn) { 900173273Ssam struct bwstring *line; 901173273Ssam 902173273Ssam *fh = sort_malloc(sizeof(struct file_header)); 903173273Ssam (*fh)->file_pos = file_pos; 904173273Ssam (*fh)->fr = file_reader_init(fn); 905173273Ssam if ((*fh)->fr == NULL) { 906173273Ssam perror(fn); 907173273Ssam err(2, "%s", getstr(8)); 908173273Ssam } 909173273Ssam line = file_reader_readline((*fh)->fr); 910173273Ssam if (line == NULL) { 911173273Ssam file_reader_free((*fh)->fr); 912173273Ssam (*fh)->fr = NULL; 913173273Ssam (*fh)->si = NULL; 914173273Ssam } else { 915173273Ssam (*fh)->si = sort_list_item_alloc(); 916173273Ssam sort_list_item_set((*fh)->si, line); 917173273Ssam } 918173273Ssam } 919173273Ssam} 920173273Ssam 921173273Ssam/* 922173273Ssam * Close file 923173273Ssam */ 924173273Ssamstatic void 925173273Ssamfile_header_close(struct file_header **fh) 926173273Ssam{ 927173273Ssam 928173273Ssam if (fh && *fh) { 929173273Ssam if ((*fh)->fr) { 930173273Ssam file_reader_free((*fh)->fr); 931173273Ssam (*fh)->fr = NULL; 932173273Ssam } 933173273Ssam if ((*fh)->si) { 934173273Ssam sort_list_item_clean((*fh)->si); 935173273Ssam sort_free((*fh)->si); 936173273Ssam (*fh)->si = NULL; 937173273Ssam } 938173273Ssam sort_free(*fh); 939173273Ssam *fh = NULL; 940173273Ssam } 941173273Ssam} 942173273Ssam 943173273Ssam/* 944173273Ssam * Swap two array elements 945178354Ssam */ 946173273Ssamstatic void 947173273Ssamfile_header_swap(struct file_header **fh, size_t i1, size_t i2) 948173273Ssam{ 949178354Ssam struct file_header *tmp; 950173273Ssam 951173273Ssam tmp = fh[i1]; 952173273Ssam fh[i1] = fh[i2]; 953173273Ssam fh[i2] = tmp; 954173273Ssam} 955173273Ssam 956173273Ssam/* heap algorithm ==>> */ 957173273Ssam 958178354Ssam/* 959178354Ssam * See heap sort algorithm 960173273Ssam * "Raises" last element to its right place 961173273Ssam */ 962178354Ssamstatic void 963173273Ssamfile_header_heap_swim(struct file_header **fh, size_t indx) 964173273Ssam{ 965173273Ssam 966173273Ssam if (indx > 0) { 967173273Ssam size_t parent_index; 968173273Ssam 969173273Ssam parent_index = (indx - 1) >> 1; 970173273Ssam 971178354Ssam if (file_header_cmp(fh[indx], fh[parent_index]) < 0) { 972173273Ssam /* swap child and parent and continue */ 973173273Ssam file_header_swap(fh, indx, parent_index); 974173273Ssam file_header_heap_swim(fh, parent_index); 975173273Ssam } 976173273Ssam } 977173273Ssam} 978183256Ssam 979183256Ssam/* 980183256Ssam * Sink the top element to its correct position 981173273Ssam */ 982173273Ssamstatic void 983173273Ssamfile_header_heap_sink(struct file_header **fh, size_t indx, size_t size) 984173273Ssam{ 985173273Ssam size_t left_child_index; 986173273Ssam size_t right_child_index; 987173273Ssam 988173273Ssam left_child_index = indx + indx + 1; 989173273Ssam right_child_index = left_child_index + 1; 990173273Ssam 991173273Ssam if (left_child_index < size) { 992173273Ssam size_t min_child_index; 993173273Ssam 994173273Ssam min_child_index = left_child_index; 995173273Ssam 996173273Ssam if ((right_child_index < size) && 997173273Ssam (file_header_cmp(fh[left_child_index], 998173273Ssam fh[right_child_index]) > 0)) 999178354Ssam min_child_index = right_child_index; 1000178354Ssam if (file_header_cmp(fh[indx], fh[min_child_index]) > 0) { 1001178354Ssam file_header_swap(fh, indx, min_child_index); 1002178354Ssam file_header_heap_sink(fh, min_child_index, size); 1003178354Ssam } 1004178354Ssam } 1005178354Ssam} 1006178354Ssam 1007183253Ssam/* <<== heap algorithm */ 1008183253Ssam 1009183253Ssam/* 1010178354Ssam * Adds element to the "left" end 1011178354Ssam */ 1012178354Ssamstatic void 1013178354Ssamfile_header_list_rearrange_from_header(struct file_header **fh, size_t size) 1014178354Ssam{ 1015178354Ssam 1016178354Ssam file_header_heap_sink(fh, 0, size); 1017178354Ssam} 1018178354Ssam 1019178354Ssam/* 1020178354Ssam * Adds element to the "right" end 1021178354Ssam */ 1022178354Ssamstatic void 1023178354Ssamfile_header_list_push(struct file_header *f, struct file_header **fh, size_t size) 1024178354Ssam{ 1025178354Ssam 1026173273Ssam fh[size++] = f; 1027173273Ssam file_header_heap_swim(fh, size - 1); 1028173273Ssam} 1029173273Ssam 1030173273Ssamstruct last_printed 1031173273Ssam{ 1032173273Ssam struct bwstring *str; 1033173273Ssam}; 1034173273Ssam 1035173273Ssam/* 1036173273Ssam * Prints the current line of the file 1037178354Ssam */ 1038178354Ssamstatic void 1039178354Ssamfile_header_print(struct file_header *fh, FILE *f_out, struct last_printed *lp) 1040178354Ssam{ 1041173273Ssam 1042178354Ssam if (fh && fh->fr && f_out && fh->si && fh->si->str) { 1043178354Ssam if (sort_opts_vals.uflag) { 1044178354Ssam if ((lp->str == NULL) || (str_list_coll(lp->str, &(fh->si)))) { 1045173273Ssam bwsfwrite(fh->si->str, f_out, sort_opts_vals.zflag); 1046173273Ssam if (lp->str) 1047173273Ssam bwsfree(lp->str); 1048173273Ssam lp->str = bwsdup(fh->si->str); 1049173273Ssam } 1050173273Ssam } else 1051173273Ssam bwsfwrite(fh->si->str, f_out, sort_opts_vals.zflag); 1052173273Ssam } 1053173273Ssam} 1054173273Ssam 1055173273Ssam/* 1056173273Ssam * Read next line 1057173273Ssam */ 1058173273Ssamstatic void 1059173273Ssamfile_header_read_next(struct file_header *fh) 1060173273Ssam{ 1061173273Ssam 1062173273Ssam if (fh && fh->fr) { 1063173273Ssam struct bwstring *tmp; 1064173273Ssam 1065173273Ssam tmp = file_reader_readline(fh->fr); 1066173273Ssam if (tmp == NULL) { 1067173273Ssam file_reader_free(fh->fr); 1068173273Ssam fh->fr = NULL; 1069173273Ssam if (fh->si) { 1070173273Ssam sort_list_item_clean(fh->si); 1071173273Ssam sort_free(fh->si); 1072173273Ssam fh->si = NULL; 1073173273Ssam } 1074173273Ssam } else { 1075173273Ssam if (fh->si == NULL) 1076173273Ssam fh->si = sort_list_item_alloc(); 1077173273Ssam sort_list_item_set(fh->si, tmp); 1078173273Ssam } 1079173273Ssam } 1080173273Ssam} 1081173273Ssam 1082173273Ssam/* 1083173273Ssam * Merge array of "files headers" 1084173273Ssam */ 1085173273Ssamstatic void 1086173273Ssamfile_headers_merge(size_t fnum, struct file_header **fh, FILE *f_out) 1087173273Ssam{ 1088173273Ssam struct last_printed lp; 1089173273Ssam size_t i; 1090173273Ssam 1091173273Ssam memset(&lp, 0, sizeof(lp)); 1092173273Ssam 1093173273Ssam /* 1094178354Ssam * construct the initial sort structure 1095178354Ssam */ 1096178354Ssam for (i = 0; i < fnum; i++) 1097178354Ssam file_header_list_push(fh[i], fh, i); 1098178354Ssam 1099178354Ssam while (fh[0]->fr) { /* unfinished files are always in front */ 1100178354Ssam /* output the smallest line: */ 1101178354Ssam file_header_print(fh[0], f_out, &lp); 1102178354Ssam /* read a new line, if possible: */ 1103173273Ssam file_header_read_next(fh[0]); 1104173273Ssam /* re-arrange the list: */ 1105178354Ssam file_header_list_rearrange_from_header(fh, fnum); 1106173273Ssam } 1107178354Ssam 1108183246Ssam if (lp.str) 1109178354Ssam bwsfree(lp.str); 1110178354Ssam} 1111178354Ssam 1112183246Ssam/* 1113178354Ssam * Merges the given files into the output file, which can be 1114178354Ssam * stdout. 1115178354Ssam */ 1116183246Ssamstatic void 1117183246Ssammerge_files_array(size_t argc, char **argv, const char *fn_out) 1118183246Ssam{ 1119183246Ssam 1120183246Ssam if (argv && fn_out) { 1121183246Ssam struct file_header **fh; 1122183246Ssam FILE *f_out; 1123181197Ssam size_t i; 1124178354Ssam 1125173273Ssam f_out = openfile(fn_out, "w"); 1126173273Ssam 1127173273Ssam if (f_out == NULL) 1128173273Ssam err(2, NULL); 1129173273Ssam 1130173273Ssam fh = sort_malloc((argc + 1) * sizeof(struct file_header *)); 1131173273Ssam 1132173273Ssam for (i = 0; i < argc; i++) 1133173273Ssam file_header_init(fh + i, argv[i], (size_t) i); 1134173273Ssam 1135173273Ssam file_headers_merge(argc, fh, f_out); 1136173273Ssam 1137173273Ssam for (i = 0; i < argc; i++) 1138173273Ssam file_header_close(fh + i); 1139173273Ssam 1140173273Ssam sort_free(fh); 1141173273Ssam 1142173273Ssam closefile(f_out, fn_out); 1143178354Ssam } 1144173273Ssam} 1145173273Ssam 1146173273Ssam/* 1147173273Ssam * Shrinks the file list until its size smaller than max number of opened files 1148173273Ssam */ 1149173273Ssamstatic int 1150173273Ssamshrink_file_list(struct file_list *fl) 1151170530Ssam{ 1152170530Ssam 1153170530Ssam if ((fl == NULL) || (size_t) (fl->count) < max_open_files) 1154170530Ssam return (0); 1155170530Ssam else { 1156170530Ssam struct file_list new_fl; 1157170530Ssam size_t indx = 0; 1158170530Ssam 1159170530Ssam file_list_init(&new_fl, true); 1160170530Ssam while (indx < fl->count) { 1161170530Ssam char *fnew; 1162170530Ssam size_t num; 1163170530Ssam 1164170530Ssam num = fl->count - indx; 1165170530Ssam fnew = new_tmp_file_name(); 1166170530Ssam 1167170530Ssam if ((size_t) num >= max_open_files) 1168170530Ssam num = max_open_files - 1; 1169170530Ssam merge_files_array(num, fl->fns + indx, fnew); 1170170530Ssam if (fl->tmp) { 1171170530Ssam size_t i; 1172170530Ssam 1173170530Ssam for (i = 0; i < num; i++) 1174170530Ssam unlink(fl->fns[indx + i]); 1175170530Ssam } 1176170530Ssam file_list_add(&new_fl, fnew, false); 1177170530Ssam indx += num; 1178170530Ssam } 1179183254Ssam fl->tmp = false; /* already taken care of */ 1180183254Ssam file_list_clean(fl); 1181183254Ssam 1182170530Ssam fl->count = new_fl.count; 1183170530Ssam fl->fns = new_fl.fns; 1184170530Ssam fl->sz = new_fl.sz; 1185170530Ssam fl->tmp = new_fl.tmp; 1186170530Ssam 1187172055Ssam return (1); 1188170530Ssam } 1189170530Ssam} 1190170530Ssam 1191183254Ssam/* 1192183254Ssam * Merge list of files 1193183254Ssam */ 1194183254Ssamvoid 1195183254Ssammerge_files(struct file_list *fl, const char *fn_out) 1196183254Ssam{ 1197183254Ssam 1198183254Ssam if (fl && fn_out) { 1199183254Ssam while (shrink_file_list(fl)); 1200183254Ssam 1201183254Ssam merge_files_array(fl->count, fl->fns, fn_out); 1202183254Ssam } 1203183254Ssam} 1204183254Ssam 1205183254Ssamstatic const char * 1206183254Ssamget_sort_method_name(int sm) 1207183254Ssam{ 1208183254Ssam 1209183254Ssam if (sm == SORT_MERGESORT) 1210183254Ssam return "mergesort"; 1211183254Ssam else if (sort_opts_vals.sort_method == SORT_RADIXSORT) 1212183254Ssam return "radixsort"; 1213183254Ssam else if (sort_opts_vals.sort_method == SORT_HEAPSORT) 1214183254Ssam return "heapsort"; 1215183254Ssam else 1216183254Ssam return "quicksort"; 1217183254Ssam} 1218183254Ssam 1219183254Ssam/* 1220183254Ssam * Wrapper for qsort 1221183254Ssam */ 1222173273Ssamstatic int sort_qsort(void *list, size_t count, size_t elem_size, 1223173273Ssam int (*cmp_func)(const void *, const void *)) 1224183254Ssam{ 1225173273Ssam 1226178354Ssam qsort(list, count, elem_size, cmp_func); 1227173273Ssam return (0); 1228173273Ssam} 1229173273Ssam 1230173273Ssam/* 1231173273Ssam * Sort list of lines and writes it to the file 1232183254Ssam */ 1233183254Ssamvoid 1234173273Ssamsort_list_to_file(struct sort_list *list, const char *outfile) 1235173273Ssam{ 1236173273Ssam struct sort_mods *sm = &(keys[0].sm); 1237183254Ssam 1238173273Ssam if (!(sm->Mflag) && !(sm->Rflag) && !(sm->Vflag) && !(sm->Vflag) && 1239173273Ssam !(sm->gflag) && !(sm->hflag) && !(sm->nflag)) { 1240173273Ssam if ((sort_opts_vals.sort_method == SORT_DEFAULT) && byte_sort) 1241183254Ssam sort_opts_vals.sort_method = SORT_RADIXSORT; 1242173273Ssam 1243173273Ssam } else if (sort_opts_vals.sort_method == SORT_RADIXSORT) 1244173273Ssam err(2, "%s", getstr(9)); 1245173273Ssam 1246173273Ssam /* 1247173273Ssam * to handle stable sort and the unique cases in the 1248173273Ssam * right order, we need stable basic algorithm 1249173273Ssam */ 1250173273Ssam if (sort_opts_vals.sflag) { 1251173273Ssam switch (sort_opts_vals.sort_method){ 1252170530Ssam case SORT_MERGESORT: 1253170530Ssam break; 1254170530Ssam case SORT_RADIXSORT: 1255183255Ssam break; 1256183255Ssam case SORT_DEFAULT: 1257183255Ssam sort_opts_vals.sort_method = SORT_MERGESORT; 1258183255Ssam break; 1259183255Ssam default: 1260183255Ssam errx(2, "%s", getstr(10)); 1261183255Ssam }; 1262183255Ssam } 1263183255Ssam 1264183255Ssam if (sort_opts_vals.sort_method == SORT_DEFAULT) 1265183255Ssam sort_opts_vals.sort_method = DEFAULT_SORT_ALGORITHM; 1266183255Ssam 1267183255Ssam if (debug_sort) 1268183255Ssam printf("sort_method=%s\n", 1269183255Ssam get_sort_method_name(sort_opts_vals.sort_method)); 1270183255Ssam 1271183255Ssam switch (sort_opts_vals.sort_method){ 1272183255Ssam case SORT_RADIXSORT: 1273183255Ssam rxsort(list->list, list->count); 1274183255Ssam sort_list_dump(list, outfile); 1275183255Ssam break; 1276183255Ssam case SORT_MERGESORT: 1277183255Ssam mt_sort(list, mergesort, outfile); 1278183255Ssam break; 1279183255Ssam case SORT_HEAPSORT: 1280183255Ssam mt_sort(list, heapsort, outfile); 1281183257Ssam break; 1282183257Ssam case SORT_QSORT: 1283183257Ssam mt_sort(list, sort_qsort, outfile); 1284183257Ssam break; 1285183257Ssam default: 1286183257Ssam mt_sort(list, DEFAULT_SORT_FUNC, outfile); 1287183257Ssam break; 1288183257Ssam } 1289183257Ssam} 1290183257Ssam 1291183257Ssam/******************* MT SORT ************************/ 1292183257Ssam 1293183257Ssam#if defined(SORT_THREADS) 1294183257Ssam/* semaphore to count threads */ 1295183257Ssamstatic sem_t mtsem; 1296183257Ssam 1297183257Ssam/* current system sort function */ 1298183257Ssamstatic int (*g_sort_func)(void *, size_t, size_t, 1299183254Ssam int(*)(const void *, const void *)); 1300183254Ssam 1301183254Ssam/* 1302183254Ssam * Sort cycle thread (in multi-threaded mode) 1303183254Ssam */ 1304183254Ssamstatic void* 1305183254Ssammt_sort_thread(void* arg) 1306183254Ssam{ 1307183254Ssam struct sort_list *list = arg; 1308183254Ssam 1309183254Ssam g_sort_func(list->list, list->count, sizeof(struct sort_list_item *), 1310183254Ssam (int(*)(const void *, const void *)) list_coll); 1311183255Ssam 1312183255Ssam sem_post(&mtsem); 1313183257Ssam 1314183254Ssam return (arg); 1315183254Ssam} 1316183254Ssam 1317183254Ssam/* 1318183254Ssam * Compare sub-lists. Empty sub-lists always go to the end of the list. 1319183254Ssam */ 1320183254Ssamstatic int 1321183254Ssamsub_list_cmp(struct sort_list *l1, struct sort_list *l2) 1322183254Ssam{ 1323183254Ssam 1324183254Ssam if (l1 == l2) 1325183254Ssam return (0); 1326183254Ssam else { 1327183254Ssam if (l1->count == 0) { 1328183254Ssam return ((l2->count == 0) ? 0 : +1); 1329183254Ssam } else if (l2->count == 0) { 1330183254Ssam return (-1); 1331183256Ssam } else { 1332183256Ssam int ret; 1333183256Ssam 1334183256Ssam ret = list_coll(&(l1->list[0]), &(l2->list[0])); 1335183256Ssam if (!ret) 1336183256Ssam return ((l1->sub_list_pos < l2->sub_list_pos) ? 1337183254Ssam -1 : +1); 1338183254Ssam return (ret); 1339183254Ssam } 1340183254Ssam } 1341183254Ssam} 1342183254Ssam 1343183254Ssam/* 1344183254Ssam * Swap two array elements 1345183254Ssam */ 1346183254Ssamstatic void 1347183254Ssamsub_list_swap(struct sort_list **sl, size_t i1, size_t i2) 1348183254Ssam{ 1349183254Ssam struct sort_list *tmp; 1350183255Ssam 1351183255Ssam tmp = sl[i1]; 1352183257Ssam sl[i1] = sl[i2]; 1353183254Ssam sl[i2] = tmp; 1354183254Ssam} 1355183254Ssam 1356183254Ssam/* heap algorithm ==>> */ 1357183254Ssam 1358183254Ssam/* 1359183254Ssam * See heap sort algorithm 1360183254Ssam * "Raises" last element to its right place 1361183254Ssam */ 1362183254Ssamstatic void 1363183254Ssamsub_list_swim(struct sort_list **sl, size_t indx) 1364183254Ssam{ 1365183254Ssam 1366183254Ssam if (indx > 0) { 1367183254Ssam size_t parent_index; 1368183254Ssam 1369170530Ssam parent_index = (indx - 1) >> 1; 1370170530Ssam 1371170530Ssam if (sub_list_cmp(sl[indx], sl[parent_index]) < 0) { 1372170530Ssam /* swap child and parent and continue */ 1373170530Ssam sub_list_swap(sl, indx, parent_index); 1374178354Ssam sub_list_swim(sl, parent_index); 1375170530Ssam } 1376170530Ssam } 1377170530Ssam} 1378170530Ssam 1379170530Ssam/* 1380170530Ssam * Sink the top element to its correct position 1381170530Ssam */ 1382170530Ssamstatic void 1383170530Ssamsub_list_sink(struct sort_list **sl, size_t indx, size_t size) 1384170530Ssam{ 1385170530Ssam size_t left_child_index; 1386170530Ssam size_t right_child_index; 1387170530Ssam 1388170530Ssam left_child_index = indx + indx + 1; 1389178354Ssam right_child_index = left_child_index + 1; 1390170530Ssam 1391170530Ssam if (left_child_index < size) { 1392170530Ssam size_t min_child_index; 1393178354Ssam 1394170530Ssam min_child_index = left_child_index; 1395170530Ssam 1396170530Ssam if ((right_child_index < size) && 1397170530Ssam (sub_list_cmp(sl[left_child_index], 1398170530Ssam sl[right_child_index]) > 0)) 1399170530Ssam min_child_index = right_child_index; 1400170530Ssam if (sub_list_cmp(sl[indx], sl[min_child_index]) > 0) { 1401170530Ssam sub_list_swap(sl, indx, min_child_index); 1402170530Ssam sub_list_sink(sl, min_child_index, size); 1403170530Ssam } 1404170530Ssam } 1405170530Ssam} 1406170530Ssam 1407170530Ssam/* <<== heap algorithm */ 1408170530Ssam 1409170530Ssam/* 1410170530Ssam * Adds element to the "right" end 1411170530Ssam */ 1412170530Ssamstatic void 1413170530Ssamsub_list_push(struct sort_list *s, struct sort_list **sl, size_t size) 1414170530Ssam{ 1415170530Ssam 1416170530Ssam sl[size++] = s; 1417170530Ssam sub_list_swim(sl, size - 1); 1418178354Ssam} 1419170530Ssam 1420170530Ssamstruct last_printed_item 1421170530Ssam{ 1422170530Ssam struct sort_list_item *item; 1423170530Ssam}; 1424170530Ssam 1425170530Ssam/* 1426170530Ssam * Prints the current line of the file 1427170530Ssam */ 1428170530Ssamstatic void 1429170530Ssamsub_list_header_print(struct sort_list *sl, FILE *f_out, 1430170530Ssam struct last_printed_item *lp) 1431170530Ssam{ 1432170530Ssam 1433184280Ssam if (sl && sl->count && f_out && sl->list[0]->str) { 1434184280Ssam if (sort_opts_vals.uflag) { 1435184280Ssam if ((lp->item == NULL) || (list_coll(&(lp->item), 1436184280Ssam &(sl->list[0])))) { 1437184280Ssam bwsfwrite(sl->list[0]->str, f_out, 1438184280Ssam sort_opts_vals.zflag); 1439184280Ssam lp->item = sl->list[0]; 1440184280Ssam } 1441184280Ssam } else 1442184280Ssam bwsfwrite(sl->list[0]->str, f_out, 1443184280Ssam sort_opts_vals.zflag); 1444184280Ssam } 1445184280Ssam} 1446184280Ssam 1447184280Ssam/* 1448184280Ssam * Read next line 1449184280Ssam */ 1450184280Ssamstatic void 1451184280Ssamsub_list_next(struct sort_list *sl) 1452184280Ssam{ 1453184280Ssam 1454184280Ssam if (sl && sl->count) { 1455184280Ssam sl->list += 1; 1456184280Ssam sl->count -= 1; 1457184280Ssam } 1458184280Ssam} 1459184280Ssam 1460184280Ssam/* 1461184280Ssam * Merge sub-lists to a file 1462184280Ssam */ 1463184280Ssamstatic void 1464184280Ssammerge_sub_lists(struct sort_list **sl, size_t n, FILE* f_out) 1465170530Ssam{ 1466170530Ssam struct last_printed_item lp; 1467170530Ssam size_t i; 1468170530Ssam 1469170530Ssam memset(&lp,0,sizeof(lp)); 1470170530Ssam 1471170530Ssam /* construct the initial list: */ 1472170530Ssam for (i = 0; i < n; i++) 1473170530Ssam sub_list_push(sl[i], sl, i); 1474170530Ssam 1475170530Ssam while (sl[0]->count) { /* unfinished lists are always in front */ 1476170530Ssam /* output the smallest line: */ 1477170530Ssam sub_list_header_print(sl[0], f_out, &lp); 1478178354Ssam /* move to a new line, if possible: */ 1479170530Ssam sub_list_next(sl[0]); 1480170530Ssam /* re-arrange the list: */ 1481178354Ssam sub_list_sink(sl, 0, n); 1482170530Ssam } 1483170530Ssam} 1484170530Ssam 1485170530Ssam/* 1486170530Ssam * Merge sub-lists to a file 1487170530Ssam */ 1488170530Ssamstatic void 1489170530Ssammerge_list_parts(struct sort_list **parts, size_t n, const char *fn) 1490170530Ssam{ 1491170530Ssam FILE* f_out; 1492170530Ssam 1493170530Ssam f_out = openfile(fn,"w"); 1494170530Ssam 1495170530Ssam merge_sub_lists(parts, n, f_out); 1496170530Ssam 1497170530Ssam closefile(f_out, fn); 1498170530Ssam} 1499170530Ssam 1500170530Ssam#endif /* defined(SORT_THREADS) */ 1501170530Ssam/* 1502170530Ssam * Multi-threaded sort algorithm "driver" 1503170530Ssam */ 1504170530Ssamstatic void 1505170530Ssammt_sort(struct sort_list *list, 1506170530Ssam int(*sort_func)(void *, size_t, size_t, int(*)(const void *, const void *)), 1507170530Ssam const char* fn) 1508170530Ssam{ 1509170530Ssam#if defined(SORT_THREADS) 1510170530Ssam if (nthreads < 2 || list->count < MT_SORT_THRESHOLD) { 1511170530Ssam size_t nthreads_save = nthreads; 1512170530Ssam nthreads = 1; 1513170530Ssam#endif 1514170530Ssam /* if single thread or small data, do simple sort */ 1515170530Ssam sort_func(list->list, list->count, 1516170530Ssam sizeof(struct sort_list_item *), 1517170530Ssam (int(*)(const void *, const void *)) list_coll); 1518170530Ssam sort_list_dump(list, fn); 1519170530Ssam#if defined(SORT_THREADS) 1520170530Ssam nthreads = nthreads_save; 1521170530Ssam } else { 1522170530Ssam /* multi-threaded sort */ 1523170530Ssam struct sort_list **parts; 1524170530Ssam size_t avgsize, cstart, i; 1525170530Ssam 1526184280Ssam /* array of sub-lists */ 1527170530Ssam parts = sort_malloc(sizeof(struct sort_list*) * nthreads); 1528170530Ssam cstart = 0; 1529170530Ssam avgsize = list->count / nthreads; 1530170530Ssam 1531170530Ssam /* set global system sort function */ 1532170530Ssam g_sort_func = sort_func; 1533170530Ssam 1534170530Ssam /* set sublists */ 1535184280Ssam for (i = 0; i < nthreads; ++i) { 1536184280Ssam size_t sz = 0; 1537170530Ssam 1538184280Ssam parts[i] = sort_malloc(sizeof(struct sort_list)); 1539173273Ssam parts[i]->list = list->list + cstart; 1540173273Ssam parts[i]->memsize = 0; 1541173273Ssam parts[i]->sub_list_pos = i; 1542170530Ssam 1543170530Ssam sz = (i == nthreads - 1) ? list->count - cstart : 1544170530Ssam avgsize; 1545170530Ssam 1546170530Ssam parts[i]->count = sz; 1547170530Ssam 1548170530Ssam parts[i]->size = parts[i]->count; 1549170530Ssam 1550170530Ssam cstart += sz; 1551170530Ssam } 1552170530Ssam 1553170530Ssam /* init threads counting semaphore */ 1554170530Ssam sem_init(&mtsem, 0, 0); 1555170530Ssam 1556182830Ssam /* start threads */ 1557170530Ssam for (i = 0; i < nthreads; ++i) { 1558170530Ssam pthread_t pth; 1559170530Ssam pthread_attr_t attr; 1560170530Ssam 1561170530Ssam pthread_attr_init(&attr); 1562170530Ssam pthread_attr_setdetachstate(&attr, PTHREAD_DETACHED); 1563170530Ssam 1564170530Ssam for (;;) { 1565170530Ssam int res = pthread_create(&pth, &attr, 1566170530Ssam mt_sort_thread, parts[i]); 1567170530Ssam 1568170530Ssam if (res >= 0) 1569170530Ssam break; 1570170530Ssam if (errno == EAGAIN) { 1571170530Ssam pthread_yield(); 1572170530Ssam continue; 1573178354Ssam } 1574170530Ssam err(2, NULL); 1575170530Ssam } 1576170530Ssam 1577182829Ssam pthread_attr_destroy(&attr); 1578170530Ssam } 1579170530Ssam 1580170530Ssam /* wait for threads completion */ 1581170530Ssam for (i = 0; i < nthreads; ++i) { 1582170530Ssam sem_wait(&mtsem); 1583170530Ssam } 1584170530Ssam /* destroy the semaphore - we do not need it anymore */ 1585170530Ssam sem_destroy(&mtsem); 1586170530Ssam 1587170530Ssam /* merge sorted sub-lists to the file */ 1588170530Ssam merge_list_parts(parts, nthreads, fn); 1589170530Ssam 1590170530Ssam /* free sub-lists data */ 1591170530Ssam for (i = 0; i < nthreads; ++i) { 1592170530Ssam sort_free(parts[i]); 1593170530Ssam } 1594178354Ssam sort_free(parts); 1595170530Ssam } 1596170530Ssam#endif /* defined(SORT_THREADS) */ 1597170530Ssam} 1598173273Ssam