1/* sort - sort lines of text (with all kinds of options).
2   Copyright (C) 1988-2016 Free Software Foundation, Inc.
3
4   This program is free software: you can redistribute it and/or modify
5   it under the terms of the GNU General Public License as published by
6   the Free Software Foundation, either version 3 of the License, or
7   (at your option) any later version.
8
9   This program is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   GNU General Public License for more details.
13
14   You should have received a copy of the GNU General Public License
15   along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17   Written December 1988 by Mike Haertel.
18   The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
19   or (US mail) as Mike Haertel c/o Free Software Foundation.
20
21   Ørn E. Hansen added NLS support in 1997.  */
22
23#include <config.h>
24
25#include <getopt.h>
26#include <pthread.h>
27#include <sys/resource.h>
28#include <sys/types.h>
29#include <sys/wait.h>
30#include <signal.h>
31#include <assert.h>
32#include "system.h"
33#include "argmatch.h"
34#include "error.h"
35#include "fadvise.h"
36#include "filevercmp.h"
37#include "hard-locale.h"
38#include "hash.h"
39#include "heap.h"
40#include "ignore-value.h"
41#include "md5.h"
42#include "mbswidth.h"
43#include "nproc.h"
44#include "physmem.h"
45#include "posixver.h"
46#include "quote.h"
47#include "randread.h"
48#include "readtokens0.h"
49#include "stdio--.h"
50#include "stdlib--.h"
51#include "strnumcmp.h"
52#include "xmemcoll.h"
53#include "xnanosleep.h"
54#include "xstrtol.h"
55
56#ifndef RLIMIT_DATA
57struct rlimit { size_t rlim_cur; };
58# define getrlimit(Resource, Rlp) (-1)
59#endif
60
61/* The official name of this program (e.g., no 'g' prefix).  */
62#define PROGRAM_NAME "sort"
63
64#define AUTHORS \
65  proper_name ("Mike Haertel"), \
66  proper_name ("Paul Eggert")
67
68#if HAVE_LANGINFO_CODESET
69# include <langinfo.h>
70#endif
71
72/* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
73   present.  */
74#ifndef SA_NOCLDSTOP
75# define SA_NOCLDSTOP 0
76/* No sigprocmask.  Always 'return' zero. */
77# define sigprocmask(How, Set, Oset) (0)
78# define sigset_t int
79# if ! HAVE_SIGINTERRUPT
80#  define siginterrupt(sig, flag) /* empty */
81# endif
82#endif
83
84#if !defined OPEN_MAX && defined NR_OPEN
85# define OPEN_MAX NR_OPEN
86#endif
87#if !defined OPEN_MAX
88# define OPEN_MAX 20
89#endif
90
91#define UCHAR_LIM (UCHAR_MAX + 1)
92
93#if HAVE_C99_STRTOLD
94# define long_double long double
95#else
96# define long_double double
97# undef strtold
98# define strtold strtod
99#endif
100
101#ifndef DEFAULT_TMPDIR
102# define DEFAULT_TMPDIR "/tmp"
103#endif
104
105/* Maximum number of lines to merge every time a NODE is taken from
106   the merge queue.  Node is at LEVEL in the binary merge tree,
107   and is responsible for merging TOTAL lines. */
108#define MAX_MERGE(total, level) (((total) >> (2 * ((level) + 1))) + 1)
109
110/* Heuristic value for the number of lines for which it is worth creating
111   a subthread, during an internal merge sort.  I.e., it is a small number
112   of "average" lines for which sorting via two threads is faster than
113   sorting via one on an "average" system.  On a dual-core 2.0 GHz i686
114   system with 3GB of RAM and 2MB of L2 cache, a file containing 128K
115   lines of gensort -a output is sorted slightly faster with --parallel=2
116   than with --parallel=1.  By contrast, using --parallel=1 is about 10%
117   faster than using --parallel=2 with a 64K-line input.  */
118enum { SUBTHREAD_LINES_HEURISTIC = 128 * 1024 };
119verify (4 <= SUBTHREAD_LINES_HEURISTIC);
120
121/* The number of threads after which there are
122   diminishing performance gains.  */
123enum { DEFAULT_MAX_THREADS = 8 };
124
125/* Exit statuses.  */
126enum
127  {
128    /* POSIX says to exit with status 1 if invoked with -c and the
129       input is not properly sorted.  */
130    SORT_OUT_OF_ORDER = 1,
131
132    /* POSIX says any other irregular exit must exit with a status
133       code greater than 1.  */
134    SORT_FAILURE = 2
135  };
136
137enum
138  {
139    /* The number of times we should try to fork a compression process
140       (we retry if the fork call fails).  We don't _need_ to compress
141       temp files, this is just to reduce disk access, so this number
142       can be small.  Each retry doubles in duration.  */
143    MAX_FORK_TRIES_COMPRESS = 4,
144
145    /* The number of times we should try to fork a decompression process.
146       If we can't fork a decompression process, we can't sort, so this
147       number should be big.  Each retry doubles in duration.  */
148    MAX_FORK_TRIES_DECOMPRESS = 9
149  };
150
151enum
152  {
153    /* Level of the end-of-merge node, one level above the root. */
154    MERGE_END = 0,
155
156    /* Level of the root node in merge tree. */
157    MERGE_ROOT = 1
158  };
159
160/* The representation of the decimal point in the current locale.  */
161static int decimal_point;
162
163/* Thousands separator; if -1, then there isn't one.  */
164static int thousands_sep;
165
166/* Nonzero if the corresponding locales are hard.  */
167static bool hard_LC_COLLATE;
168#if HAVE_NL_LANGINFO
169static bool hard_LC_TIME;
170#endif
171
172#define NONZERO(x) ((x) != 0)
173
174/* The kind of blanks for '-b' to skip in various options. */
175enum blanktype { bl_start, bl_end, bl_both };
176
177/* The character marking end of line. Default to \n. */
178static char eolchar = '\n';
179
180/* Lines are held in core as counted strings. */
181struct line
182{
183  char *text;			/* Text of the line. */
184  size_t length;		/* Length including final newline. */
185  char *keybeg;			/* Start of first key. */
186  char *keylim;			/* Limit of first key. */
187};
188
189/* Input buffers. */
190struct buffer
191{
192  char *buf;			/* Dynamically allocated buffer,
193                                   partitioned into 3 regions:
194                                   - input data;
195                                   - unused area;
196                                   - an array of lines, in reverse order.  */
197  size_t used;			/* Number of bytes used for input data.  */
198  size_t nlines;		/* Number of lines in the line array.  */
199  size_t alloc;			/* Number of bytes allocated. */
200  size_t left;			/* Number of bytes left from previous reads. */
201  size_t line_bytes;		/* Number of bytes to reserve for each line. */
202  bool eof;			/* An EOF has been read.  */
203};
204
205/* Sort key.  */
206struct keyfield
207{
208  size_t sword;			/* Zero-origin 'word' to start at. */
209  size_t schar;			/* Additional characters to skip. */
210  size_t eword;			/* Zero-origin last 'word' of key. */
211  size_t echar;			/* Additional characters in field. */
212  bool const *ignore;		/* Boolean array of characters to ignore. */
213  char const *translate;	/* Translation applied to characters. */
214  bool skipsblanks;		/* Skip leading blanks when finding start.  */
215  bool skipeblanks;		/* Skip leading blanks when finding end.  */
216  bool numeric;			/* Flag for numeric comparison.  Handle
217                                   strings of digits with optional decimal
218                                   point, but no exponential notation. */
219  bool random;			/* Sort by random hash of key.  */
220  bool general_numeric;		/* Flag for general, numeric comparison.
221                                   Handle numbers in exponential notation. */
222  bool human_numeric;		/* Flag for sorting by human readable
223                                   units with either SI or IEC prefixes. */
224  bool month;			/* Flag for comparison by month name. */
225  bool reverse;			/* Reverse the sense of comparison. */
226  bool version;			/* sort by version number */
227  bool traditional_used;	/* Traditional key option format is used. */
228  struct keyfield *next;	/* Next keyfield to try. */
229};
230
231struct month
232{
233  char const *name;
234  int val;
235};
236
237/* Binary merge tree node. */
238struct merge_node
239{
240  struct line *lo;              /* Lines to merge from LO child node. */
241  struct line *hi;              /* Lines to merge from HI child node. */
242  struct line *end_lo;          /* End of available lines from LO. */
243  struct line *end_hi;          /* End of available lines from HI. */
244  struct line **dest;           /* Pointer to destination of merge. */
245  size_t nlo;                   /* Total Lines remaining from LO. */
246  size_t nhi;                   /* Total lines remaining from HI. */
247  struct merge_node *parent;    /* Parent node. */
248  struct merge_node *lo_child;  /* LO child node. */
249  struct merge_node *hi_child;  /* HI child node. */
250  unsigned int level;           /* Level in merge tree. */
251  bool queued;                  /* Node is already in heap. */
252  pthread_mutex_t lock;         /* Lock for node operations. */
253};
254
255/* Priority queue of merge nodes. */
256struct merge_node_queue
257{
258  struct heap *priority_queue;  /* Priority queue of merge tree nodes. */
259  pthread_mutex_t mutex;        /* Lock for queue operations. */
260  pthread_cond_t cond;          /* Conditional wait for empty queue to populate
261                                   when popping. */
262};
263
264/* Used to implement --unique (-u).  */
265static struct line saved_line;
266
267/* FIXME: None of these tables work with multibyte character sets.
268   Also, there are many other bugs when handling multibyte characters.
269   One way to fix this is to rewrite 'sort' to use wide characters
270   internally, but doing this with good performance is a bit
271   tricky.  */
272
273/* Table of blanks.  */
274static bool blanks[UCHAR_LIM];
275
276/* Table of non-printing characters. */
277static bool nonprinting[UCHAR_LIM];
278
279/* Table of non-dictionary characters (not letters, digits, or blanks). */
280static bool nondictionary[UCHAR_LIM];
281
282/* Translation table folding lower case to upper.  */
283static char fold_toupper[UCHAR_LIM];
284
285#define MONTHS_PER_YEAR 12
286
287/* Table mapping month names to integers.
288   Alphabetic order allows binary search. */
289static struct month monthtab[] =
290{
291  {"APR", 4},
292  {"AUG", 8},
293  {"DEC", 12},
294  {"FEB", 2},
295  {"JAN", 1},
296  {"JUL", 7},
297  {"JUN", 6},
298  {"MAR", 3},
299  {"MAY", 5},
300  {"NOV", 11},
301  {"OCT", 10},
302  {"SEP", 9}
303};
304
305/* During the merge phase, the number of files to merge at once. */
306#define NMERGE_DEFAULT 16
307
308/* Minimum size for a merge or check buffer.  */
309#define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
310
311/* Minimum sort size; the code might not work with smaller sizes.  */
312#define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
313
314/* The number of bytes needed for a merge or check buffer, which can
315   function relatively efficiently even if it holds only one line.  If
316   a longer line is seen, this value is increased.  */
317static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
318
319/* The approximate maximum number of bytes of main memory to use, as
320   specified by the user.  Zero if the user has not specified a size.  */
321static size_t sort_size;
322
323/* The initial allocation factor for non-regular files.
324   This is used, e.g., when reading from a pipe.
325   Don't make it too big, since it is multiplied by ~130 to
326   obtain the size of the actual buffer sort will allocate.
327   Also, there may be 8 threads all doing this at the same time.  */
328#define INPUT_FILE_SIZE_GUESS (128 * 1024)
329
330/* Array of directory names in which any temporary files are to be created. */
331static char const **temp_dirs;
332
333/* Number of temporary directory names used.  */
334static size_t temp_dir_count;
335
336/* Number of allocated slots in temp_dirs.  */
337static size_t temp_dir_alloc;
338
339/* Flag to reverse the order of all comparisons. */
340static bool reverse;
341
342/* Flag for stable sort.  This turns off the last ditch bytewise
343   comparison of lines, and instead leaves lines in the same order
344   they were read if all keys compare equal.  */
345static bool stable;
346
347/* If TAB has this value, blanks separate fields.  */
348enum { TAB_DEFAULT = CHAR_MAX + 1 };
349
350/* Tab character separating fields.  If TAB_DEFAULT, then fields are
351   separated by the empty string between a non-blank character and a blank
352   character. */
353static int tab = TAB_DEFAULT;
354
355/* Flag to remove consecutive duplicate lines from the output.
356   Only the last of a sequence of equal lines will be output. */
357static bool unique;
358
359/* Nonzero if any of the input files are the standard input. */
360static bool have_read_stdin;
361
362/* List of key field comparisons to be tried.  */
363static struct keyfield *keylist;
364
365/* Program used to (de)compress temp files.  Must accept -d.  */
366static char const *compress_program;
367
368/* Annotate the output with extra info to aid the user.  */
369static bool debug;
370
371/* Maximum number of files to merge in one go.  If more than this
372   number are present, temp files will be used. */
373static unsigned int nmerge = NMERGE_DEFAULT;
374
375/* Output an error to stderr using async-signal-safe routines, and _exit().
376   This can be used safely from signal handlers,
377   and between fork() and exec() of multithreaded processes.  */
378
379static void async_safe_die (int, const char *) ATTRIBUTE_NORETURN;
380static void
381async_safe_die (int errnum, const char *errstr)
382{
383  ignore_value (write (STDERR_FILENO, errstr, strlen (errstr)));
384
385  /* Even if defined HAVE_STRERROR_R, we can't use it,
386     as it may return a translated string etc. and even if not
387     may malloc() which is unsafe.  We might improve this
388     by testing for sys_errlist and using that if available.
389     For now just report the error number.  */
390  if (errnum)
391    {
392      char errbuf[INT_BUFSIZE_BOUND (errnum)];
393      char *p = inttostr (errnum, errbuf);
394      ignore_value (write (STDERR_FILENO, ": errno ", 8));
395      ignore_value (write (STDERR_FILENO, p, strlen (p)));
396    }
397
398  ignore_value (write (STDERR_FILENO, "\n", 1));
399
400  _exit (SORT_FAILURE);
401}
402
403/* Report MESSAGE for FILE, then clean up and exit.
404   If FILE is null, it represents standard output.  */
405
406static void die (char const *, char const *) ATTRIBUTE_NORETURN;
407static void
408die (char const *message, char const *file)
409{
410  error (0, errno, "%s: %s", message,
411         quotef (file ? file : _("standard output")));
412  exit (SORT_FAILURE);
413}
414
415void
416usage (int status)
417{
418  if (status != EXIT_SUCCESS)
419    emit_try_help ();
420  else
421    {
422      printf (_("\
423Usage: %s [OPTION]... [FILE]...\n\
424  or:  %s [OPTION]... --files0-from=F\n\
425"),
426              program_name, program_name);
427      fputs (_("\
428Write sorted concatenation of all FILE(s) to standard output.\n\
429"), stdout);
430
431      emit_stdin_note ();
432      emit_mandatory_arg_note ();
433
434      fputs (_("\
435Ordering options:\n\
436\n\
437"), stdout);
438      fputs (_("\
439  -b, --ignore-leading-blanks  ignore leading blanks\n\
440  -d, --dictionary-order      consider only blanks and alphanumeric characters\
441\n\
442  -f, --ignore-case           fold lower case to upper case characters\n\
443"), stdout);
444      fputs (_("\
445  -g, --general-numeric-sort  compare according to general numerical value\n\
446  -i, --ignore-nonprinting    consider only printable characters\n\
447  -M, --month-sort            compare (unknown) < 'JAN' < ... < 'DEC'\n\
448"), stdout);
449      fputs (_("\
450  -h, --human-numeric-sort    compare human readable numbers (e.g., 2K 1G)\n\
451"), stdout);
452      fputs (_("\
453  -n, --numeric-sort          compare according to string numerical value\n\
454  -R, --random-sort           shuffle, but group identical keys.  See shuf(1)\n\
455      --random-source=FILE    get random bytes from FILE\n\
456  -r, --reverse               reverse the result of comparisons\n\
457"), stdout);
458      fputs (_("\
459      --sort=WORD             sort according to WORD:\n\
460                                general-numeric -g, human-numeric -h, month -M,\
461\n\
462                                numeric -n, random -R, version -V\n\
463  -V, --version-sort          natural sort of (version) numbers within text\n\
464\n\
465"), stdout);
466      fputs (_("\
467Other options:\n\
468\n\
469"), stdout);
470      fputs (_("\
471      --batch-size=NMERGE   merge at most NMERGE inputs at once;\n\
472                            for more use temp files\n\
473"), stdout);
474      fputs (_("\
475  -c, --check, --check=diagnose-first  check for sorted input; do not sort\n\
476  -C, --check=quiet, --check=silent  like -c, but do not report first bad line\
477\n\
478      --compress-program=PROG  compress temporaries with PROG;\n\
479                              decompress them with PROG -d\n\
480"), stdout);
481      fputs (_("\
482      --debug               annotate the part of the line used to sort,\n\
483                              and warn about questionable usage to stderr\n\
484      --files0-from=F       read input from the files specified by\n\
485                            NUL-terminated names in file F;\n\
486                            If F is - then read names from standard input\n\
487"), stdout);
488      fputs (_("\
489  -k, --key=KEYDEF          sort via a key; KEYDEF gives location and type\n\
490  -m, --merge               merge already sorted files; do not sort\n\
491"), stdout);
492      fputs (_("\
493  -o, --output=FILE         write result to FILE instead of standard output\n\
494  -s, --stable              stabilize sort by disabling last-resort comparison\
495\n\
496  -S, --buffer-size=SIZE    use SIZE for main memory buffer\n\
497"), stdout);
498      printf (_("\
499  -t, --field-separator=SEP  use SEP instead of non-blank to blank transition\n\
500  -T, --temporary-directory=DIR  use DIR for temporaries, not $TMPDIR or %s;\n\
501                              multiple options specify multiple directories\n\
502      --parallel=N          change the number of sorts run concurrently to N\n\
503  -u, --unique              with -c, check for strict ordering;\n\
504                              without -c, output only the first of an equal run\
505\n\
506"), DEFAULT_TMPDIR);
507      fputs (_("\
508  -z, --zero-terminated     line delimiter is NUL, not newline\n\
509"), stdout);
510      fputs (HELP_OPTION_DESCRIPTION, stdout);
511      fputs (VERSION_OPTION_DESCRIPTION, stdout);
512      fputs (_("\
513\n\
514KEYDEF is F[.C][OPTS][,F[.C][OPTS]] for start and stop position, where F is a\n\
515field number and C a character position in the field; both are origin 1, and\n\
516the stop position defaults to the line's end.  If neither -t nor -b is in\n\
517effect, characters in a field are counted from the beginning of the preceding\n\
518whitespace.  OPTS is one or more single-letter ordering options [bdfgiMhnRrV],\
519\n\
520which override global ordering options for that key.  If no key is given, use\n\
521the entire line as the key.  Use --debug to diagnose incorrect key usage.\n\
522\n\
523SIZE may be followed by the following multiplicative suffixes:\n\
524"), stdout);
525      fputs (_("\
526% 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
527\n\
528*** WARNING ***\n\
529The locale specified by the environment affects sort order.\n\
530Set LC_ALL=C to get the traditional sort order that uses\n\
531native byte values.\n\
532"), stdout );
533      emit_ancillary_info (PROGRAM_NAME);
534    }
535
536  exit (status);
537}
538
539/* For long options that have no equivalent short option, use a
540   non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
541enum
542{
543  CHECK_OPTION = CHAR_MAX + 1,
544  COMPRESS_PROGRAM_OPTION,
545  DEBUG_PROGRAM_OPTION,
546  FILES0_FROM_OPTION,
547  NMERGE_OPTION,
548  RANDOM_SOURCE_OPTION,
549  SORT_OPTION,
550  PARALLEL_OPTION
551};
552
553static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
554
555static struct option const long_options[] =
556{
557  {"ignore-leading-blanks", no_argument, NULL, 'b'},
558  {"check", optional_argument, NULL, CHECK_OPTION},
559  {"compress-program", required_argument, NULL, COMPRESS_PROGRAM_OPTION},
560  {"debug", no_argument, NULL, DEBUG_PROGRAM_OPTION},
561  {"dictionary-order", no_argument, NULL, 'd'},
562  {"ignore-case", no_argument, NULL, 'f'},
563  {"files0-from", required_argument, NULL, FILES0_FROM_OPTION},
564  {"general-numeric-sort", no_argument, NULL, 'g'},
565  {"ignore-nonprinting", no_argument, NULL, 'i'},
566  {"key", required_argument, NULL, 'k'},
567  {"merge", no_argument, NULL, 'm'},
568  {"month-sort", no_argument, NULL, 'M'},
569  {"numeric-sort", no_argument, NULL, 'n'},
570  {"human-numeric-sort", no_argument, NULL, 'h'},
571  {"version-sort", no_argument, NULL, 'V'},
572  {"random-sort", no_argument, NULL, 'R'},
573  {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
574  {"sort", required_argument, NULL, SORT_OPTION},
575  {"output", required_argument, NULL, 'o'},
576  {"reverse", no_argument, NULL, 'r'},
577  {"stable", no_argument, NULL, 's'},
578  {"batch-size", required_argument, NULL, NMERGE_OPTION},
579  {"buffer-size", required_argument, NULL, 'S'},
580  {"field-separator", required_argument, NULL, 't'},
581  {"temporary-directory", required_argument, NULL, 'T'},
582  {"unique", no_argument, NULL, 'u'},
583  {"zero-terminated", no_argument, NULL, 'z'},
584  {"parallel", required_argument, NULL, PARALLEL_OPTION},
585  {GETOPT_HELP_OPTION_DECL},
586  {GETOPT_VERSION_OPTION_DECL},
587  {NULL, 0, NULL, 0},
588};
589
590#define CHECK_TABLE \
591  _ct_("quiet",          'C') \
592  _ct_("silent",         'C') \
593  _ct_("diagnose-first", 'c')
594
595static char const *const check_args[] =
596{
597#define _ct_(_s, _c) _s,
598  CHECK_TABLE NULL
599#undef  _ct_
600};
601static char const check_types[] =
602{
603#define _ct_(_s, _c) _c,
604  CHECK_TABLE
605#undef  _ct_
606};
607
608#define SORT_TABLE \
609  _st_("general-numeric", 'g') \
610  _st_("human-numeric",   'h') \
611  _st_("month",           'M') \
612  _st_("numeric",         'n') \
613  _st_("random",          'R') \
614  _st_("version",         'V')
615
616static char const *const sort_args[] =
617{
618#define _st_(_s, _c) _s,
619  SORT_TABLE NULL
620#undef  _st_
621};
622static char const sort_types[] =
623{
624#define _st_(_s, _c) _c,
625  SORT_TABLE
626#undef  _st_
627};
628
629/* The set of signals that are caught.  */
630static sigset_t caught_signals;
631
632/* Critical section status.  */
633struct cs_status
634{
635  bool valid;
636  sigset_t sigs;
637};
638
639/* Enter a critical section.  */
640static struct cs_status
641cs_enter (void)
642{
643  struct cs_status status;
644  status.valid = (sigprocmask (SIG_BLOCK, &caught_signals, &status.sigs) == 0);
645  return status;
646}
647
648/* Leave a critical section.  */
649static void
650cs_leave (struct cs_status status)
651{
652  if (status.valid)
653    {
654      /* Ignore failure when restoring the signal mask. */
655      sigprocmask (SIG_SETMASK, &status.sigs, NULL);
656    }
657}
658
659/* Possible states for a temp file.  If compressed, the file's status
660   is unreaped or reaped, depending on whether 'sort' has waited for
661   the subprocess to finish.  */
662enum { UNCOMPRESSED, UNREAPED, REAPED };
663
664/* The list of temporary files. */
665struct tempnode
666{
667  struct tempnode *volatile next;
668  pid_t pid;     /* The subprocess PID; undefined if state == UNCOMPRESSED.  */
669  char state;
670  char name[1];  /* Actual size is 1 + file name length.  */
671};
672static struct tempnode *volatile temphead;
673static struct tempnode *volatile *temptail = &temphead;
674
675/* A file to be sorted.  */
676struct sortfile
677{
678  /* The file's name.  */
679  char const *name;
680
681  /* Non-null if this is a temporary file, in which case NAME == TEMP->name.  */
682  struct tempnode *temp;
683};
684
685/* Map PIDs of unreaped subprocesses to their struct tempnode objects.  */
686static Hash_table *proctab;
687
688enum { INIT_PROCTAB_SIZE = 47 };
689
690static size_t
691proctab_hasher (void const *entry, size_t tabsize)
692{
693  struct tempnode const *node = entry;
694  return node->pid % tabsize;
695}
696
697static bool
698proctab_comparator (void const *e1, void const *e2)
699{
700  struct tempnode const *n1 = e1;
701  struct tempnode const *n2 = e2;
702  return n1->pid == n2->pid;
703}
704
705/* The number of unreaped child processes.  */
706static pid_t nprocs;
707
708static bool delete_proc (pid_t);
709
710/* If PID is positive, wait for the child process with that PID to
711   exit, and assume that PID has already been removed from the process
712   table.  If PID is 0 or -1, clean up some child that has exited (by
713   waiting for it, and removing it from the proc table) and return the
714   child's process ID.  However, if PID is 0 and no children have
715   exited, return 0 without waiting.  */
716
717static pid_t
718reap (pid_t pid)
719{
720  int status;
721  pid_t cpid = waitpid ((pid ? pid : -1), &status, (pid ? 0 : WNOHANG));
722
723  if (cpid < 0)
724    error (SORT_FAILURE, errno, _("waiting for %s [-d]"),
725           quoteaf (compress_program));
726  else if (0 < cpid && (0 < pid || delete_proc (cpid)))
727    {
728      if (! WIFEXITED (status) || WEXITSTATUS (status))
729        error (SORT_FAILURE, 0, _("%s [-d] terminated abnormally"),
730               quoteaf (compress_program));
731      --nprocs;
732    }
733
734  return cpid;
735}
736
737/* TEMP represents a new process; add it to the process table.  Create
738   the process table the first time it's called.  */
739
740static void
741register_proc (struct tempnode *temp)
742{
743  if (! proctab)
744    {
745      proctab = hash_initialize (INIT_PROCTAB_SIZE, NULL,
746                                 proctab_hasher,
747                                 proctab_comparator,
748                                 NULL);
749      if (! proctab)
750        xalloc_die ();
751    }
752
753  temp->state = UNREAPED;
754
755  if (! hash_insert (proctab, temp))
756    xalloc_die ();
757}
758
759/* If PID is in the process table, remove it and return true.
760   Otherwise, return false.  */
761
762static bool
763delete_proc (pid_t pid)
764{
765  struct tempnode test;
766
767  test.pid = pid;
768  struct tempnode *node = hash_delete (proctab, &test);
769  if (! node)
770    return false;
771  node->state = REAPED;
772  return true;
773}
774
775/* Remove PID from the process table, and wait for it to exit if it
776   hasn't already.  */
777
778static void
779wait_proc (pid_t pid)
780{
781  if (delete_proc (pid))
782    reap (pid);
783}
784
785/* Reap any exited children.  Do not block; reap only those that have
786   already exited.  */
787
788static void
789reap_exited (void)
790{
791  while (0 < nprocs && reap (0))
792    continue;
793}
794
795/* Reap at least one exited child, waiting if necessary.  */
796
797static void
798reap_some (void)
799{
800  reap (-1);
801  reap_exited ();
802}
803
804/* Reap all children, waiting if necessary.  */
805
806static void
807reap_all (void)
808{
809  while (0 < nprocs)
810    reap (-1);
811}
812
813/* Clean up any remaining temporary files.  */
814
815static void
816cleanup (void)
817{
818  struct tempnode const *node;
819
820  for (node = temphead; node; node = node->next)
821    unlink (node->name);
822  temphead = NULL;
823}
824
825/* Cleanup actions to take when exiting.  */
826
827static void
828exit_cleanup (void)
829{
830  if (temphead)
831    {
832      /* Clean up any remaining temporary files in a critical section so
833         that a signal handler does not try to clean them too.  */
834      struct cs_status cs = cs_enter ();
835      cleanup ();
836      cs_leave (cs);
837    }
838
839  close_stdout ();
840}
841
842/* Create a new temporary file, returning its newly allocated tempnode.
843   Store into *PFD the file descriptor open for writing.
844   If the creation fails, return NULL and store -1 into *PFD if the
845   failure is due to file descriptor exhaustion and
846   SURVIVE_FD_EXHAUSTION; otherwise, die.  */
847
848static struct tempnode *
849create_temp_file (int *pfd, bool survive_fd_exhaustion)
850{
851  static char const slashbase[] = "/sortXXXXXX";
852  static size_t temp_dir_index;
853  int fd;
854  int saved_errno;
855  char const *temp_dir = temp_dirs[temp_dir_index];
856  size_t len = strlen (temp_dir);
857  struct tempnode *node =
858    xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
859  char *file = node->name;
860  struct cs_status cs;
861
862  memcpy (file, temp_dir, len);
863  memcpy (file + len, slashbase, sizeof slashbase);
864  node->next = NULL;
865  if (++temp_dir_index == temp_dir_count)
866    temp_dir_index = 0;
867
868  /* Create the temporary file in a critical section, to avoid races.  */
869  cs = cs_enter ();
870  fd = mkstemp (file);
871  if (0 <= fd)
872    {
873      *temptail = node;
874      temptail = &node->next;
875    }
876  saved_errno = errno;
877  cs_leave (cs);
878  errno = saved_errno;
879
880  if (fd < 0)
881    {
882      if (! (survive_fd_exhaustion && errno == EMFILE))
883        error (SORT_FAILURE, errno, _("cannot create temporary file in %s"),
884               quoteaf (temp_dir));
885      free (node);
886      node = NULL;
887    }
888
889  *pfd = fd;
890  return node;
891}
892
893/* Return a stream for FILE, opened with mode HOW.  A null FILE means
894   standard output; HOW should be "w".  When opening for input, "-"
895   means standard input.  To avoid confusion, do not return file
896   descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
897   opening an ordinary FILE.  Return NULL if unsuccessful.
898
899   fadvise() is used to specify an access pattern for input files.
900   There are a few hints we could possibly provide,
901   and after careful testing it was decided that
902   specifying POSIX_FADV_SEQUENTIAL was not detrimental
903   to any cases.  On Linux 2.6.31, this option doubles
904   the size of read ahead performed and thus was seen to
905   benefit these cases:
906     Merging
907     Sorting with a smaller internal buffer
908     Reading from faster flash devices
909
910   In _addition_ one could also specify other hints...
911
912   POSIX_FADV_WILLNEED was tested, but Linux 2.6.31
913   at least uses that to _synchronously_ prepopulate the cache
914   with the specified range.  While sort does need to
915   read all of its input before outputting, a synchronous
916   read of the whole file up front precludes any processing
917   that sort could do in parallel with the system doing
918   read ahead of the data. This was seen to have negative effects
919   in a couple of cases:
920     Merging
921     Sorting with a smaller internal buffer
922   Note this option was seen to shorten the runtime for sort
923   on a multicore system with lots of RAM and other processes
924   competing for CPU.  It could be argued that more explicit
925   scheduling hints with 'nice' et. al. are more appropriate
926   for this situation.
927
928   POSIX_FADV_NOREUSE is a possibility as it could lower
929   the priority of input data in the cache as sort will
930   only need to process it once.  However its functionality
931   has changed over Linux kernel versions and as of 2.6.31
932   it does nothing and thus we can't depend on what it might
933   do in future.
934
935   POSIX_FADV_DONTNEED is not appropriate for user specified
936   input files, but for temp files we do want to drop the
937   cache immediately after processing.  This is done implicitly
938   however when the files are unlinked.  */
939
940static FILE *
941stream_open (char const *file, char const *how)
942{
943  FILE *fp;
944
945  if (*how == 'r')
946    {
947      if (STREQ (file, "-"))
948        {
949          have_read_stdin = true;
950          fp = stdin;
951        }
952      else
953        fp = fopen (file, how);
954      fadvise (fp, FADVISE_SEQUENTIAL);
955    }
956  else if (*how == 'w')
957    {
958      if (file && ftruncate (STDOUT_FILENO, 0) != 0)
959        error (SORT_FAILURE, errno, _("%s: error truncating"),
960               quotef (file));
961      fp = stdout;
962    }
963  else
964    assert (!"unexpected mode passed to stream_open");
965
966  return fp;
967}
968
969/* Same as stream_open, except always return a non-null value; die on
970   failure.  */
971
972static FILE *
973xfopen (char const *file, char const *how)
974{
975  FILE *fp = stream_open (file, how);
976  if (!fp)
977    die (_("open failed"), file);
978  return fp;
979}
980
981/* Close FP, whose name is FILE, and report any errors.  */
982
983static void
984xfclose (FILE *fp, char const *file)
985{
986  switch (fileno (fp))
987    {
988    case STDIN_FILENO:
989      /* Allow reading stdin from tty more than once.  */
990      if (feof (fp))
991        clearerr (fp);
992      break;
993
994    case STDOUT_FILENO:
995      /* Don't close stdout just yet.  close_stdout does that.  */
996      if (fflush (fp) != 0)
997        die (_("fflush failed"), file);
998      break;
999
1000    default:
1001      if (fclose (fp) != 0)
1002        die (_("close failed"), file);
1003      break;
1004    }
1005}
1006
1007static void
1008move_fd_or_die (int oldfd, int newfd)
1009{
1010  if (oldfd != newfd)
1011    {
1012      /* This should never fail for our usage.  */
1013      dup2 (oldfd, newfd);
1014      close (oldfd);
1015    }
1016}
1017
1018/* Fork a child process for piping to and do common cleanup.  The
1019   TRIES parameter tells us how many times to try to fork before
1020   giving up.  Return the PID of the child, or -1 (setting errno)
1021   on failure. */
1022
1023static pid_t
1024pipe_fork (int pipefds[2], size_t tries)
1025{
1026#if HAVE_WORKING_FORK
1027  struct tempnode *saved_temphead;
1028  int saved_errno;
1029  double wait_retry = 0.25;
1030  pid_t pid IF_LINT ( = -1);
1031  struct cs_status cs;
1032
1033  if (pipe (pipefds) < 0)
1034    return -1;
1035
1036  /* At least NMERGE + 1 subprocesses are needed.  More could be created, but
1037     uncontrolled subprocess generation can hurt performance significantly.
1038     Allow at most NMERGE + 2 subprocesses, on the theory that there
1039     may be some useful parallelism by letting compression for the
1040     previous merge finish (1 subprocess) in parallel with the current
1041     merge (NMERGE + 1 subprocesses).  */
1042
1043  if (nmerge + 1 < nprocs)
1044    reap_some ();
1045
1046  while (tries--)
1047    {
1048      /* This is so the child process won't delete our temp files
1049         if it receives a signal before exec-ing.  */
1050      cs = cs_enter ();
1051      saved_temphead = temphead;
1052      temphead = NULL;
1053
1054      pid = fork ();
1055      saved_errno = errno;
1056      if (pid)
1057        temphead = saved_temphead;
1058
1059      cs_leave (cs);
1060      errno = saved_errno;
1061
1062      if (0 <= pid || errno != EAGAIN)
1063        break;
1064      else
1065        {
1066          xnanosleep (wait_retry);
1067          wait_retry *= 2;
1068          reap_exited ();
1069        }
1070    }
1071
1072  if (pid < 0)
1073    {
1074      saved_errno = errno;
1075      close (pipefds[0]);
1076      close (pipefds[1]);
1077      errno = saved_errno;
1078    }
1079  else if (pid == 0)
1080    {
1081      close (STDIN_FILENO);
1082      close (STDOUT_FILENO);
1083    }
1084  else
1085    ++nprocs;
1086
1087  return pid;
1088
1089#else  /* ! HAVE_WORKING_FORK */
1090  return -1;
1091#endif
1092}
1093
1094/* Create a temporary file and, if asked for, start a compressor
1095   to that file.  Set *PFP to the file handle and return
1096   the address of the new temp node.  If the creation
1097   fails, return NULL if the failure is due to file descriptor
1098   exhaustion and SURVIVE_FD_EXHAUSTION; otherwise, die.  */
1099
1100static struct tempnode *
1101maybe_create_temp (FILE **pfp, bool survive_fd_exhaustion)
1102{
1103  int tempfd;
1104  struct tempnode *node = create_temp_file (&tempfd, survive_fd_exhaustion);
1105  if (! node)
1106    return NULL;
1107
1108  node->state = UNCOMPRESSED;
1109
1110  if (compress_program)
1111    {
1112      int pipefds[2];
1113
1114      node->pid = pipe_fork (pipefds, MAX_FORK_TRIES_COMPRESS);
1115      if (0 < node->pid)
1116        {
1117          close (tempfd);
1118          close (pipefds[0]);
1119          tempfd = pipefds[1];
1120
1121          register_proc (node);
1122        }
1123      else if (node->pid == 0)
1124        {
1125          /* Being the child of a multithreaded program before exec(),
1126             we're restricted to calling async-signal-safe routines here.  */
1127          close (pipefds[1]);
1128          move_fd_or_die (tempfd, STDOUT_FILENO);
1129          move_fd_or_die (pipefds[0], STDIN_FILENO);
1130
1131          execlp (compress_program, compress_program, (char *) NULL);
1132
1133          async_safe_die (errno, "couldn't execute compress program");
1134        }
1135    }
1136
1137  *pfp = fdopen (tempfd, "w");
1138  if (! *pfp)
1139    die (_("couldn't create temporary file"), node->name);
1140
1141  return node;
1142}
1143
1144/* Create a temporary file and, if asked for, start a compressor
1145   to that file.  Set *PFP to the file handle and return the address
1146   of the new temp node.  Die on failure.  */
1147
1148static struct tempnode *
1149create_temp (FILE **pfp)
1150{
1151  return maybe_create_temp (pfp, false);
1152}
1153
1154/* Open a compressed temp file and start a decompression process through
1155   which to filter the input.  Return NULL (setting errno to
1156   EMFILE) if we ran out of file descriptors, and die on any other
1157   kind of failure.  */
1158
1159static FILE *
1160open_temp (struct tempnode *temp)
1161{
1162  int tempfd, pipefds[2];
1163  FILE *fp = NULL;
1164
1165  if (temp->state == UNREAPED)
1166    wait_proc (temp->pid);
1167
1168  tempfd = open (temp->name, O_RDONLY);
1169  if (tempfd < 0)
1170    return NULL;
1171
1172  pid_t child = pipe_fork (pipefds, MAX_FORK_TRIES_DECOMPRESS);
1173
1174  switch (child)
1175    {
1176    case -1:
1177      if (errno != EMFILE)
1178        error (SORT_FAILURE, errno, _("couldn't create process for %s -d"),
1179               quoteaf (compress_program));
1180      close (tempfd);
1181      errno = EMFILE;
1182      break;
1183
1184    case 0:
1185      /* Being the child of a multithreaded program before exec(),
1186         we're restricted to calling async-signal-safe routines here.  */
1187      close (pipefds[0]);
1188      move_fd_or_die (tempfd, STDIN_FILENO);
1189      move_fd_or_die (pipefds[1], STDOUT_FILENO);
1190
1191      execlp (compress_program, compress_program, "-d", (char *) NULL);
1192
1193      async_safe_die (errno, "couldn't execute compress program (with -d)");
1194
1195    default:
1196      temp->pid = child;
1197      register_proc (temp);
1198      close (tempfd);
1199      close (pipefds[1]);
1200
1201      fp = fdopen (pipefds[0], "r");
1202      if (! fp)
1203        {
1204          int saved_errno = errno;
1205          close (pipefds[0]);
1206          errno = saved_errno;
1207        }
1208      break;
1209    }
1210
1211  return fp;
1212}
1213
1214/* Append DIR to the array of temporary directory names.  */
1215static void
1216add_temp_dir (char const *dir)
1217{
1218  if (temp_dir_count == temp_dir_alloc)
1219    temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
1220
1221  temp_dirs[temp_dir_count++] = dir;
1222}
1223
1224/* Remove NAME from the list of temporary files.  */
1225
1226static void
1227zaptemp (char const *name)
1228{
1229  struct tempnode *volatile *pnode;
1230  struct tempnode *node;
1231  struct tempnode *next;
1232  int unlink_status;
1233  int unlink_errno = 0;
1234  struct cs_status cs;
1235
1236  for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
1237    continue;
1238
1239  if (node->state == UNREAPED)
1240    wait_proc (node->pid);
1241
1242  /* Unlink the temporary file in a critical section to avoid races.  */
1243  next = node->next;
1244  cs = cs_enter ();
1245  unlink_status = unlink (name);
1246  unlink_errno = errno;
1247  *pnode = next;
1248  cs_leave (cs);
1249
1250  if (unlink_status != 0)
1251    error (0, unlink_errno, _("warning: cannot remove: %s"), quotef (name));
1252  if (! next)
1253    temptail = pnode;
1254  free (node);
1255}
1256
1257#if HAVE_NL_LANGINFO
1258
1259static int
1260struct_month_cmp (void const *m1, void const *m2)
1261{
1262  struct month const *month1 = m1;
1263  struct month const *month2 = m2;
1264  return strcmp (month1->name, month2->name);
1265}
1266
1267#endif
1268
1269/* Initialize the character class tables. */
1270
1271static void
1272inittables (void)
1273{
1274  size_t i;
1275
1276  for (i = 0; i < UCHAR_LIM; ++i)
1277    {
1278      blanks[i] = field_sep (i);
1279      nonprinting[i] = ! isprint (i);
1280      nondictionary[i] = ! isalnum (i) && ! field_sep (i);
1281      fold_toupper[i] = toupper (i);
1282    }
1283
1284#if HAVE_NL_LANGINFO
1285  /* If we're not in the "C" locale, read different names for months.  */
1286  if (hard_LC_TIME)
1287    {
1288      for (i = 0; i < MONTHS_PER_YEAR; i++)
1289        {
1290          char const *s;
1291          size_t s_len;
1292          size_t j, k;
1293          char *name;
1294
1295          s = nl_langinfo (ABMON_1 + i);
1296          s_len = strlen (s);
1297          monthtab[i].name = name = xmalloc (s_len + 1);
1298          monthtab[i].val = i + 1;
1299
1300          for (j = k = 0; j < s_len; j++)
1301            if (! isblank (to_uchar (s[j])))
1302              name[k++] = fold_toupper[to_uchar (s[j])];
1303          name[k] = '\0';
1304        }
1305      qsort (monthtab, MONTHS_PER_YEAR, sizeof *monthtab, struct_month_cmp);
1306    }
1307#endif
1308}
1309
1310/* Specify how many inputs may be merged at once.
1311   This may be set on the command-line with the
1312   --batch-size option. */
1313static void
1314specify_nmerge (int oi, char c, char const *s)
1315{
1316  uintmax_t n;
1317  struct rlimit rlimit;
1318  enum strtol_error e = xstrtoumax (s, NULL, 10, &n, NULL);
1319
1320  /* Try to find out how many file descriptors we'll be able
1321     to open.  We need at least nmerge + 3 (STDIN_FILENO,
1322     STDOUT_FILENO and STDERR_FILENO). */
1323  unsigned int max_nmerge = ((getrlimit (RLIMIT_NOFILE, &rlimit) == 0
1324                              ? rlimit.rlim_cur
1325                              : OPEN_MAX)
1326                             - 3);
1327
1328  if (e == LONGINT_OK)
1329    {
1330      nmerge = n;
1331      if (nmerge != n)
1332        e = LONGINT_OVERFLOW;
1333      else
1334        {
1335          if (nmerge < 2)
1336            {
1337              error (0, 0, _("invalid --%s argument %s"),
1338                     long_options[oi].name, quote (s));
1339              error (SORT_FAILURE, 0,
1340                     _("minimum --%s argument is %s"),
1341                     long_options[oi].name, quote ("2"));
1342            }
1343          else if (max_nmerge < nmerge)
1344            {
1345              e = LONGINT_OVERFLOW;
1346            }
1347          else
1348            return;
1349        }
1350    }
1351
1352  if (e == LONGINT_OVERFLOW)
1353    {
1354      char max_nmerge_buf[INT_BUFSIZE_BOUND (max_nmerge)];
1355      error (0, 0, _("--%s argument %s too large"),
1356             long_options[oi].name, quote (s));
1357      error (SORT_FAILURE, 0,
1358             _("maximum --%s argument with current rlimit is %s"),
1359             long_options[oi].name,
1360             uinttostr (max_nmerge, max_nmerge_buf));
1361    }
1362  else
1363    xstrtol_fatal (e, oi, c, long_options, s);
1364}
1365
1366/* Specify the amount of main memory to use when sorting.  */
1367static void
1368specify_sort_size (int oi, char c, char const *s)
1369{
1370  uintmax_t n;
1371  char *suffix;
1372  enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
1373
1374  /* The default unit is KiB.  */
1375  if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
1376    {
1377      if (n <= UINTMAX_MAX / 1024)
1378        n *= 1024;
1379      else
1380        e = LONGINT_OVERFLOW;
1381    }
1382
1383  /* A 'b' suffix means bytes; a '%' suffix means percent of memory.  */
1384  if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
1385    switch (suffix[0])
1386      {
1387      case 'b':
1388        e = LONGINT_OK;
1389        break;
1390
1391      case '%':
1392        {
1393          double mem = physmem_total () * n / 100;
1394
1395          /* Use "<", not "<=", to avoid problems with rounding.  */
1396          if (mem < UINTMAX_MAX)
1397            {
1398              n = mem;
1399              e = LONGINT_OK;
1400            }
1401          else
1402            e = LONGINT_OVERFLOW;
1403        }
1404        break;
1405      }
1406
1407  if (e == LONGINT_OK)
1408    {
1409      /* If multiple sort sizes are specified, take the maximum, so
1410         that option order does not matter.  */
1411      if (n < sort_size)
1412        return;
1413
1414      sort_size = n;
1415      if (sort_size == n)
1416        {
1417          sort_size = MAX (sort_size, MIN_SORT_SIZE);
1418          return;
1419        }
1420
1421      e = LONGINT_OVERFLOW;
1422    }
1423
1424  xstrtol_fatal (e, oi, c, long_options, s);
1425}
1426
1427/* Specify the number of threads to spawn during internal sort.  */
1428static size_t
1429specify_nthreads (int oi, char c, char const *s)
1430{
1431  unsigned long int nthreads;
1432  enum strtol_error e = xstrtoul (s, NULL, 10, &nthreads, "");
1433  if (e == LONGINT_OVERFLOW)
1434    return SIZE_MAX;
1435  if (e != LONGINT_OK)
1436    xstrtol_fatal (e, oi, c, long_options, s);
1437  if (SIZE_MAX < nthreads)
1438    nthreads = SIZE_MAX;
1439  if (nthreads == 0)
1440    error (SORT_FAILURE, 0, _("number in parallel must be nonzero"));
1441  return nthreads;
1442}
1443
1444/* Return the default sort size.  */
1445static size_t
1446default_sort_size (void)
1447{
1448  /* Let SIZE be MEM, but no more than the maximum object size,
1449     total memory, or system resource limits.  Don't bother to check
1450     for values like RLIM_INFINITY since in practice they are not much
1451     less than SIZE_MAX.  */
1452  size_t size = SIZE_MAX;
1453  struct rlimit rlimit;
1454  if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
1455    size = rlimit.rlim_cur;
1456#ifdef RLIMIT_AS
1457  if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
1458    size = rlimit.rlim_cur;
1459#endif
1460
1461  /* Leave a large safety margin for the above limits, as failure can
1462     occur when they are exceeded.  */
1463  size /= 2;
1464
1465#ifdef RLIMIT_RSS
1466  /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1467     Exceeding RSS is not fatal, but can be quite slow.  */
1468  if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
1469    size = rlimit.rlim_cur / 16 * 15;
1470#endif
1471
1472  /* Let MEM be available memory or 1/8 of total memory, whichever
1473     is greater.  */
1474  double avail = physmem_available ();
1475  double total = physmem_total ();
1476  double mem = MAX (avail, total / 8);
1477
1478  /* Leave a 1/4 margin for physical memory.  */
1479  if (total * 0.75 < size)
1480    size = total * 0.75;
1481
1482  /* Return the minimum of MEM and SIZE, but no less than
1483     MIN_SORT_SIZE.  Avoid the MIN macro here, as it is not quite
1484     right when only one argument is floating point.  */
1485  if (mem < size)
1486    size = mem;
1487  return MAX (size, MIN_SORT_SIZE);
1488}
1489
1490/* Return the sort buffer size to use with the input files identified
1491   by FPS and FILES, which are alternate names of the same files.
1492   NFILES gives the number of input files; NFPS may be less.  Assume
1493   that each input line requires LINE_BYTES extra bytes' worth of line
1494   information.  Do not exceed the size bound specified by the user
1495   (or a default size bound, if the user does not specify one).  */
1496
1497static size_t
1498sort_buffer_size (FILE *const *fps, size_t nfps,
1499                  char *const *files, size_t nfiles,
1500                  size_t line_bytes)
1501{
1502  /* A bound on the input size.  If zero, the bound hasn't been
1503     determined yet.  */
1504  static size_t size_bound;
1505
1506  /* In the worst case, each input byte is a newline.  */
1507  size_t worst_case_per_input_byte = line_bytes + 1;
1508
1509  /* Keep enough room for one extra input line and an extra byte.
1510     This extra room might be needed when preparing to read EOF.  */
1511  size_t size = worst_case_per_input_byte + 1;
1512
1513  size_t i;
1514
1515  for (i = 0; i < nfiles; i++)
1516    {
1517      struct stat st;
1518      off_t file_size;
1519      size_t worst_case;
1520
1521      if ((i < nfps ? fstat (fileno (fps[i]), &st)
1522           : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
1523           : stat (files[i], &st))
1524          != 0)
1525        die (_("stat failed"), files[i]);
1526
1527      if (S_ISREG (st.st_mode))
1528        file_size = st.st_size;
1529      else
1530        {
1531          /* The file has unknown size.  If the user specified a sort
1532             buffer size, use that; otherwise, guess the size.  */
1533          if (sort_size)
1534            return sort_size;
1535          file_size = INPUT_FILE_SIZE_GUESS;
1536        }
1537
1538      if (! size_bound)
1539        {
1540          size_bound = sort_size;
1541          if (! size_bound)
1542            size_bound = default_sort_size ();
1543        }
1544
1545      /* Add the amount of memory needed to represent the worst case
1546         where the input consists entirely of newlines followed by a
1547         single non-newline.  Check for overflow.  */
1548      worst_case = file_size * worst_case_per_input_byte + 1;
1549      if (file_size != worst_case / worst_case_per_input_byte
1550          || size_bound - size <= worst_case)
1551        return size_bound;
1552      size += worst_case;
1553    }
1554
1555  return size;
1556}
1557
1558/* Initialize BUF.  Reserve LINE_BYTES bytes for each line; LINE_BYTES
1559   must be at least sizeof (struct line).  Allocate ALLOC bytes
1560   initially.  */
1561
1562static void
1563initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
1564{
1565  /* Ensure that the line array is properly aligned.  If the desired
1566     size cannot be allocated, repeatedly halve it until allocation
1567     succeeds.  The smaller allocation may hurt overall performance,
1568     but that's better than failing.  */
1569  while (true)
1570    {
1571      alloc += sizeof (struct line) - alloc % sizeof (struct line);
1572      buf->buf = malloc (alloc);
1573      if (buf->buf)
1574        break;
1575      alloc /= 2;
1576      if (alloc <= line_bytes + 1)
1577        xalloc_die ();
1578    }
1579
1580  buf->line_bytes = line_bytes;
1581  buf->alloc = alloc;
1582  buf->used = buf->left = buf->nlines = 0;
1583  buf->eof = false;
1584}
1585
1586/* Return one past the limit of the line array.  */
1587
1588static inline struct line *
1589buffer_linelim (struct buffer const *buf)
1590{
1591  void *linelim = buf->buf + buf->alloc;
1592  return linelim;
1593}
1594
1595/* Return a pointer to the first character of the field specified
1596   by KEY in LINE. */
1597
1598static char *
1599begfield (struct line const *line, struct keyfield const *key)
1600{
1601  char *ptr = line->text, *lim = ptr + line->length - 1;
1602  size_t sword = key->sword;
1603  size_t schar = key->schar;
1604
1605  /* The leading field separator itself is included in a field when -t
1606     is absent.  */
1607
1608  if (tab != TAB_DEFAULT)
1609    while (ptr < lim && sword--)
1610      {
1611        while (ptr < lim && *ptr != tab)
1612          ++ptr;
1613        if (ptr < lim)
1614          ++ptr;
1615      }
1616  else
1617    while (ptr < lim && sword--)
1618      {
1619        while (ptr < lim && blanks[to_uchar (*ptr)])
1620          ++ptr;
1621        while (ptr < lim && !blanks[to_uchar (*ptr)])
1622          ++ptr;
1623      }
1624
1625  /* If we're ignoring leading blanks when computing the Start
1626     of the field, skip past them here.  */
1627  if (key->skipsblanks)
1628    while (ptr < lim && blanks[to_uchar (*ptr)])
1629      ++ptr;
1630
1631  /* Advance PTR by SCHAR (if possible), but no further than LIM.  */
1632  ptr = MIN (lim, ptr + schar);
1633
1634  return ptr;
1635}
1636
1637/* Return the limit of (a pointer to the first character after) the field
1638   in LINE specified by KEY. */
1639
1640static char *
1641limfield (struct line const *line, struct keyfield const *key)
1642{
1643  char *ptr = line->text, *lim = ptr + line->length - 1;
1644  size_t eword = key->eword, echar = key->echar;
1645
1646  if (echar == 0)
1647    eword++; /* Skip all of end field.  */
1648
1649  /* Move PTR past EWORD fields or to one past the last byte on LINE,
1650     whichever comes first.  If there are more than EWORD fields, leave
1651     PTR pointing at the beginning of the field having zero-based index,
1652     EWORD.  If a delimiter character was specified (via -t), then that
1653     'beginning' is the first character following the delimiting TAB.
1654     Otherwise, leave PTR pointing at the first 'blank' character after
1655     the preceding field.  */
1656  if (tab != TAB_DEFAULT)
1657    while (ptr < lim && eword--)
1658      {
1659        while (ptr < lim && *ptr != tab)
1660          ++ptr;
1661        if (ptr < lim && (eword || echar))
1662          ++ptr;
1663      }
1664  else
1665    while (ptr < lim && eword--)
1666      {
1667        while (ptr < lim && blanks[to_uchar (*ptr)])
1668          ++ptr;
1669        while (ptr < lim && !blanks[to_uchar (*ptr)])
1670          ++ptr;
1671      }
1672
1673#ifdef POSIX_UNSPECIFIED
1674  /* The following block of code makes GNU sort incompatible with
1675     standard Unix sort, so it's ifdef'd out for now.
1676     The POSIX spec isn't clear on how to interpret this.
1677     FIXME: request clarification.
1678
1679     From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1680     Date: Thu, 30 May 96 12:20:41 -0400
1681     [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1682
1683     [...]I believe I've found another bug in 'sort'.
1684
1685     $ cat /tmp/sort.in
1686     a b c 2 d
1687     pq rs 1 t
1688     $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1689     a b c 2 d
1690     pq rs 1 t
1691     $ /bin/sort -k1.7,1.7 </tmp/sort.in
1692     pq rs 1 t
1693     a b c 2 d
1694
1695     Unix sort produced the answer I expected: sort on the single character
1696     in column 7.  GNU sort produced different results, because it disagrees
1697     on the interpretation of the key-end spec "M.N".  Unix sort reads this
1698     as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1699     "skip M-1 fields, then either N-1 characters or the rest of the current
1700     field, whichever comes first".  This extra clause applies only to
1701     key-ends, not key-starts.
1702     */
1703
1704  /* Make LIM point to the end of (one byte past) the current field.  */
1705  if (tab != TAB_DEFAULT)
1706    {
1707      char *newlim;
1708      newlim = memchr (ptr, tab, lim - ptr);
1709      if (newlim)
1710        lim = newlim;
1711    }
1712  else
1713    {
1714      char *newlim;
1715      newlim = ptr;
1716      while (newlim < lim && blanks[to_uchar (*newlim)])
1717        ++newlim;
1718      while (newlim < lim && !blanks[to_uchar (*newlim)])
1719        ++newlim;
1720      lim = newlim;
1721    }
1722#endif
1723
1724  if (echar != 0) /* We need to skip over a portion of the end field.  */
1725    {
1726      /* If we're ignoring leading blanks when computing the End
1727         of the field, skip past them here.  */
1728      if (key->skipeblanks)
1729        while (ptr < lim && blanks[to_uchar (*ptr)])
1730          ++ptr;
1731
1732      /* Advance PTR by ECHAR (if possible), but no further than LIM.  */
1733      ptr = MIN (lim, ptr + echar);
1734    }
1735
1736  return ptr;
1737}
1738
1739/* Fill BUF reading from FP, moving buf->left bytes from the end
1740   of buf->buf to the beginning first.  If EOF is reached and the
1741   file wasn't terminated by a newline, supply one.  Set up BUF's line
1742   table too.  FILE is the name of the file corresponding to FP.
1743   Return true if some input was read.  */
1744
1745static bool
1746fillbuf (struct buffer *buf, FILE *fp, char const *file)
1747{
1748  struct keyfield const *key = keylist;
1749  char eol = eolchar;
1750  size_t line_bytes = buf->line_bytes;
1751  size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
1752
1753  if (buf->eof)
1754    return false;
1755
1756  if (buf->used != buf->left)
1757    {
1758      memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
1759      buf->used = buf->left;
1760      buf->nlines = 0;
1761    }
1762
1763  while (true)
1764    {
1765      char *ptr = buf->buf + buf->used;
1766      struct line *linelim = buffer_linelim (buf);
1767      struct line *line = linelim - buf->nlines;
1768      size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
1769      char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1770
1771      while (line_bytes + 1 < avail)
1772        {
1773          /* Read as many bytes as possible, but do not read so many
1774             bytes that there might not be enough room for the
1775             corresponding line array.  The worst case is when the
1776             rest of the input file consists entirely of newlines,
1777             except that the last byte is not a newline.  */
1778          size_t readsize = (avail - 1) / (line_bytes + 1);
1779          size_t bytes_read = fread (ptr, 1, readsize, fp);
1780          char *ptrlim = ptr + bytes_read;
1781          char *p;
1782          avail -= bytes_read;
1783
1784          if (bytes_read != readsize)
1785            {
1786              if (ferror (fp))
1787                die (_("read failed"), file);
1788              if (feof (fp))
1789                {
1790                  buf->eof = true;
1791                  if (buf->buf == ptrlim)
1792                    return false;
1793                  if (line_start != ptrlim && ptrlim[-1] != eol)
1794                    *ptrlim++ = eol;
1795                }
1796            }
1797
1798          /* Find and record each line in the just-read input.  */
1799          while ((p = memchr (ptr, eol, ptrlim - ptr)))
1800            {
1801              /* Delimit the line with NUL. This eliminates the need to
1802                 temporarily replace the last byte with NUL when calling
1803                 xmemcoll(), which increases performance.  */
1804              *p = '\0';
1805              ptr = p + 1;
1806              line--;
1807              line->text = line_start;
1808              line->length = ptr - line_start;
1809              mergesize = MAX (mergesize, line->length);
1810              avail -= line_bytes;
1811
1812              if (key)
1813                {
1814                  /* Precompute the position of the first key for
1815                     efficiency.  */
1816                  line->keylim = (key->eword == SIZE_MAX
1817                                  ? p
1818                                  : limfield (line, key));
1819
1820                  if (key->sword != SIZE_MAX)
1821                    line->keybeg = begfield (line, key);
1822                  else
1823                    {
1824                      if (key->skipsblanks)
1825                        while (blanks[to_uchar (*line_start)])
1826                          line_start++;
1827                      line->keybeg = line_start;
1828                    }
1829                }
1830
1831              line_start = ptr;
1832            }
1833
1834          ptr = ptrlim;
1835          if (buf->eof)
1836            break;
1837        }
1838
1839      buf->used = ptr - buf->buf;
1840      buf->nlines = buffer_linelim (buf) - line;
1841      if (buf->nlines != 0)
1842        {
1843          buf->left = ptr - line_start;
1844          merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1845          return true;
1846        }
1847
1848      {
1849        /* The current input line is too long to fit in the buffer.
1850           Increase the buffer size and try again, keeping it properly
1851           aligned.  */
1852        size_t line_alloc = buf->alloc / sizeof (struct line);
1853        buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line));
1854        buf->alloc = line_alloc * sizeof (struct line);
1855      }
1856    }
1857}
1858
1859/* Table that maps characters to order-of-magnitude values.  */
1860static char const unit_order[UCHAR_LIM] =
1861  {
1862#if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \
1863     && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'k' == 107)
1864    /* This initializer syntax works on all C99 hosts.  For now, use
1865       it only on non-ASCII hosts, to ease the pain of porting to
1866       pre-C99 ASCII hosts.  */
1867    ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
1868    ['k']=1,
1869#else
1870    /* Generate the following table with this command:
1871       perl -e 'my %a=(k=>1, K=>1, M=>2, G=>3, T=>4, P=>5, E=>6, Z=>7, Y=>8);
1872       foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\
1873       |fmt  */
1874    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1875    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1876    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3,
1877    0, 0, 0, 1, 0, 2, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
1878    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1879    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1880    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1881    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1882    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1883    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1884    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1885#endif
1886  };
1887
1888/* Return an integer that represents the order of magnitude of the
1889   unit following the number.  The number may contain thousands
1890   separators and a decimal point, but it may not contain leading blanks.
1891   Negative numbers get negative orders; zero numbers have a zero order.  */
1892
1893static int _GL_ATTRIBUTE_PURE
1894find_unit_order (char const *number)
1895{
1896  bool minus_sign = (*number == '-');
1897  char const *p = number + minus_sign;
1898  int nonzero = 0;
1899  unsigned char ch;
1900
1901  /* Scan to end of number.
1902     Decimals or separators not followed by digits stop the scan.
1903     Numbers ending in decimals or separators are thus considered
1904     to be lacking in units.
1905     FIXME: add support for multibyte thousands_sep and decimal_point.  */
1906
1907  do
1908    {
1909      while (ISDIGIT (ch = *p++))
1910        nonzero |= ch - '0';
1911    }
1912  while (ch == thousands_sep);
1913
1914  if (ch == decimal_point)
1915    while (ISDIGIT (ch = *p++))
1916      nonzero |= ch - '0';
1917
1918  if (nonzero)
1919    {
1920      int order = unit_order[ch];
1921      return (minus_sign ? -order : order);
1922    }
1923  else
1924    return 0;
1925}
1926
1927/* Compare numbers A and B ending in units with SI or IEC prefixes
1928       <none/unknown> < K/k < M < G < T < P < E < Z < Y  */
1929
1930static int
1931human_numcompare (char const *a, char const *b)
1932{
1933  while (blanks[to_uchar (*a)])
1934    a++;
1935  while (blanks[to_uchar (*b)])
1936    b++;
1937
1938  int diff = find_unit_order (a) - find_unit_order (b);
1939  return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep));
1940}
1941
1942/* Compare strings A and B as numbers without explicitly converting them to
1943   machine numbers.  Comparatively slow for short strings, but asymptotically
1944   hideously fast. */
1945
1946static int
1947numcompare (char const *a, char const *b)
1948{
1949  while (blanks[to_uchar (*a)])
1950    a++;
1951  while (blanks[to_uchar (*b)])
1952    b++;
1953
1954  return strnumcmp (a, b, decimal_point, thousands_sep);
1955}
1956
1957/* Work around a problem whereby the long double value returned by glibc's
1958   strtold ("NaN", ...) contains uninitialized bits: clear all bytes of
1959   A and B before calling strtold.  FIXME: remove this function once
1960   gnulib guarantees that strtold's result is always well defined.  */
1961static int
1962nan_compare (char const *sa, char const *sb)
1963{
1964  long_double a;
1965  memset (&a, 0, sizeof a);
1966  a = strtold (sa, NULL);
1967
1968  long_double b;
1969  memset (&b, 0, sizeof b);
1970  b = strtold (sb, NULL);
1971
1972  return memcmp (&a, &b, sizeof a);
1973}
1974
1975static int
1976general_numcompare (char const *sa, char const *sb)
1977{
1978  /* FIXME: maybe add option to try expensive FP conversion
1979     only if A and B can't be compared more cheaply/accurately.  */
1980
1981  char *ea;
1982  char *eb;
1983  long_double a = strtold (sa, &ea);
1984  long_double b = strtold (sb, &eb);
1985
1986  /* Put conversion errors at the start of the collating sequence.  */
1987  if (sa == ea)
1988    return sb == eb ? 0 : -1;
1989  if (sb == eb)
1990    return 1;
1991
1992  /* Sort numbers in the usual way, where -0 == +0.  Put NaNs after
1993     conversion errors but before numbers; sort them by internal
1994     bit-pattern, for lack of a more portable alternative.  */
1995  return (a < b ? -1
1996          : a > b ? 1
1997          : a == b ? 0
1998          : b == b ? -1
1999          : a == a ? 1
2000          : nan_compare (sa, sb));
2001}
2002
2003/* Return an integer in 1..12 of the month name MONTH.
2004   Return 0 if the name in S is not recognized.  */
2005
2006static int
2007getmonth (char const *month, char **ea)
2008{
2009  size_t lo = 0;
2010  size_t hi = MONTHS_PER_YEAR;
2011
2012  while (blanks[to_uchar (*month)])
2013    month++;
2014
2015  do
2016    {
2017      size_t ix = (lo + hi) / 2;
2018      char const *m = month;
2019      char const *n = monthtab[ix].name;
2020
2021      for (;; m++, n++)
2022        {
2023          if (!*n)
2024            {
2025              if (ea)
2026                *ea = (char *) m;
2027              return monthtab[ix].val;
2028            }
2029          if (to_uchar (fold_toupper[to_uchar (*m)]) < to_uchar (*n))
2030            {
2031              hi = ix;
2032              break;
2033            }
2034          else if (to_uchar (fold_toupper[to_uchar (*m)]) > to_uchar (*n))
2035            {
2036              lo = ix + 1;
2037              break;
2038            }
2039        }
2040    }
2041  while (lo < hi);
2042
2043  return 0;
2044}
2045
2046/* A randomly chosen MD5 state, used for random comparison.  */
2047static struct md5_ctx random_md5_state;
2048
2049/* Initialize the randomly chosen MD5 state.  */
2050
2051static void
2052random_md5_state_init (char const *random_source)
2053{
2054  unsigned char buf[MD5_DIGEST_SIZE];
2055  struct randread_source *r = randread_new (random_source, sizeof buf);
2056  if (! r)
2057    die (_("open failed"), random_source);
2058  randread (r, buf, sizeof buf);
2059  if (randread_free (r) != 0)
2060    die (_("close failed"), random_source);
2061  md5_init_ctx (&random_md5_state);
2062  md5_process_bytes (buf, sizeof buf, &random_md5_state);
2063}
2064
2065/* This is like strxfrm, except it reports any error and exits.  */
2066
2067static size_t
2068xstrxfrm (char *restrict dest, char const *restrict src, size_t destsize)
2069{
2070  errno = 0;
2071  size_t translated_size = strxfrm (dest, src, destsize);
2072
2073  if (errno)
2074    {
2075      error (0, errno, _("string transformation failed"));
2076      error (0, 0, _("set LC_ALL='C' to work around the problem"));
2077      error (SORT_FAILURE, 0,
2078             _("the untransformed string was %s"),
2079             quotearg_n_style (0, locale_quoting_style, src));
2080    }
2081
2082  return translated_size;
2083}
2084
2085/* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
2086   using one or more random hash functions.  TEXTA[LENA] and
2087   TEXTB[LENB] must be zero.  */
2088
2089static int
2090compare_random (char *restrict texta, size_t lena,
2091                char *restrict textb, size_t lenb)
2092{
2093  /* XFRM_DIFF records the equivalent of memcmp on the transformed
2094     data.  This is used to break ties if there is a checksum
2095     collision, and this is good enough given the astronomically low
2096     probability of a collision.  */
2097  int xfrm_diff = 0;
2098
2099  char stackbuf[4000];
2100  char *buf = stackbuf;
2101  size_t bufsize = sizeof stackbuf;
2102  void *allocated = NULL;
2103  uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
2104  struct md5_ctx s[2];
2105  s[0] = s[1] = random_md5_state;
2106
2107  if (hard_LC_COLLATE)
2108    {
2109      char const *lima = texta + lena;
2110      char const *limb = textb + lenb;
2111
2112      while (true)
2113        {
2114          /* Transform the text into the basis of comparison, so that byte
2115             strings that would otherwise considered to be equal are
2116             considered equal here even if their bytes differ.
2117
2118             Each time through this loop, transform one
2119             null-terminated string's worth from TEXTA or from TEXTB
2120             or both.  That way, there's no need to store the
2121             transformation of the whole line, if it contains many
2122             null-terminated strings.  */
2123
2124          /* Store the transformed data into a big-enough buffer.  */
2125
2126          /* A 3X size guess avoids the overhead of calling strxfrm
2127             twice on typical implementations.  Don't worry about
2128             size_t overflow, as the guess need not be correct.  */
2129          size_t guess_bufsize = 3 * (lena + lenb) + 2;
2130          if (bufsize < guess_bufsize)
2131            {
2132              bufsize = MAX (guess_bufsize, bufsize * 3 / 2);
2133              free (allocated);
2134              buf = allocated = malloc (bufsize);
2135              if (! buf)
2136                {
2137                  buf = stackbuf;
2138                  bufsize = sizeof stackbuf;
2139                }
2140            }
2141
2142          size_t sizea =
2143            (texta < lima ? xstrxfrm (buf, texta, bufsize) + 1 : 0);
2144          bool a_fits = sizea <= bufsize;
2145          size_t sizeb =
2146            (textb < limb
2147             ? (xstrxfrm ((a_fits ? buf + sizea : NULL), textb,
2148                          (a_fits ? bufsize - sizea : 0))
2149                + 1)
2150             : 0);
2151
2152          if (! (a_fits && sizea + sizeb <= bufsize))
2153            {
2154              bufsize = sizea + sizeb;
2155              if (bufsize < SIZE_MAX / 3)
2156                bufsize = bufsize * 3 / 2;
2157              free (allocated);
2158              buf = allocated = xmalloc (bufsize);
2159              if (texta < lima)
2160                strxfrm (buf, texta, sizea);
2161              if (textb < limb)
2162                strxfrm (buf + sizea, textb, sizeb);
2163            }
2164
2165          /* Advance past NULs to the next part of each input string,
2166             exiting the loop if both strings are exhausted.  When
2167             exiting the loop, prepare to finish off the tiebreaker
2168             comparison properly.  */
2169          if (texta < lima)
2170            texta += strlen (texta) + 1;
2171          if (textb < limb)
2172            textb += strlen (textb) + 1;
2173          if (! (texta < lima || textb < limb))
2174            {
2175              lena = sizea; texta = buf;
2176              lenb = sizeb; textb = buf + sizea;
2177              break;
2178            }
2179
2180          /* Accumulate the transformed data in the corresponding
2181             checksums.  */
2182          md5_process_bytes (buf, sizea, &s[0]);
2183          md5_process_bytes (buf + sizea, sizeb, &s[1]);
2184
2185          /* Update the tiebreaker comparison of the transformed data.  */
2186          if (! xfrm_diff)
2187            {
2188              xfrm_diff = memcmp (buf, buf + sizea, MIN (sizea, sizeb));
2189              if (! xfrm_diff)
2190                xfrm_diff = (sizea > sizeb) - (sizea < sizeb);
2191            }
2192        }
2193    }
2194
2195  /* Compute and compare the checksums.  */
2196  md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]);
2197  md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]);
2198  int diff = memcmp (dig[0], dig[1], sizeof dig[0]);
2199
2200  /* Fall back on the tiebreaker if the checksums collide.  */
2201  if (! diff)
2202    {
2203      if (! xfrm_diff)
2204        {
2205          xfrm_diff = memcmp (texta, textb, MIN (lena, lenb));
2206          if (! xfrm_diff)
2207            xfrm_diff = (lena > lenb) - (lena < lenb);
2208        }
2209
2210      diff = xfrm_diff;
2211    }
2212
2213  free (allocated);
2214
2215  return diff;
2216}
2217
2218/* Return the printable width of the block of memory starting at
2219   TEXT and ending just before LIM, counting each tab as one byte.
2220   FIXME: Should we generally be counting non printable chars?  */
2221
2222static size_t
2223debug_width (char const *text, char const *lim)
2224{
2225  size_t width = mbsnwidth (text, lim - text, 0);
2226  while (text < lim)
2227    width += (*text++ == '\t');
2228  return width;
2229}
2230
2231/* For debug mode, "underline" a key at the
2232   specified offset and screen width.  */
2233
2234static void
2235mark_key (size_t offset, size_t width)
2236{
2237  while (offset--)
2238    putchar (' ');
2239
2240  if (!width)
2241    printf (_("^ no match for key\n"));
2242  else
2243    {
2244      do
2245        putchar ('_');
2246      while (--width);
2247
2248      putchar ('\n');
2249    }
2250}
2251
2252/* Return true if KEY is a numeric key.  */
2253
2254static inline bool
2255key_numeric (struct keyfield const *key)
2256{
2257  return key->numeric || key->general_numeric || key->human_numeric;
2258}
2259
2260/* For LINE, output a debugging line that underlines KEY in LINE.
2261   If KEY is null, underline the whole line.  */
2262
2263static void
2264debug_key (struct line const *line, struct keyfield const *key)
2265{
2266  char *text = line->text;
2267  char *beg = text;
2268  char *lim = text + line->length - 1;
2269
2270  if (key)
2271    {
2272      if (key->sword != SIZE_MAX)
2273        beg = begfield (line, key);
2274      if (key->eword != SIZE_MAX)
2275        lim = limfield (line, key);
2276
2277      if ((key->skipsblanks && key->sword == SIZE_MAX)
2278          || key->month || key_numeric (key))
2279        {
2280          char saved = *lim;
2281          *lim = '\0';
2282
2283          while (blanks[to_uchar (*beg)])
2284            beg++;
2285
2286          char *tighter_lim = beg;
2287
2288          if (lim < beg)
2289            tighter_lim = lim;
2290          else if (key->month)
2291            getmonth (beg, &tighter_lim);
2292          else if (key->general_numeric)
2293            ignore_value (strtold (beg, &tighter_lim));
2294          else if (key->numeric || key->human_numeric)
2295            {
2296              char *p = beg + (beg < lim && *beg == '-');
2297              bool found_digit = false;
2298              unsigned char ch;
2299
2300              do
2301                {
2302                  while (ISDIGIT (ch = *p++))
2303                    found_digit = true;
2304                }
2305              while (ch == thousands_sep);
2306
2307              if (ch == decimal_point)
2308                while (ISDIGIT (ch = *p++))
2309                  found_digit = true;
2310
2311              if (found_digit)
2312                tighter_lim = p - ! (key->human_numeric && unit_order[ch]);
2313            }
2314          else
2315            tighter_lim = lim;
2316
2317          *lim = saved;
2318          lim = tighter_lim;
2319        }
2320    }
2321
2322  size_t offset = debug_width (text, beg);
2323  size_t width = debug_width (beg, lim);
2324  mark_key (offset, width);
2325}
2326
2327/* Debug LINE by underlining its keys.  */
2328
2329static void
2330debug_line (struct line const *line)
2331{
2332  struct keyfield const *key = keylist;
2333
2334  do
2335    debug_key (line, key);
2336  while (key && ((key = key->next) || ! (unique || stable)));
2337}
2338
2339/* Return whether sorting options specified for key.  */
2340
2341static bool
2342default_key_compare (struct keyfield const *key)
2343{
2344  return ! (key->ignore
2345            || key->translate
2346            || key->skipsblanks
2347            || key->skipeblanks
2348            || key_numeric (key)
2349            || key->month
2350            || key->version
2351            || key->random
2352            /* || key->reverse */
2353           );
2354}
2355
2356/* Convert a key to the short options used to specify it.  */
2357
2358static void
2359key_to_opts (struct keyfield const *key, char *opts)
2360{
2361  if (key->skipsblanks || key->skipeblanks)
2362    *opts++ = 'b';/* either disables global -b  */
2363  if (key->ignore == nondictionary)
2364    *opts++ = 'd';
2365  if (key->translate)
2366    *opts++ = 'f';
2367  if (key->general_numeric)
2368    *opts++ = 'g';
2369  if (key->human_numeric)
2370    *opts++ = 'h';
2371  if (key->ignore == nonprinting)
2372    *opts++ = 'i';
2373  if (key->month)
2374    *opts++ = 'M';
2375  if (key->numeric)
2376    *opts++ = 'n';
2377  if (key->random)
2378    *opts++ = 'R';
2379  if (key->reverse)
2380    *opts++ = 'r';
2381  if (key->version)
2382    *opts++ = 'V';
2383  *opts = '\0';
2384}
2385
2386/* Output data independent key warnings to stderr.  */
2387
2388static void
2389key_warnings (struct keyfield const *gkey, bool gkey_only)
2390{
2391  struct keyfield const *key;
2392  struct keyfield ugkey = *gkey;
2393  unsigned long keynum = 1;
2394
2395  for (key = keylist; key; key = key->next, keynum++)
2396    {
2397      if (key->traditional_used)
2398        {
2399          size_t sword = key->sword;
2400          size_t eword = key->eword;
2401          char tmp[INT_BUFSIZE_BOUND (uintmax_t)];
2402          /* obsolescent syntax +A.x -B.y is equivalent to:
2403               -k A+1.x+1,B.y   (when y = 0)
2404               -k A+1.x+1,B+1.y (when y > 0)  */
2405          char obuf[INT_BUFSIZE_BOUND (sword) * 2 + 4]; /* +# -#  */
2406          char nbuf[INT_BUFSIZE_BOUND (sword) * 2 + 5]; /* -k #,#  */
2407          char *po = obuf;
2408          char *pn = nbuf;
2409
2410          if (sword == SIZE_MAX)
2411            sword++;
2412
2413          po = stpcpy (stpcpy (po, "+"), umaxtostr (sword, tmp));
2414          pn = stpcpy (stpcpy (pn, "-k "), umaxtostr (sword + 1, tmp));
2415          if (key->eword != SIZE_MAX)
2416            {
2417              stpcpy (stpcpy (po, " -"), umaxtostr (eword + 1, tmp));
2418              stpcpy (stpcpy (pn, ","),
2419                      umaxtostr (eword + 1
2420                                 + (key->echar == SIZE_MAX), tmp));
2421            }
2422          error (0, 0, _("obsolescent key %s used; consider %s instead"),
2423                 quote_n (0, obuf), quote_n (1, nbuf));
2424        }
2425
2426      /* Warn about field specs that will never match.  */
2427      bool zero_width = key->sword != SIZE_MAX && key->eword < key->sword;
2428      if (zero_width)
2429        error (0, 0, _("key %lu has zero width and will be ignored"), keynum);
2430
2431      /* Warn about significant leading blanks.  */
2432      bool implicit_skip = key_numeric (key) || key->month;
2433      bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y  */
2434      if (!zero_width && !gkey_only && tab == TAB_DEFAULT && !line_offset
2435          && ((!key->skipsblanks && !implicit_skip)
2436              || (!key->skipsblanks && key->schar)
2437              || (!key->skipeblanks && key->echar)))
2438        error (0, 0, _("leading blanks are significant in key %lu; "
2439                       "consider also specifying 'b'"), keynum);
2440
2441      /* Warn about numeric comparisons spanning fields,
2442         as field delimiters could be interpreted as part
2443         of the number (maybe only in other locales).  */
2444      if (!gkey_only && key_numeric (key))
2445        {
2446          size_t sword = key->sword + 1;
2447          size_t eword = key->eword + 1;
2448          if (!sword)
2449            sword++;
2450          if (!eword || sword < eword)
2451            error (0, 0, _("key %lu is numeric and spans multiple fields"),
2452                   keynum);
2453        }
2454
2455      /* Flag global options not copied or specified in any key.  */
2456      if (ugkey.ignore && (ugkey.ignore == key->ignore))
2457        ugkey.ignore = NULL;
2458      if (ugkey.translate && (ugkey.translate == key->translate))
2459        ugkey.translate = NULL;
2460      ugkey.skipsblanks &= !key->skipsblanks;
2461      ugkey.skipeblanks &= !key->skipeblanks;
2462      ugkey.month &= !key->month;
2463      ugkey.numeric &= !key->numeric;
2464      ugkey.general_numeric &= !key->general_numeric;
2465      ugkey.human_numeric &= !key->human_numeric;
2466      ugkey.random &= !key->random;
2467      ugkey.version &= !key->version;
2468      ugkey.reverse &= !key->reverse;
2469    }
2470
2471  /* Warn about ignored global options flagged above.
2472     Note if gkey is the only one in the list, all flags are cleared.  */
2473  if (!default_key_compare (&ugkey)
2474      || (ugkey.reverse && (stable || unique) && keylist))
2475    {
2476      bool ugkey_reverse = ugkey.reverse;
2477      if (!(stable || unique))
2478        ugkey.reverse = false;
2479      /* The following is too big, but guaranteed to be "big enough".  */
2480      char opts[sizeof short_options];
2481      key_to_opts (&ugkey, opts);
2482      error (0, 0,
2483             ngettext ("option '-%s' is ignored",
2484                       "options '-%s' are ignored",
2485                       select_plural (strlen (opts))), opts);
2486      ugkey.reverse = ugkey_reverse;
2487    }
2488  if (ugkey.reverse && !(stable || unique) && keylist)
2489    error (0, 0, _("option '-r' only applies to last-resort comparison"));
2490}
2491
2492/* Compare two lines A and B trying every key in sequence until there
2493   are no more keys or a difference is found. */
2494
2495static int
2496keycompare (struct line const *a, struct line const *b)
2497{
2498  struct keyfield *key = keylist;
2499
2500  /* For the first iteration only, the key positions have been
2501     precomputed for us. */
2502  char *texta = a->keybeg;
2503  char *textb = b->keybeg;
2504  char *lima = a->keylim;
2505  char *limb = b->keylim;
2506
2507  int diff;
2508
2509  while (true)
2510    {
2511      char const *translate = key->translate;
2512      bool const *ignore = key->ignore;
2513
2514      /* Treat field ends before field starts as empty fields.  */
2515      lima = MAX (texta, lima);
2516      limb = MAX (textb, limb);
2517
2518      /* Find the lengths. */
2519      size_t lena = lima - texta;
2520      size_t lenb = limb - textb;
2521
2522      if (hard_LC_COLLATE || key_numeric (key)
2523          || key->month || key->random || key->version)
2524        {
2525          char *ta;
2526          char *tb;
2527          size_t tlena;
2528          size_t tlenb;
2529
2530          char enda IF_LINT (= 0);
2531          char endb IF_LINT (= 0);
2532          void *allocated IF_LINT (= NULL);
2533          char stackbuf[4000];
2534
2535          if (ignore || translate)
2536            {
2537              /* Compute with copies of the keys, which are the result of
2538                 translating or ignoring characters, and which need their
2539                 own storage.  */
2540
2541              size_t i;
2542
2543              /* Allocate space for copies.  */
2544              size_t size = lena + 1 + lenb + 1;
2545              if (size <= sizeof stackbuf)
2546                ta = stackbuf, allocated = NULL;
2547              else
2548                ta = allocated = xmalloc (size);
2549              tb = ta + lena + 1;
2550
2551              /* Put into each copy a version of the key in which the
2552                 requested characters are ignored or translated.  */
2553              for (tlena = i = 0; i < lena; i++)
2554                if (! (ignore && ignore[to_uchar (texta[i])]))
2555                  ta[tlena++] = (translate
2556                                 ? translate[to_uchar (texta[i])]
2557                                 : texta[i]);
2558              ta[tlena] = '\0';
2559
2560              for (tlenb = i = 0; i < lenb; i++)
2561                if (! (ignore && ignore[to_uchar (textb[i])]))
2562                  tb[tlenb++] = (translate
2563                                 ? translate[to_uchar (textb[i])]
2564                                 : textb[i]);
2565              tb[tlenb] = '\0';
2566            }
2567          else
2568            {
2569              /* Use the keys in-place, temporarily null-terminated.  */
2570              ta = texta; tlena = lena; enda = ta[tlena]; ta[tlena] = '\0';
2571              tb = textb; tlenb = lenb; endb = tb[tlenb]; tb[tlenb] = '\0';
2572            }
2573
2574          if (key->numeric)
2575            diff = numcompare (ta, tb);
2576          else if (key->general_numeric)
2577            diff = general_numcompare (ta, tb);
2578          else if (key->human_numeric)
2579            diff = human_numcompare (ta, tb);
2580          else if (key->month)
2581            diff = getmonth (ta, NULL) - getmonth (tb, NULL);
2582          else if (key->random)
2583            diff = compare_random (ta, tlena, tb, tlenb);
2584          else if (key->version)
2585            diff = filevercmp (ta, tb);
2586          else
2587            {
2588              /* Locale-dependent string sorting.  This is slower than
2589                 C-locale sorting, which is implemented below.  */
2590              if (tlena == 0)
2591                diff = - NONZERO (tlenb);
2592              else if (tlenb == 0)
2593                diff = 1;
2594              else
2595                diff = xmemcoll0 (ta, tlena + 1, tb, tlenb + 1);
2596            }
2597
2598          if (ignore || translate)
2599            free (allocated);
2600          else
2601            {
2602              ta[tlena] = enda;
2603              tb[tlenb] = endb;
2604            }
2605        }
2606      else if (ignore)
2607        {
2608#define CMP_WITH_IGNORE(A, B)						\
2609  do									\
2610    {									\
2611          while (true)							\
2612            {								\
2613              while (texta < lima && ignore[to_uchar (*texta)])		\
2614                ++texta;						\
2615              while (textb < limb && ignore[to_uchar (*textb)])		\
2616                ++textb;						\
2617              if (! (texta < lima && textb < limb))			\
2618                break;							\
2619              diff = to_uchar (A) - to_uchar (B);			\
2620              if (diff)							\
2621                goto not_equal;						\
2622              ++texta;							\
2623              ++textb;							\
2624            }								\
2625                                                                        \
2626          diff = (texta < lima) - (textb < limb);			\
2627    }									\
2628  while (0)
2629
2630          if (translate)
2631            CMP_WITH_IGNORE (translate[to_uchar (*texta)],
2632                             translate[to_uchar (*textb)]);
2633          else
2634            CMP_WITH_IGNORE (*texta, *textb);
2635        }
2636      else if (lena == 0)
2637        diff = - NONZERO (lenb);
2638      else if (lenb == 0)
2639        goto greater;
2640      else
2641        {
2642          if (translate)
2643            {
2644              while (texta < lima && textb < limb)
2645                {
2646                  diff = (to_uchar (translate[to_uchar (*texta++)])
2647                          - to_uchar (translate[to_uchar (*textb++)]));
2648                  if (diff)
2649                    goto not_equal;
2650                }
2651            }
2652          else
2653            {
2654              diff = memcmp (texta, textb, MIN (lena, lenb));
2655              if (diff)
2656                goto not_equal;
2657            }
2658          diff = lena < lenb ? -1 : lena != lenb;
2659        }
2660
2661      if (diff)
2662        goto not_equal;
2663
2664      key = key->next;
2665      if (! key)
2666        break;
2667
2668      /* Find the beginning and limit of the next field.  */
2669      if (key->eword != SIZE_MAX)
2670        lima = limfield (a, key), limb = limfield (b, key);
2671      else
2672        lima = a->text + a->length - 1, limb = b->text + b->length - 1;
2673
2674      if (key->sword != SIZE_MAX)
2675        texta = begfield (a, key), textb = begfield (b, key);
2676      else
2677        {
2678          texta = a->text, textb = b->text;
2679          if (key->skipsblanks)
2680            {
2681              while (texta < lima && blanks[to_uchar (*texta)])
2682                ++texta;
2683              while (textb < limb && blanks[to_uchar (*textb)])
2684                ++textb;
2685            }
2686        }
2687    }
2688
2689  return 0;
2690
2691 greater:
2692  diff = 1;
2693 not_equal:
2694  return key->reverse ? -diff : diff;
2695}
2696
2697/* Compare two lines A and B, returning negative, zero, or positive
2698   depending on whether A compares less than, equal to, or greater than B. */
2699
2700static int
2701compare (struct line const *a, struct line const *b)
2702{
2703  int diff;
2704  size_t alen, blen;
2705
2706  /* First try to compare on the specified keys (if any).
2707     The only two cases with no key at all are unadorned sort,
2708     and unadorned sort -r. */
2709  if (keylist)
2710    {
2711      diff = keycompare (a, b);
2712      if (diff || unique || stable)
2713        return diff;
2714    }
2715
2716  /* If the keys all compare equal (or no keys were specified)
2717     fall through to the default comparison.  */
2718  alen = a->length - 1, blen = b->length - 1;
2719
2720  if (alen == 0)
2721    diff = - NONZERO (blen);
2722  else if (blen == 0)
2723    diff = 1;
2724  else if (hard_LC_COLLATE)
2725    {
2726      /* Note xmemcoll0 is a performance enhancement as
2727         it will not unconditionally write '\0' after the
2728         passed in buffers, which was seen to give around
2729         a 3% increase in performance for short lines.  */
2730      diff = xmemcoll0 (a->text, alen + 1, b->text, blen + 1);
2731    }
2732  else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
2733    diff = alen < blen ? -1 : alen != blen;
2734
2735  return reverse ? -diff : diff;
2736}
2737
2738/* Write LINE to output stream FP; the output file's name is
2739   OUTPUT_FILE if OUTPUT_FILE is non-null, and is the standard output
2740   otherwise.  If debugging is enabled and FP is standard output,
2741   append some debugging information.  */
2742
2743static void
2744write_line (struct line const *line, FILE *fp, char const *output_file)
2745{
2746  char *buf = line->text;
2747  size_t n_bytes = line->length;
2748  char *ebuf = buf + n_bytes;
2749
2750  if (!output_file && debug)
2751    {
2752      /* Convert TAB to '>' and EOL to \n, and then output debugging info.  */
2753      char const *c = buf;
2754
2755      while (c < ebuf)
2756        {
2757          char wc = *c++;
2758          if (wc == '\t')
2759            wc = '>';
2760          else if (c == ebuf)
2761            wc = '\n';
2762          if (fputc (wc, fp) == EOF)
2763            die (_("write failed"), output_file);
2764        }
2765
2766      debug_line (line);
2767    }
2768  else
2769    {
2770      ebuf[-1] = eolchar;
2771      if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
2772        die (_("write failed"), output_file);
2773      ebuf[-1] = '\0';
2774    }
2775}
2776
2777/* Check that the lines read from FILE_NAME come in order.  Return
2778   true if they are in order.  If CHECKONLY == 'c', also print a
2779   diagnostic (FILE_NAME, line number, contents of line) to stderr if
2780   they are not in order.  */
2781
2782static bool
2783check (char const *file_name, char checkonly)
2784{
2785  FILE *fp = xfopen (file_name, "r");
2786  struct buffer buf;		/* Input buffer. */
2787  struct line temp;		/* Copy of previous line. */
2788  size_t alloc = 0;
2789  uintmax_t line_number = 0;
2790  struct keyfield const *key = keylist;
2791  bool nonunique = ! unique;
2792  bool ordered = true;
2793
2794  initbuf (&buf, sizeof (struct line),
2795           MAX (merge_buffer_size, sort_size));
2796  temp.text = NULL;
2797
2798  while (fillbuf (&buf, fp, file_name))
2799    {
2800      struct line const *line = buffer_linelim (&buf);
2801      struct line const *linebase = line - buf.nlines;
2802
2803      /* Make sure the line saved from the old buffer contents is
2804         less than or equal to the first line of the new buffer. */
2805      if (alloc && nonunique <= compare (&temp, line - 1))
2806        {
2807        found_disorder:
2808          {
2809            if (checkonly == 'c')
2810              {
2811                struct line const *disorder_line = line - 1;
2812                uintmax_t disorder_line_number =
2813                  buffer_linelim (&buf) - disorder_line + line_number;
2814                char hr_buf[INT_BUFSIZE_BOUND (disorder_line_number)];
2815                fprintf (stderr, _("%s: %s:%s: disorder: "),
2816                         program_name, file_name,
2817                         umaxtostr (disorder_line_number, hr_buf));
2818                write_line (disorder_line, stderr, _("standard error"));
2819              }
2820
2821            ordered = false;
2822            break;
2823          }
2824        }
2825
2826      /* Compare each line in the buffer with its successor.  */
2827      while (linebase < --line)
2828        if (nonunique <= compare (line, line - 1))
2829          goto found_disorder;
2830
2831      line_number += buf.nlines;
2832
2833      /* Save the last line of the buffer.  */
2834      if (alloc < line->length)
2835        {
2836          do
2837            {
2838              alloc *= 2;
2839              if (! alloc)
2840                {
2841                  alloc = line->length;
2842                  break;
2843                }
2844            }
2845          while (alloc < line->length);
2846
2847          free (temp.text);
2848          temp.text = xmalloc (alloc);
2849        }
2850      memcpy (temp.text, line->text, line->length);
2851      temp.length = line->length;
2852      if (key)
2853        {
2854          temp.keybeg = temp.text + (line->keybeg - line->text);
2855          temp.keylim = temp.text + (line->keylim - line->text);
2856        }
2857    }
2858
2859  xfclose (fp, file_name);
2860  free (buf.buf);
2861  free (temp.text);
2862  return ordered;
2863}
2864
2865/* Open FILES (there are NFILES of them) and store the resulting array
2866   of stream pointers into (*PFPS).  Allocate the array.  Return the
2867   number of successfully opened files, setting errno if this value is
2868   less than NFILES.  */
2869
2870static size_t
2871open_input_files (struct sortfile *files, size_t nfiles, FILE ***pfps)
2872{
2873  FILE **fps = *pfps = xnmalloc (nfiles, sizeof *fps);
2874  int i;
2875
2876  /* Open as many input files as we can.  */
2877  for (i = 0; i < nfiles; i++)
2878    {
2879      fps[i] = (files[i].temp && files[i].temp->state != UNCOMPRESSED
2880                ? open_temp (files[i].temp)
2881                : stream_open (files[i].name, "r"));
2882      if (!fps[i])
2883        break;
2884    }
2885
2886  return i;
2887}
2888
2889/* Merge lines from FILES onto OFP.  NTEMPS is the number of temporary
2890   files (all of which are at the start of the FILES array), and
2891   NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2892   FPS is the vector of open stream corresponding to the files.
2893   Close input and output streams before returning.
2894   OUTPUT_FILE gives the name of the output file.  If it is NULL,
2895   the output file is standard output.  */
2896
2897static void
2898mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
2899          FILE *ofp, char const *output_file, FILE **fps)
2900{
2901  struct buffer *buffer = xnmalloc (nfiles, sizeof *buffer);
2902                                /* Input buffers for each file. */
2903  struct line saved;		/* Saved line storage for unique check. */
2904  struct line const *savedline = NULL;
2905                                /* &saved if there is a saved line. */
2906  size_t savealloc = 0;		/* Size allocated for the saved line. */
2907  struct line const **cur = xnmalloc (nfiles, sizeof *cur);
2908                                /* Current line in each line table. */
2909  struct line const **base = xnmalloc (nfiles, sizeof *base);
2910                                /* Base of each line table.  */
2911  size_t *ord = xnmalloc (nfiles, sizeof *ord);
2912                                /* Table representing a permutation of fps,
2913                                   such that cur[ord[0]] is the smallest line
2914                                   and will be next output. */
2915  size_t i;
2916  size_t j;
2917  size_t t;
2918  struct keyfield const *key = keylist;
2919  saved.text = NULL;
2920
2921  /* Read initial lines from each input file. */
2922  for (i = 0; i < nfiles; )
2923    {
2924      initbuf (&buffer[i], sizeof (struct line),
2925               MAX (merge_buffer_size, sort_size / nfiles));
2926      if (fillbuf (&buffer[i], fps[i], files[i].name))
2927        {
2928          struct line const *linelim = buffer_linelim (&buffer[i]);
2929          cur[i] = linelim - 1;
2930          base[i] = linelim - buffer[i].nlines;
2931          i++;
2932        }
2933      else
2934        {
2935          /* fps[i] is empty; eliminate it from future consideration.  */
2936          xfclose (fps[i], files[i].name);
2937          if (i < ntemps)
2938            {
2939              ntemps--;
2940              zaptemp (files[i].name);
2941            }
2942          free (buffer[i].buf);
2943          --nfiles;
2944          for (j = i; j < nfiles; ++j)
2945            {
2946              files[j] = files[j + 1];
2947              fps[j] = fps[j + 1];
2948            }
2949        }
2950    }
2951
2952  /* Set up the ord table according to comparisons among input lines.
2953     Since this only reorders two items if one is strictly greater than
2954     the other, it is stable. */
2955  for (i = 0; i < nfiles; ++i)
2956    ord[i] = i;
2957  for (i = 1; i < nfiles; ++i)
2958    if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
2959      t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
2960
2961  /* Repeatedly output the smallest line until no input remains. */
2962  while (nfiles)
2963    {
2964      struct line const *smallest = cur[ord[0]];
2965
2966      /* If uniquified output is turned on, output only the first of
2967         an identical series of lines. */
2968      if (unique)
2969        {
2970          if (savedline && compare (savedline, smallest))
2971            {
2972              savedline = NULL;
2973              write_line (&saved, ofp, output_file);
2974            }
2975          if (!savedline)
2976            {
2977              savedline = &saved;
2978              if (savealloc < smallest->length)
2979                {
2980                  do
2981                    if (! savealloc)
2982                      {
2983                        savealloc = smallest->length;
2984                        break;
2985                      }
2986                  while ((savealloc *= 2) < smallest->length);
2987
2988                  free (saved.text);
2989                  saved.text = xmalloc (savealloc);
2990                }
2991              saved.length = smallest->length;
2992              memcpy (saved.text, smallest->text, saved.length);
2993              if (key)
2994                {
2995                  saved.keybeg =
2996                    saved.text + (smallest->keybeg - smallest->text);
2997                  saved.keylim =
2998                    saved.text + (smallest->keylim - smallest->text);
2999                }
3000            }
3001        }
3002      else
3003        write_line (smallest, ofp, output_file);
3004
3005      /* Check if we need to read more lines into core. */
3006      if (base[ord[0]] < smallest)
3007        cur[ord[0]] = smallest - 1;
3008      else
3009        {
3010          if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]].name))
3011            {
3012              struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
3013              cur[ord[0]] = linelim - 1;
3014              base[ord[0]] = linelim - buffer[ord[0]].nlines;
3015            }
3016          else
3017            {
3018              /* We reached EOF on fps[ord[0]].  */
3019              for (i = 1; i < nfiles; ++i)
3020                if (ord[i] > ord[0])
3021                  --ord[i];
3022              --nfiles;
3023              xfclose (fps[ord[0]], files[ord[0]].name);
3024              if (ord[0] < ntemps)
3025                {
3026                  ntemps--;
3027                  zaptemp (files[ord[0]].name);
3028                }
3029              free (buffer[ord[0]].buf);
3030              for (i = ord[0]; i < nfiles; ++i)
3031                {
3032                  fps[i] = fps[i + 1];
3033                  files[i] = files[i + 1];
3034                  buffer[i] = buffer[i + 1];
3035                  cur[i] = cur[i + 1];
3036                  base[i] = base[i + 1];
3037                }
3038              for (i = 0; i < nfiles; ++i)
3039                ord[i] = ord[i + 1];
3040              continue;
3041            }
3042        }
3043
3044      /* The new line just read in may be larger than other lines
3045         already in main memory; push it back in the queue until we
3046         encounter a line larger than it.  Optimize for the common
3047         case where the new line is smallest.  */
3048      {
3049        size_t lo = 1;
3050        size_t hi = nfiles;
3051        size_t probe = lo;
3052        size_t ord0 = ord[0];
3053        size_t count_of_smaller_lines;
3054
3055        while (lo < hi)
3056          {
3057            int cmp = compare (cur[ord0], cur[ord[probe]]);
3058            if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
3059              hi = probe;
3060            else
3061              lo = probe + 1;
3062            probe = (lo + hi) / 2;
3063          }
3064
3065        count_of_smaller_lines = lo - 1;
3066        for (j = 0; j < count_of_smaller_lines; j++)
3067          ord[j] = ord[j + 1];
3068        ord[count_of_smaller_lines] = ord0;
3069      }
3070    }
3071
3072  if (unique && savedline)
3073    {
3074      write_line (&saved, ofp, output_file);
3075      free (saved.text);
3076    }
3077
3078  xfclose (ofp, output_file);
3079  free (fps);
3080  free (buffer);
3081  free (ord);
3082  free (base);
3083  free (cur);
3084}
3085
3086/* Merge lines from FILES onto OFP.  NTEMPS is the number of temporary
3087   files (all of which are at the start of the FILES array), and
3088   NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
3089   Close input and output files before returning.
3090   OUTPUT_FILE gives the name of the output file.
3091
3092   Return the number of files successfully merged.  This number can be
3093   less than NFILES if we ran low on file descriptors, but in this
3094   case it is never less than 2.  */
3095
3096static size_t
3097mergefiles (struct sortfile *files, size_t ntemps, size_t nfiles,
3098            FILE *ofp, char const *output_file)
3099{
3100  FILE **fps;
3101  size_t nopened = open_input_files (files, nfiles, &fps);
3102  if (nopened < nfiles && nopened < 2)
3103    die (_("open failed"), files[nopened].name);
3104  mergefps (files, ntemps, nopened, ofp, output_file, fps);
3105  return nopened;
3106}
3107
3108/* Merge into T (of size NLINES) the two sorted arrays of lines
3109   LO (with NLINES / 2 members), and
3110   T - (NLINES / 2) (with NLINES - NLINES / 2 members).
3111   T and LO point just past their respective arrays, and the arrays
3112   are in reverse order.  NLINES must be at least 2.  */
3113
3114static void
3115mergelines (struct line *restrict t, size_t nlines,
3116            struct line const *restrict lo)
3117{
3118  size_t nlo = nlines / 2;
3119  size_t nhi = nlines - nlo;
3120  struct line *hi = t - nlo;
3121
3122  while (true)
3123    if (compare (lo - 1, hi - 1) <= 0)
3124      {
3125        *--t = *--lo;
3126        if (! --nlo)
3127          {
3128            /* HI must equal T now, and there is no need to copy from
3129               HI to T. */
3130            return;
3131          }
3132      }
3133    else
3134      {
3135        *--t = *--hi;
3136        if (! --nhi)
3137          {
3138            do
3139              *--t = *--lo;
3140            while (--nlo);
3141
3142            return;
3143          }
3144      }
3145}
3146
3147/* Sort the array LINES with NLINES members, using TEMP for temporary space.
3148   Do this all within one thread.  NLINES must be at least 2.
3149   If TO_TEMP, put the sorted output into TEMP, and TEMP is as large as LINES.
3150   Otherwise the sort is in-place and TEMP is half-sized.
3151   The input and output arrays are in reverse order, and LINES and
3152   TEMP point just past the end of their respective arrays.
3153
3154   Use a recursive divide-and-conquer algorithm, in the style
3155   suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23.  Use
3156   the optimization suggested by exercise 5.2.4-10; this requires room
3157   for only 1.5*N lines, rather than the usual 2*N lines.  Knuth
3158   writes that this memory optimization was originally published by
3159   D. A. Bell, Comp J. 1 (1958), 75.  */
3160
3161static void
3162sequential_sort (struct line *restrict lines, size_t nlines,
3163                 struct line *restrict temp, bool to_temp)
3164{
3165  if (nlines == 2)
3166    {
3167      /* Declare 'swap' as int, not bool, to work around a bug
3168         <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
3169         in the IBM xlc 6.0.0.0 compiler in 64-bit mode.  */
3170      int swap = (0 < compare (&lines[-1], &lines[-2]));
3171      if (to_temp)
3172        {
3173          temp[-1] = lines[-1 - swap];
3174          temp[-2] = lines[-2 + swap];
3175        }
3176      else if (swap)
3177        {
3178          temp[-1] = lines[-1];
3179          lines[-1] = lines[-2];
3180          lines[-2] = temp[-1];
3181        }
3182    }
3183  else
3184    {
3185      size_t nlo = nlines / 2;
3186      size_t nhi = nlines - nlo;
3187      struct line *lo = lines;
3188      struct line *hi = lines - nlo;
3189
3190      sequential_sort (hi, nhi, temp - (to_temp ? nlo : 0), to_temp);
3191      if (1 < nlo)
3192        sequential_sort (lo, nlo, temp, !to_temp);
3193      else if (!to_temp)
3194        temp[-1] = lo[-1];
3195
3196      struct line *dest;
3197      struct line const *sorted_lo;
3198      if (to_temp)
3199        {
3200          dest = temp;
3201          sorted_lo = lines;
3202        }
3203      else
3204        {
3205          dest = lines;
3206          sorted_lo = temp;
3207        }
3208      mergelines (dest, nlines, sorted_lo);
3209    }
3210}
3211
3212static struct merge_node *init_node (struct merge_node *restrict,
3213                                     struct merge_node *restrict,
3214                                     struct line *, size_t, size_t, bool);
3215
3216
3217/* Create and return a merge tree for NTHREADS threads, sorting NLINES
3218   lines, with destination DEST.  */
3219static struct merge_node *
3220merge_tree_init (size_t nthreads, size_t nlines, struct line *dest)
3221{
3222  struct merge_node *merge_tree = xmalloc (2 * sizeof *merge_tree * nthreads);
3223
3224  struct merge_node *root = merge_tree;
3225  root->lo = root->hi = root->end_lo = root->end_hi = NULL;
3226  root->dest = NULL;
3227  root->nlo = root->nhi = nlines;
3228  root->parent = NULL;
3229  root->level = MERGE_END;
3230  root->queued = false;
3231  pthread_mutex_init (&root->lock, NULL);
3232
3233  init_node (root, root + 1, dest, nthreads, nlines, false);
3234  return merge_tree;
3235}
3236
3237/* Destroy the merge tree. */
3238static void
3239merge_tree_destroy (size_t nthreads, struct merge_node *merge_tree)
3240{
3241  size_t n_nodes = nthreads * 2;
3242  struct merge_node *node = merge_tree;
3243
3244  while (n_nodes--)
3245    {
3246      pthread_mutex_destroy (&node->lock);
3247      node++;
3248    }
3249
3250  free (merge_tree);
3251}
3252
3253/* Initialize a merge tree node and its descendants.  The node's
3254   parent is PARENT.  The node and its descendants are taken from the
3255   array of nodes NODE_POOL.  Their destination starts at DEST; they
3256   will consume NTHREADS threads.  The total number of sort lines is
3257   TOTAL_LINES.  IS_LO_CHILD is true if the node is the low child of
3258   its parent.  */
3259
3260static struct merge_node *
3261init_node (struct merge_node *restrict parent,
3262           struct merge_node *restrict node_pool,
3263           struct line *dest, size_t nthreads,
3264           size_t total_lines, bool is_lo_child)
3265{
3266  size_t nlines = (is_lo_child ? parent->nlo : parent->nhi);
3267  size_t nlo = nlines / 2;
3268  size_t nhi = nlines - nlo;
3269  struct line *lo = dest - total_lines;
3270  struct line *hi = lo - nlo;
3271  struct line **parent_end = (is_lo_child ? &parent->end_lo : &parent->end_hi);
3272
3273  struct merge_node *node = node_pool++;
3274  node->lo = node->end_lo = lo;
3275  node->hi = node->end_hi = hi;
3276  node->dest = parent_end;
3277  node->nlo = nlo;
3278  node->nhi = nhi;
3279  node->parent = parent;
3280  node->level = parent->level + 1;
3281  node->queued = false;
3282  pthread_mutex_init (&node->lock, NULL);
3283
3284  if (nthreads > 1)
3285    {
3286      size_t lo_threads = nthreads / 2;
3287      size_t hi_threads = nthreads - lo_threads;
3288      node->lo_child = node_pool;
3289      node_pool = init_node (node, node_pool, lo, lo_threads,
3290                             total_lines, true);
3291      node->hi_child = node_pool;
3292      node_pool = init_node (node, node_pool, hi, hi_threads,
3293                             total_lines, false);
3294    }
3295  else
3296    {
3297      node->lo_child = NULL;
3298      node->hi_child = NULL;
3299    }
3300  return node_pool;
3301}
3302
3303
3304/* Compare two merge nodes A and B for priority.  */
3305
3306static int
3307compare_nodes (void const *a, void const *b)
3308{
3309  struct merge_node const *nodea = a;
3310  struct merge_node const *nodeb = b;
3311  if (nodea->level == nodeb->level)
3312      return (nodea->nlo + nodea->nhi) < (nodeb->nlo + nodeb->nhi);
3313  return nodea->level < nodeb->level;
3314}
3315
3316/* Lock a merge tree NODE.  */
3317
3318static inline void
3319lock_node (struct merge_node *node)
3320{
3321  pthread_mutex_lock (&node->lock);
3322}
3323
3324/* Unlock a merge tree NODE. */
3325
3326static inline void
3327unlock_node (struct merge_node *node)
3328{
3329  pthread_mutex_unlock (&node->lock);
3330}
3331
3332/* Destroy merge QUEUE. */
3333
3334static void
3335queue_destroy (struct merge_node_queue *queue)
3336{
3337  heap_free (queue->priority_queue);
3338  pthread_cond_destroy (&queue->cond);
3339  pthread_mutex_destroy (&queue->mutex);
3340}
3341
3342/* Initialize merge QUEUE, allocating space suitable for a maximum of
3343   NTHREADS threads.  */
3344
3345static void
3346queue_init (struct merge_node_queue *queue, size_t nthreads)
3347{
3348  /* Though it's highly unlikely all nodes are in the heap at the same
3349     time, the heap should accommodate all of them.  Counting a NULL
3350     dummy head for the heap, reserve 2 * NTHREADS nodes.  */
3351  queue->priority_queue = heap_alloc (compare_nodes, 2 * nthreads);
3352  pthread_mutex_init (&queue->mutex, NULL);
3353  pthread_cond_init (&queue->cond, NULL);
3354}
3355
3356/* Insert NODE into QUEUE.  The caller either holds a lock on NODE, or
3357   does not need to lock NODE.  */
3358
3359static void
3360queue_insert (struct merge_node_queue *queue, struct merge_node *node)
3361{
3362  pthread_mutex_lock (&queue->mutex);
3363  heap_insert (queue->priority_queue, node);
3364  node->queued = true;
3365  pthread_cond_signal (&queue->cond);
3366  pthread_mutex_unlock (&queue->mutex);
3367}
3368
3369/* Pop the top node off the priority QUEUE, lock the node, return it.  */
3370
3371static struct merge_node *
3372queue_pop (struct merge_node_queue *queue)
3373{
3374  struct merge_node *node;
3375  pthread_mutex_lock (&queue->mutex);
3376  while (! (node = heap_remove_top (queue->priority_queue)))
3377    pthread_cond_wait (&queue->cond, &queue->mutex);
3378  pthread_mutex_unlock (&queue->mutex);
3379  lock_node (node);
3380  node->queued = false;
3381  return node;
3382}
3383
3384/* Output LINE to TFP, unless -u is specified and the line compares
3385   equal to the previous line.  TEMP_OUTPUT is the name of TFP, or
3386   is null if TFP is standard output.
3387
3388   This function does not save the line for comparison later, so it is
3389   appropriate only for internal sort.  */
3390
3391static void
3392write_unique (struct line const *line, FILE *tfp, char const *temp_output)
3393{
3394  if (unique)
3395    {
3396      if (saved_line.text && ! compare (line, &saved_line))
3397        return;
3398      saved_line = *line;
3399    }
3400
3401  write_line (line, tfp, temp_output);
3402}
3403
3404/* Merge the lines currently available to a NODE in the binary
3405   merge tree.  Merge a number of lines appropriate for this merge
3406   level, assuming TOTAL_LINES is the total number of lines.
3407
3408   If merging at the top level, send output to TFP.  TEMP_OUTPUT is
3409   the name of TFP, or is null if TFP is standard output.  */
3410
3411static void
3412mergelines_node (struct merge_node *restrict node, size_t total_lines,
3413                 FILE *tfp, char const *temp_output)
3414{
3415  struct line *lo_orig = node->lo;
3416  struct line *hi_orig = node->hi;
3417  size_t to_merge = MAX_MERGE (total_lines, node->level);
3418  size_t merged_lo;
3419  size_t merged_hi;
3420
3421  if (node->level > MERGE_ROOT)
3422    {
3423      /* Merge to destination buffer. */
3424      struct line *dest = *node->dest;
3425      while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3426        if (compare (node->lo - 1, node->hi - 1) <= 0)
3427          *--dest = *--node->lo;
3428        else
3429          *--dest = *--node->hi;
3430
3431      merged_lo = lo_orig - node->lo;
3432      merged_hi = hi_orig - node->hi;
3433
3434      if (node->nhi == merged_hi)
3435        while (node->lo != node->end_lo && to_merge--)
3436          *--dest = *--node->lo;
3437      else if (node->nlo == merged_lo)
3438        while (node->hi != node->end_hi && to_merge--)
3439          *--dest = *--node->hi;
3440      *node->dest = dest;
3441    }
3442  else
3443    {
3444      /* Merge directly to output. */
3445      while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3446        {
3447          if (compare (node->lo - 1, node->hi - 1) <= 0)
3448            write_unique (--node->lo, tfp, temp_output);
3449          else
3450            write_unique (--node->hi, tfp, temp_output);
3451        }
3452
3453      merged_lo = lo_orig - node->lo;
3454      merged_hi = hi_orig - node->hi;
3455
3456      if (node->nhi == merged_hi)
3457        {
3458          while (node->lo != node->end_lo && to_merge--)
3459            write_unique (--node->lo, tfp, temp_output);
3460        }
3461      else if (node->nlo == merged_lo)
3462        {
3463          while (node->hi != node->end_hi && to_merge--)
3464            write_unique (--node->hi, tfp, temp_output);
3465        }
3466    }
3467
3468  /* Update NODE. */
3469  merged_lo = lo_orig - node->lo;
3470  merged_hi = hi_orig - node->hi;
3471  node->nlo -= merged_lo;
3472  node->nhi -= merged_hi;
3473}
3474
3475/* Into QUEUE, insert NODE if it is not already queued, and if one of
3476   NODE's children has available lines and the other either has
3477   available lines or has exhausted its lines.  */
3478
3479static void
3480queue_check_insert (struct merge_node_queue *queue, struct merge_node *node)
3481{
3482  if (! node->queued)
3483    {
3484      bool lo_avail = (node->lo - node->end_lo) != 0;
3485      bool hi_avail = (node->hi - node->end_hi) != 0;
3486      if (lo_avail ? hi_avail || ! node->nhi : hi_avail && ! node->nlo)
3487        queue_insert (queue, node);
3488    }
3489}
3490
3491/* Into QUEUE, insert NODE's parent if the parent can now be worked on.  */
3492
3493static void
3494queue_check_insert_parent (struct merge_node_queue *queue,
3495                           struct merge_node *node)
3496{
3497  if (node->level > MERGE_ROOT)
3498    {
3499      lock_node (node->parent);
3500      queue_check_insert (queue, node->parent);
3501      unlock_node (node->parent);
3502    }
3503  else if (node->nlo + node->nhi == 0)
3504    {
3505      /* If the MERGE_ROOT NODE has finished merging, insert the
3506         MERGE_END node.  */
3507      queue_insert (queue, node->parent);
3508    }
3509}
3510
3511/* Repeatedly pop QUEUE for a node with lines to merge, and merge at least
3512   some of those lines, until the MERGE_END node is popped.
3513   TOTAL_LINES is the total number of lines.  If merging at the top
3514   level, send output to TFP.  TEMP_OUTPUT is the name of TFP, or is
3515   null if TFP is standard output.  */
3516
3517static void
3518merge_loop (struct merge_node_queue *queue,
3519            size_t total_lines, FILE *tfp, char const *temp_output)
3520{
3521  while (1)
3522    {
3523      struct merge_node *node = queue_pop (queue);
3524
3525      if (node->level == MERGE_END)
3526        {
3527          unlock_node (node);
3528          /* Reinsert so other threads can pop it. */
3529          queue_insert (queue, node);
3530          break;
3531        }
3532      mergelines_node (node, total_lines, tfp, temp_output);
3533      queue_check_insert (queue, node);
3534      queue_check_insert_parent (queue, node);
3535
3536      unlock_node (node);
3537    }
3538}
3539
3540
3541static void sortlines (struct line *restrict, size_t, size_t,
3542                       struct merge_node *, struct merge_node_queue *,
3543                       FILE *, char const *);
3544
3545/* Thread arguments for sortlines_thread. */
3546
3547struct thread_args
3548{
3549  /* Source, i.e., the array of lines to sort.  This points just past
3550     the end of the array.  */
3551  struct line *lines;
3552
3553  /* Number of threads to use.  If 0 or 1, sort single-threaded.  */
3554  size_t nthreads;
3555
3556  /* Number of lines in LINES and DEST.  */
3557  size_t const total_lines;
3558
3559  /* Merge node. Lines from this node and this node's sibling will merged
3560     to this node's parent. */
3561  struct merge_node *const node;
3562
3563  /* The priority queue controlling available work for the entire
3564     internal sort.  */
3565  struct merge_node_queue *const queue;
3566
3567  /* If at the top level, the file to output to, and the file's name.
3568     If the file is standard output, the file's name is null.  */
3569  FILE *tfp;
3570  char const *output_temp;
3571};
3572
3573/* Like sortlines, except with a signature acceptable to pthread_create.  */
3574
3575static void *
3576sortlines_thread (void *data)
3577{
3578  struct thread_args const *args = data;
3579  sortlines (args->lines, args->nthreads, args->total_lines,
3580             args->node, args->queue, args->tfp,
3581             args->output_temp);
3582  return NULL;
3583}
3584
3585/* Sort lines, possibly in parallel.  The arguments are as in struct
3586   thread_args above.
3587
3588   The algorithm has three phases: node creation, sequential sort,
3589   and binary merge.
3590
3591   During node creation, sortlines recursively visits each node in the
3592   binary merge tree and creates a NODE structure corresponding to all the
3593   future line merging NODE is responsible for. For each call to
3594   sortlines, half the available threads are assigned to each recursive
3595   call, until a leaf node having only 1 available thread is reached.
3596
3597   Each leaf node then performs two sequential sorts, one on each half of
3598   the lines it is responsible for. It records in its NODE structure that
3599   there are two sorted sublists available to merge from, and inserts its
3600   NODE into the priority queue.
3601
3602   The binary merge phase then begins. Each thread drops into a loop
3603   where the thread retrieves a NODE from the priority queue, merges lines
3604   available to that NODE, and potentially insert NODE or its parent back
3605   into the queue if there are sufficient available lines for them to
3606   merge. This continues until all lines at all nodes of the merge tree
3607   have been merged. */
3608
3609static void
3610sortlines (struct line *restrict lines, size_t nthreads,
3611           size_t total_lines, struct merge_node *node,
3612           struct merge_node_queue *queue, FILE *tfp, char const *temp_output)
3613{
3614  size_t nlines = node->nlo + node->nhi;
3615
3616  /* Calculate thread arguments. */
3617  size_t lo_threads = nthreads / 2;
3618  size_t hi_threads = nthreads - lo_threads;
3619  pthread_t thread;
3620  struct thread_args args = {lines, lo_threads, total_lines,
3621                             node->lo_child, queue, tfp, temp_output};
3622
3623  if (nthreads > 1 && SUBTHREAD_LINES_HEURISTIC <= nlines
3624      && pthread_create (&thread, NULL, sortlines_thread, &args) == 0)
3625    {
3626      sortlines (lines - node->nlo, hi_threads, total_lines,
3627                 node->hi_child, queue, tfp, temp_output);
3628      pthread_join (thread, NULL);
3629    }
3630  else
3631    {
3632      /* Nthreads = 1, this is a leaf NODE, or pthread_create failed.
3633         Sort with 1 thread. */
3634      size_t nlo = node->nlo;
3635      size_t nhi = node->nhi;
3636      struct line *temp = lines - total_lines;
3637      if (1 < nhi)
3638        sequential_sort (lines - nlo, nhi, temp - nlo / 2, false);
3639      if (1 < nlo)
3640        sequential_sort (lines, nlo, temp, false);
3641
3642      /* Update merge NODE. No need to lock yet. */
3643      node->lo = lines;
3644      node->hi = lines - nlo;
3645      node->end_lo = lines - nlo;
3646      node->end_hi = lines - nlo - nhi;
3647
3648      queue_insert (queue, node);
3649      merge_loop (queue, total_lines, tfp, temp_output);
3650    }
3651}
3652
3653/* Scan through FILES[NTEMPS .. NFILES-1] looking for files that are
3654   the same as OUTFILE.  If found, replace each with the same
3655   temporary copy that can be merged into OUTFILE without destroying
3656   OUTFILE before it is completely read.  This temporary copy does not
3657   count as a merge temp, so don't worry about incrementing NTEMPS in
3658   the caller; final cleanup will remove it, not zaptemp.
3659
3660   This test ensures that an otherwise-erroneous use like
3661   "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
3662   It's not clear that POSIX requires this nicety.
3663   Detect common error cases, but don't try to catch obscure cases like
3664   "cat ... FILE ... | sort -m -o FILE"
3665   where traditional "sort" doesn't copy the input and where
3666   people should know that they're getting into trouble anyway.
3667   Catching these obscure cases would slow down performance in
3668   common cases.  */
3669
3670static void
3671avoid_trashing_input (struct sortfile *files, size_t ntemps,
3672                      size_t nfiles, char const *outfile)
3673{
3674  size_t i;
3675  bool got_outstat = false;
3676  struct stat outstat;
3677  struct tempnode *tempcopy = NULL;
3678
3679  for (i = ntemps; i < nfiles; i++)
3680    {
3681      bool is_stdin = STREQ (files[i].name, "-");
3682      bool same;
3683      struct stat instat;
3684
3685      if (outfile && STREQ (outfile, files[i].name) && !is_stdin)
3686        same = true;
3687      else
3688        {
3689          if (! got_outstat)
3690            {
3691              if (fstat (STDOUT_FILENO, &outstat) != 0)
3692                break;
3693              got_outstat = true;
3694            }
3695
3696          same = (((is_stdin
3697                    ? fstat (STDIN_FILENO, &instat)
3698                    : stat (files[i].name, &instat))
3699                   == 0)
3700                  && SAME_INODE (instat, outstat));
3701        }
3702
3703      if (same)
3704        {
3705          if (! tempcopy)
3706            {
3707              FILE *tftp;
3708              tempcopy = create_temp (&tftp);
3709              mergefiles (&files[i], 0, 1, tftp, tempcopy->name);
3710            }
3711
3712          files[i].name = tempcopy->name;
3713          files[i].temp = tempcopy;
3714        }
3715    }
3716}
3717
3718/* Scan the input files to ensure all are accessible.
3719   Otherwise exit with a diagnostic.
3720
3721   Note this will catch common issues with permissions etc.
3722   but will fail to notice issues where you can open() but not read(),
3723   like when a directory is specified on some systems.
3724   Catching these obscure cases could slow down performance in
3725   common cases.  */
3726
3727static void
3728check_inputs (char *const *files, size_t nfiles)
3729{
3730  size_t i;
3731  for (i = 0; i < nfiles; i++)
3732    {
3733      if (STREQ (files[i], "-"))
3734        continue;
3735
3736      if (euidaccess (files[i], R_OK) != 0)
3737        die (_("cannot read"), files[i]);
3738    }
3739}
3740
3741/* Ensure a specified output file can be created or written to,
3742   and point stdout to it.  Do not truncate the file.
3743   Exit with a diagnostic on failure.  */
3744
3745static void
3746check_output (char const *outfile)
3747{
3748  if (outfile)
3749    {
3750      int outfd = open (outfile, O_WRONLY | O_CREAT | O_BINARY, MODE_RW_UGO);
3751      if (outfd < 0)
3752        die (_("open failed"), outfile);
3753      move_fd_or_die (outfd, STDOUT_FILENO);
3754    }
3755}
3756
3757/* Merge the input FILES.  NTEMPS is the number of files at the
3758   start of FILES that are temporary; it is zero at the top level.
3759   NFILES is the total number of files.  Put the output in
3760   OUTPUT_FILE; a null OUTPUT_FILE stands for standard output.  */
3761
3762static void
3763merge (struct sortfile *files, size_t ntemps, size_t nfiles,
3764       char const *output_file)
3765{
3766  while (nmerge < nfiles)
3767    {
3768      /* Number of input files processed so far.  */
3769      size_t in;
3770
3771      /* Number of output files generated so far.  */
3772      size_t out;
3773
3774      /* nfiles % NMERGE; this counts input files that are left over
3775         after all full-sized merges have been done.  */
3776      size_t remainder;
3777
3778      /* Number of easily-available slots at the next loop iteration.  */
3779      size_t cheap_slots;
3780
3781      /* Do as many NMERGE-size merges as possible. In the case that
3782         nmerge is bogus, increment by the maximum number of file
3783         descriptors allowed.  */
3784      for (out = in = 0; nmerge <= nfiles - in; out++)
3785        {
3786          FILE *tfp;
3787          struct tempnode *temp = create_temp (&tfp);
3788          size_t num_merged = mergefiles (&files[in], MIN (ntemps, nmerge),
3789                                          nmerge, tfp, temp->name);
3790          ntemps -= MIN (ntemps, num_merged);
3791          files[out].name = temp->name;
3792          files[out].temp = temp;
3793          in += num_merged;
3794        }
3795
3796      remainder = nfiles - in;
3797      cheap_slots = nmerge - out % nmerge;
3798
3799      if (cheap_slots < remainder)
3800        {
3801          /* So many files remain that they can't all be put into the last
3802             NMERGE-sized output window.  Do one more merge.  Merge as few
3803             files as possible, to avoid needless I/O.  */
3804          size_t nshortmerge = remainder - cheap_slots + 1;
3805          FILE *tfp;
3806          struct tempnode *temp = create_temp (&tfp);
3807          size_t num_merged = mergefiles (&files[in], MIN (ntemps, nshortmerge),
3808                                          nshortmerge, tfp, temp->name);
3809          ntemps -= MIN (ntemps, num_merged);
3810          files[out].name = temp->name;
3811          files[out++].temp = temp;
3812          in += num_merged;
3813        }
3814
3815      /* Put the remaining input files into the last NMERGE-sized output
3816         window, so they will be merged in the next pass.  */
3817      memmove (&files[out], &files[in], (nfiles - in) * sizeof *files);
3818      ntemps += out;
3819      nfiles -= in - out;
3820    }
3821
3822  avoid_trashing_input (files, ntemps, nfiles, output_file);
3823
3824  /* We aren't guaranteed that this final mergefiles will work, therefore we
3825     try to merge into the output, and then merge as much as we can into a
3826     temp file if we can't. Repeat.  */
3827
3828  while (true)
3829    {
3830      /* Merge directly into the output file if possible.  */
3831      FILE **fps;
3832      size_t nopened = open_input_files (files, nfiles, &fps);
3833
3834      if (nopened == nfiles)
3835        {
3836          FILE *ofp = stream_open (output_file, "w");
3837          if (ofp)
3838            {
3839              mergefps (files, ntemps, nfiles, ofp, output_file, fps);
3840              break;
3841            }
3842          if (errno != EMFILE || nopened <= 2)
3843            die (_("open failed"), output_file);
3844        }
3845      else if (nopened <= 2)
3846        die (_("open failed"), files[nopened].name);
3847
3848      /* We ran out of file descriptors.  Close one of the input
3849         files, to gain a file descriptor.  Then create a temporary
3850         file with our spare file descriptor.  Retry if that failed
3851         (e.g., some other process could open a file between the time
3852         we closed and tried to create).  */
3853      FILE *tfp;
3854      struct tempnode *temp;
3855      do
3856        {
3857          nopened--;
3858          xfclose (fps[nopened], files[nopened].name);
3859          temp = maybe_create_temp (&tfp, ! (nopened <= 2));
3860        }
3861      while (!temp);
3862
3863      /* Merge into the newly allocated temporary.  */
3864      mergefps (&files[0], MIN (ntemps, nopened), nopened, tfp, temp->name,
3865                fps);
3866      ntemps -= MIN (ntemps, nopened);
3867      files[0].name = temp->name;
3868      files[0].temp = temp;
3869
3870      memmove (&files[1], &files[nopened], (nfiles - nopened) * sizeof *files);
3871      ntemps++;
3872      nfiles -= nopened - 1;
3873    }
3874}
3875
3876/* Sort NFILES FILES onto OUTPUT_FILE.  Use at most NTHREADS threads.  */
3877
3878static void
3879sort (char *const *files, size_t nfiles, char const *output_file,
3880      size_t nthreads)
3881{
3882  struct buffer buf;
3883  IF_LINT (buf.buf = NULL);
3884  size_t ntemps = 0;
3885  bool output_file_created = false;
3886
3887  buf.alloc = 0;
3888
3889  while (nfiles)
3890    {
3891      char const *temp_output;
3892      char const *file = *files;
3893      FILE *fp = xfopen (file, "r");
3894      FILE *tfp;
3895
3896      size_t bytes_per_line;
3897      if (nthreads > 1)
3898        {
3899          /* Get log P. */
3900          size_t tmp = 1;
3901          size_t mult = 1;
3902          while (tmp < nthreads)
3903            {
3904              tmp *= 2;
3905              mult++;
3906            }
3907          bytes_per_line = (mult * sizeof (struct line));
3908        }
3909      else
3910        bytes_per_line = sizeof (struct line) * 3 / 2;
3911
3912      if (! buf.alloc)
3913        initbuf (&buf, bytes_per_line,
3914                 sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
3915      buf.eof = false;
3916      files++;
3917      nfiles--;
3918
3919      while (fillbuf (&buf, fp, file))
3920        {
3921          struct line *line;
3922
3923          if (buf.eof && nfiles
3924              && (bytes_per_line + 1
3925                  < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
3926            {
3927              /* End of file, but there is more input and buffer room.
3928                 Concatenate the next input file; this is faster in
3929                 the usual case.  */
3930              buf.left = buf.used;
3931              break;
3932            }
3933
3934          saved_line.text = NULL;
3935          line = buffer_linelim (&buf);
3936          if (buf.eof && !nfiles && !ntemps && !buf.left)
3937            {
3938              xfclose (fp, file);
3939              tfp = xfopen (output_file, "w");
3940              temp_output = output_file;
3941              output_file_created = true;
3942            }
3943          else
3944            {
3945              ++ntemps;
3946              temp_output = create_temp (&tfp)->name;
3947            }
3948          if (1 < buf.nlines)
3949            {
3950              struct merge_node_queue queue;
3951              queue_init (&queue, nthreads);
3952              struct merge_node *merge_tree =
3953                merge_tree_init (nthreads, buf.nlines, line);
3954
3955              sortlines (line, nthreads, buf.nlines, merge_tree + 1,
3956                         &queue, tfp, temp_output);
3957
3958#ifdef lint
3959              merge_tree_destroy (nthreads, merge_tree);
3960              queue_destroy (&queue);
3961#endif
3962            }
3963          else
3964            write_unique (line - 1, tfp, temp_output);
3965
3966          xfclose (tfp, temp_output);
3967
3968          if (output_file_created)
3969            goto finish;
3970        }
3971      xfclose (fp, file);
3972    }
3973
3974 finish:
3975  free (buf.buf);
3976
3977  if (! output_file_created)
3978    {
3979      size_t i;
3980      struct tempnode *node = temphead;
3981      struct sortfile *tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
3982      for (i = 0; node; i++)
3983        {
3984          tempfiles[i].name = node->name;
3985          tempfiles[i].temp = node;
3986          node = node->next;
3987        }
3988      merge (tempfiles, ntemps, ntemps, output_file);
3989      free (tempfiles);
3990    }
3991
3992  reap_all ();
3993}
3994
3995/* Insert a malloc'd copy of key KEY_ARG at the end of the key list.  */
3996
3997static void
3998insertkey (struct keyfield *key_arg)
3999{
4000  struct keyfield **p;
4001  struct keyfield *key = xmemdup (key_arg, sizeof *key);
4002
4003  for (p = &keylist; *p; p = &(*p)->next)
4004    continue;
4005  *p = key;
4006  key->next = NULL;
4007}
4008
4009/* Report a bad field specification SPEC, with extra info MSGID.  */
4010
4011static void badfieldspec (char const *, char const *)
4012     ATTRIBUTE_NORETURN;
4013static void
4014badfieldspec (char const *spec, char const *msgid)
4015{
4016  error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
4017         _(msgid), quote (spec));
4018  abort ();
4019}
4020
4021/* Report incompatible options.  */
4022
4023static void incompatible_options (char const *) ATTRIBUTE_NORETURN;
4024static void
4025incompatible_options (char const *opts)
4026{
4027  error (SORT_FAILURE, 0, _("options '-%s' are incompatible"), (opts));
4028  abort ();
4029}
4030
4031/* Check compatibility of ordering options.  */
4032
4033static void
4034check_ordering_compatibility (void)
4035{
4036  struct keyfield *key;
4037
4038  for (key = keylist; key; key = key->next)
4039    if (1 < (key->numeric + key->general_numeric + key->human_numeric
4040             + key->month + (key->version | key->random | !!key->ignore)))
4041      {
4042        /* The following is too big, but guaranteed to be "big enough".  */
4043        char opts[sizeof short_options];
4044        /* Clear flags we're not interested in.  */
4045        key->skipsblanks = key->skipeblanks = key->reverse = false;
4046        key_to_opts (key, opts);
4047        incompatible_options (opts);
4048      }
4049}
4050
4051/* Parse the leading integer in STRING and store the resulting value
4052   (which must fit into size_t) into *VAL.  Return the address of the
4053   suffix after the integer.  If the value is too large, silently
4054   substitute SIZE_MAX.  If MSGID is NULL, return NULL after
4055   failure; otherwise, report MSGID and exit on failure.  */
4056
4057static char const *
4058parse_field_count (char const *string, size_t *val, char const *msgid)
4059{
4060  char *suffix;
4061  uintmax_t n;
4062
4063  switch (xstrtoumax (string, &suffix, 10, &n, ""))
4064    {
4065    case LONGINT_OK:
4066    case LONGINT_INVALID_SUFFIX_CHAR:
4067      *val = n;
4068      if (*val == n)
4069        break;
4070      /* Fall through.  */
4071    case LONGINT_OVERFLOW:
4072    case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
4073      *val = SIZE_MAX;
4074      break;
4075
4076    case LONGINT_INVALID:
4077      if (msgid)
4078        error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
4079               _(msgid), quote (string));
4080      return NULL;
4081    }
4082
4083  return suffix;
4084}
4085
4086/* Handle interrupts and hangups. */
4087
4088static void
4089sighandler (int sig)
4090{
4091  if (! SA_NOCLDSTOP)
4092    signal (sig, SIG_IGN);
4093
4094  cleanup ();
4095
4096  signal (sig, SIG_DFL);
4097  raise (sig);
4098}
4099
4100/* Set the ordering options for KEY specified in S.
4101   Return the address of the first character in S that
4102   is not a valid ordering option.
4103   BLANKTYPE is the kind of blanks that 'b' should skip. */
4104
4105static char *
4106set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype)
4107{
4108  while (*s)
4109    {
4110      switch (*s)
4111        {
4112        case 'b':
4113          if (blanktype == bl_start || blanktype == bl_both)
4114            key->skipsblanks = true;
4115          if (blanktype == bl_end || blanktype == bl_both)
4116            key->skipeblanks = true;
4117          break;
4118        case 'd':
4119          key->ignore = nondictionary;
4120          break;
4121        case 'f':
4122          key->translate = fold_toupper;
4123          break;
4124        case 'g':
4125          key->general_numeric = true;
4126          break;
4127        case 'h':
4128          key->human_numeric = true;
4129          break;
4130        case 'i':
4131          /* Option order should not matter, so don't let -i override
4132             -d.  -d implies -i, but -i does not imply -d.  */
4133          if (! key->ignore)
4134            key->ignore = nonprinting;
4135          break;
4136        case 'M':
4137          key->month = true;
4138          break;
4139        case 'n':
4140          key->numeric = true;
4141          break;
4142        case 'R':
4143          key->random = true;
4144          break;
4145        case 'r':
4146          key->reverse = true;
4147          break;
4148        case 'V':
4149          key->version = true;
4150          break;
4151        default:
4152          return (char *) s;
4153        }
4154      ++s;
4155    }
4156  return (char *) s;
4157}
4158
4159/* Initialize KEY.  */
4160
4161static struct keyfield *
4162key_init (struct keyfield *key)
4163{
4164  memset (key, 0, sizeof *key);
4165  key->eword = SIZE_MAX;
4166  return key;
4167}
4168
4169int
4170main (int argc, char **argv)
4171{
4172  struct keyfield *key;
4173  struct keyfield key_buf;
4174  struct keyfield gkey;
4175  bool gkey_only = false;
4176  char const *s;
4177  int c = 0;
4178  char checkonly = 0;
4179  bool mergeonly = false;
4180  char *random_source = NULL;
4181  bool need_random = false;
4182  size_t nthreads = 0;
4183  size_t nfiles = 0;
4184  bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
4185  int posix_ver = posix2_version ();
4186  bool traditional_usage = ! (200112 <= posix_ver && posix_ver < 200809);
4187  char **files;
4188  char *files_from = NULL;
4189  struct Tokens tok;
4190  char const *outfile = NULL;
4191  bool locale_ok;
4192
4193  initialize_main (&argc, &argv);
4194  set_program_name (argv[0]);
4195  locale_ok = !! setlocale (LC_ALL, "");
4196  bindtextdomain (PACKAGE, LOCALEDIR);
4197  textdomain (PACKAGE);
4198
4199  initialize_exit_failure (SORT_FAILURE);
4200
4201  hard_LC_COLLATE = hard_locale (LC_COLLATE);
4202#if HAVE_NL_LANGINFO
4203  hard_LC_TIME = hard_locale (LC_TIME);
4204#endif
4205
4206  /* Get locale's representation of the decimal point.  */
4207  {
4208    struct lconv const *locale = localeconv ();
4209
4210    /* If the locale doesn't define a decimal point, or if the decimal
4211       point is multibyte, use the C locale's decimal point.  FIXME:
4212       add support for multibyte decimal points.  */
4213    decimal_point = to_uchar (locale->decimal_point[0]);
4214    if (! decimal_point || locale->decimal_point[1])
4215      decimal_point = '.';
4216
4217    /* FIXME: add support for multibyte thousands separators.  */
4218    thousands_sep = to_uchar (*locale->thousands_sep);
4219    if (! thousands_sep || locale->thousands_sep[1])
4220      thousands_sep = -1;
4221  }
4222
4223  have_read_stdin = false;
4224  inittables ();
4225
4226  {
4227    size_t i;
4228    static int const sig[] =
4229      {
4230        /* The usual suspects.  */
4231        SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
4232#ifdef SIGPOLL
4233        SIGPOLL,
4234#endif
4235#ifdef SIGPROF
4236        SIGPROF,
4237#endif
4238#ifdef SIGVTALRM
4239        SIGVTALRM,
4240#endif
4241#ifdef SIGXCPU
4242        SIGXCPU,
4243#endif
4244#ifdef SIGXFSZ
4245        SIGXFSZ,
4246#endif
4247      };
4248    enum { nsigs = ARRAY_CARDINALITY (sig) };
4249
4250#if SA_NOCLDSTOP
4251    struct sigaction act;
4252
4253    sigemptyset (&caught_signals);
4254    for (i = 0; i < nsigs; i++)
4255      {
4256        sigaction (sig[i], NULL, &act);
4257        if (act.sa_handler != SIG_IGN)
4258          sigaddset (&caught_signals, sig[i]);
4259      }
4260
4261    act.sa_handler = sighandler;
4262    act.sa_mask = caught_signals;
4263    act.sa_flags = 0;
4264
4265    for (i = 0; i < nsigs; i++)
4266      if (sigismember (&caught_signals, sig[i]))
4267        sigaction (sig[i], &act, NULL);
4268#else
4269    for (i = 0; i < nsigs; i++)
4270      if (signal (sig[i], SIG_IGN) != SIG_IGN)
4271        {
4272          signal (sig[i], sighandler);
4273          siginterrupt (sig[i], 1);
4274        }
4275#endif
4276  }
4277  signal (SIGCHLD, SIG_DFL); /* Don't inherit CHLD handling from parent.  */
4278
4279  /* The signal mask is known, so it is safe to invoke exit_cleanup.  */
4280  atexit (exit_cleanup);
4281
4282  key_init (&gkey);
4283  gkey.sword = SIZE_MAX;
4284
4285  files = xnmalloc (argc, sizeof *files);
4286
4287  while (true)
4288    {
4289      /* Parse an operand as a file after "--" was seen; or if
4290         pedantic and a file was seen, unless the POSIX version
4291         is not 1003.1-2001 and -c was not seen and the operand is
4292         "-o FILE" or "-oFILE".  */
4293      int oi = -1;
4294
4295      if (c == -1
4296          || (posixly_correct && nfiles != 0
4297              && ! (traditional_usage
4298                    && ! checkonly
4299                    && optind != argc
4300                    && argv[optind][0] == '-' && argv[optind][1] == 'o'
4301                    && (argv[optind][2] || optind + 1 != argc)))
4302          || ((c = getopt_long (argc, argv, short_options,
4303                                long_options, &oi))
4304              == -1))
4305        {
4306          if (argc <= optind)
4307            break;
4308          files[nfiles++] = argv[optind++];
4309        }
4310      else switch (c)
4311        {
4312        case 1:
4313          key = NULL;
4314          if (optarg[0] == '+')
4315            {
4316              bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
4317                                      && ISDIGIT (argv[optind][1]));
4318              traditional_usage |= minus_pos_usage && !posixly_correct;
4319              if (traditional_usage)
4320                {
4321                  /* Treat +POS1 [-POS2] as a key if possible; but silently
4322                     treat an operand as a file if it is not a valid +POS1.  */
4323                  key = key_init (&key_buf);
4324                  s = parse_field_count (optarg + 1, &key->sword, NULL);
4325                  if (s && *s == '.')
4326                    s = parse_field_count (s + 1, &key->schar, NULL);
4327                  if (! (key->sword || key->schar))
4328                    key->sword = SIZE_MAX;
4329                  if (! s || *set_ordering (s, key, bl_start))
4330                    key = NULL;
4331                  else
4332                    {
4333                      if (minus_pos_usage)
4334                        {
4335                          char const *optarg1 = argv[optind++];
4336                          s = parse_field_count (optarg1 + 1, &key->eword,
4337                                             N_("invalid number after '-'"));
4338                          /* When called with a non-NULL message ID,
4339                             parse_field_count cannot return NULL.  Tell static
4340                             analysis tools that dereferencing S is safe.  */
4341                          assert (s);
4342                          if (*s == '.')
4343                            s = parse_field_count (s + 1, &key->echar,
4344                                               N_("invalid number after '.'"));
4345                          if (!key->echar && key->eword)
4346                            {
4347                              /* obsolescent syntax +A.x -B.y is equivalent to:
4348                                   -k A+1.x+1,B.y   (when y = 0)
4349                                   -k A+1.x+1,B+1.y (when y > 0)
4350                                 So eword is decremented as in the -k case
4351                                 only when the end field (B) is specified and
4352                                 echar (y) is 0.  */
4353                              key->eword--;
4354                            }
4355                          if (*set_ordering (s, key, bl_end))
4356                            badfieldspec (optarg1,
4357                                      N_("stray character in field spec"));
4358                        }
4359                      key->traditional_used = true;
4360                      insertkey (key);
4361                    }
4362                }
4363            }
4364          if (! key)
4365            files[nfiles++] = optarg;
4366          break;
4367
4368        case SORT_OPTION:
4369          c = XARGMATCH ("--sort", optarg, sort_args, sort_types);
4370          /* Fall through. */
4371        case 'b':
4372        case 'd':
4373        case 'f':
4374        case 'g':
4375        case 'h':
4376        case 'i':
4377        case 'M':
4378        case 'n':
4379        case 'r':
4380        case 'R':
4381        case 'V':
4382          {
4383            char str[2];
4384            str[0] = c;
4385            str[1] = '\0';
4386            set_ordering (str, &gkey, bl_both);
4387          }
4388          break;
4389
4390        case CHECK_OPTION:
4391          c = (optarg
4392               ? XARGMATCH ("--check", optarg, check_args, check_types)
4393               : 'c');
4394          /* Fall through.  */
4395        case 'c':
4396        case 'C':
4397          if (checkonly && checkonly != c)
4398            incompatible_options ("cC");
4399          checkonly = c;
4400          break;
4401
4402        case COMPRESS_PROGRAM_OPTION:
4403          if (compress_program && !STREQ (compress_program, optarg))
4404            error (SORT_FAILURE, 0, _("multiple compress programs specified"));
4405          compress_program = optarg;
4406          break;
4407
4408        case DEBUG_PROGRAM_OPTION:
4409          debug = true;
4410          break;
4411
4412        case FILES0_FROM_OPTION:
4413          files_from = optarg;
4414          break;
4415
4416        case 'k':
4417          key = key_init (&key_buf);
4418
4419          /* Get POS1. */
4420          s = parse_field_count (optarg, &key->sword,
4421                                 N_("invalid number at field start"));
4422          if (! key->sword--)
4423            {
4424              /* Provoke with 'sort -k0' */
4425              badfieldspec (optarg, N_("field number is zero"));
4426            }
4427          if (*s == '.')
4428            {
4429              s = parse_field_count (s + 1, &key->schar,
4430                                     N_("invalid number after '.'"));
4431              if (! key->schar--)
4432                {
4433                  /* Provoke with 'sort -k1.0' */
4434                  badfieldspec (optarg, N_("character offset is zero"));
4435                }
4436            }
4437          if (! (key->sword || key->schar))
4438            key->sword = SIZE_MAX;
4439          s = set_ordering (s, key, bl_start);
4440          if (*s != ',')
4441            {
4442              key->eword = SIZE_MAX;
4443              key->echar = 0;
4444            }
4445          else
4446            {
4447              /* Get POS2. */
4448              s = parse_field_count (s + 1, &key->eword,
4449                                     N_("invalid number after ','"));
4450              if (! key->eword--)
4451                {
4452                  /* Provoke with 'sort -k1,0' */
4453                  badfieldspec (optarg, N_("field number is zero"));
4454                }
4455              if (*s == '.')
4456                {
4457                  s = parse_field_count (s + 1, &key->echar,
4458                                         N_("invalid number after '.'"));
4459                }
4460              s = set_ordering (s, key, bl_end);
4461            }
4462          if (*s)
4463            badfieldspec (optarg, N_("stray character in field spec"));
4464          insertkey (key);
4465          break;
4466
4467        case 'm':
4468          mergeonly = true;
4469          break;
4470
4471        case NMERGE_OPTION:
4472          specify_nmerge (oi, c, optarg);
4473          break;
4474
4475        case 'o':
4476          if (outfile && !STREQ (outfile, optarg))
4477            error (SORT_FAILURE, 0, _("multiple output files specified"));
4478          outfile = optarg;
4479          break;
4480
4481        case RANDOM_SOURCE_OPTION:
4482          if (random_source && !STREQ (random_source, optarg))
4483            error (SORT_FAILURE, 0, _("multiple random sources specified"));
4484          random_source = optarg;
4485          break;
4486
4487        case 's':
4488          stable = true;
4489          break;
4490
4491        case 'S':
4492          specify_sort_size (oi, c, optarg);
4493          break;
4494
4495        case 't':
4496          {
4497            char newtab = optarg[0];
4498            if (! newtab)
4499              error (SORT_FAILURE, 0, _("empty tab"));
4500            if (optarg[1])
4501              {
4502                if (STREQ (optarg, "\\0"))
4503                  newtab = '\0';
4504                else
4505                  {
4506                    /* Provoke with 'sort -txx'.  Complain about
4507                       "multi-character tab" instead of "multibyte tab", so
4508                       that the diagnostic's wording does not need to be
4509                       changed once multibyte characters are supported.  */
4510                    error (SORT_FAILURE, 0, _("multi-character tab %s"),
4511                           quote (optarg));
4512                  }
4513              }
4514            if (tab != TAB_DEFAULT && tab != newtab)
4515              error (SORT_FAILURE, 0, _("incompatible tabs"));
4516            tab = newtab;
4517          }
4518          break;
4519
4520        case 'T':
4521          add_temp_dir (optarg);
4522          break;
4523
4524        case PARALLEL_OPTION:
4525          nthreads = specify_nthreads (oi, c, optarg);
4526          break;
4527
4528        case 'u':
4529          unique = true;
4530          break;
4531
4532        case 'y':
4533          /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
4534             through Solaris 7.  It is also accepted by many non-Solaris
4535             "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
4536             -y is marked as obsolete starting with Solaris 8 (1999), but is
4537             still accepted as of Solaris 10 prerelease (2004).
4538
4539             Solaris 2.5.1 "sort -y 100" reads the input file "100", but
4540             emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
4541             and which in general ignores the argument after "-y" if it
4542             consists entirely of digits (it can even be empty).  */
4543          if (optarg == argv[optind - 1])
4544            {
4545              char const *p;
4546              for (p = optarg; ISDIGIT (*p); p++)
4547                continue;
4548              optind -= (*p != '\0');
4549            }
4550          break;
4551
4552        case 'z':
4553          eolchar = 0;
4554          break;
4555
4556        case_GETOPT_HELP_CHAR;
4557
4558        case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
4559
4560        default:
4561          usage (SORT_FAILURE);
4562        }
4563    }
4564
4565  if (files_from)
4566    {
4567      FILE *stream;
4568
4569      /* When using --files0-from=F, you may not specify any files
4570         on the command-line.  */
4571      if (nfiles)
4572        {
4573          error (0, 0, _("extra operand %s"), quoteaf (files[0]));
4574          fprintf (stderr, "%s\n",
4575                   _("file operands cannot be combined with --files0-from"));
4576          usage (SORT_FAILURE);
4577        }
4578
4579      if (STREQ (files_from, "-"))
4580        stream = stdin;
4581      else
4582        {
4583          stream = fopen (files_from, "r");
4584          if (stream == NULL)
4585            error (SORT_FAILURE, errno, _("cannot open %s for reading"),
4586                   quoteaf (files_from));
4587        }
4588
4589      readtokens0_init (&tok);
4590
4591      if (! readtokens0 (stream, &tok) || fclose (stream) != 0)
4592        error (SORT_FAILURE, 0, _("cannot read file names from %s"),
4593               quoteaf (files_from));
4594
4595      if (tok.n_tok)
4596        {
4597          size_t i;
4598          free (files);
4599          files = tok.tok;
4600          nfiles = tok.n_tok;
4601          for (i = 0; i < nfiles; i++)
4602            {
4603              if (STREQ (files[i], "-"))
4604                error (SORT_FAILURE, 0, _("when reading file names from stdin, "
4605                                          "no file name of %s allowed"),
4606                       quoteaf (files[i]));
4607              else if (files[i][0] == '\0')
4608                {
4609                  /* Using the standard 'filename:line-number:' prefix here is
4610                     not totally appropriate, since NUL is the separator,
4611                     not NL, but it might be better than nothing.  */
4612                  unsigned long int file_number = i + 1;
4613                  error (SORT_FAILURE, 0,
4614                         _("%s:%lu: invalid zero-length file name"),
4615                         quotef (files_from), file_number);
4616                }
4617            }
4618        }
4619      else
4620        error (SORT_FAILURE, 0, _("no input from %s"),
4621               quoteaf (files_from));
4622    }
4623
4624  /* Inheritance of global options to individual keys. */
4625  for (key = keylist; key; key = key->next)
4626    {
4627      if (default_key_compare (key) && !key->reverse)
4628        {
4629          key->ignore = gkey.ignore;
4630          key->translate = gkey.translate;
4631          key->skipsblanks = gkey.skipsblanks;
4632          key->skipeblanks = gkey.skipeblanks;
4633          key->month = gkey.month;
4634          key->numeric = gkey.numeric;
4635          key->general_numeric = gkey.general_numeric;
4636          key->human_numeric = gkey.human_numeric;
4637          key->version = gkey.version;
4638          key->random = gkey.random;
4639          key->reverse = gkey.reverse;
4640        }
4641
4642      need_random |= key->random;
4643    }
4644
4645  if (!keylist && !default_key_compare (&gkey))
4646    {
4647      gkey_only = true;
4648      insertkey (&gkey);
4649      need_random |= gkey.random;
4650    }
4651
4652  check_ordering_compatibility ();
4653
4654  if (debug)
4655    {
4656      if (checkonly || outfile)
4657        {
4658          static char opts[] = "X --debug";
4659          opts[0] = (checkonly ? checkonly : 'o');
4660          incompatible_options (opts);
4661        }
4662
4663      /* Always output the locale in debug mode, since this
4664         is such a common source of confusion.  */
4665      if (hard_LC_COLLATE)
4666        error (0, 0, _("using %s sorting rules"),
4667               quote (setlocale (LC_COLLATE, NULL)));
4668      else
4669        {
4670          /* OpenBSD can only set some categories with LC_ALL above,
4671             so set LC_COLLATE explicitly to flag errors.  */
4672          if (locale_ok)
4673            locale_ok = !! setlocale (LC_COLLATE, "");
4674          error (0, 0, "%s%s", locale_ok ? "" : _("failed to set locale; "),
4675                 _("using simple byte comparison"));
4676        }
4677      key_warnings (&gkey, gkey_only);
4678    }
4679
4680  reverse = gkey.reverse;
4681
4682  if (need_random)
4683    random_md5_state_init (random_source);
4684
4685  if (temp_dir_count == 0)
4686    {
4687      char const *tmp_dir = getenv ("TMPDIR");
4688      add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
4689    }
4690
4691  if (nfiles == 0)
4692    {
4693      nfiles = 1;
4694      free (files);
4695      files = xmalloc (sizeof *files);
4696      *files = (char *) "-";
4697    }
4698
4699  /* Need to re-check that we meet the minimum requirement for memory
4700     usage with the final value for NMERGE. */
4701  if (0 < sort_size)
4702    sort_size = MAX (sort_size, MIN_SORT_SIZE);
4703
4704  if (checkonly)
4705    {
4706      if (nfiles > 1)
4707        error (SORT_FAILURE, 0, _("extra operand %s not allowed with -%c"),
4708               quoteaf (files[1]), checkonly);
4709
4710      if (outfile)
4711        {
4712          static char opts[] = {0, 'o', 0};
4713          opts[0] = checkonly;
4714          incompatible_options (opts);
4715        }
4716
4717      /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
4718         input is not properly sorted.  */
4719      return check (files[0], checkonly) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER;
4720    }
4721
4722  /* Check all inputs are accessible, or exit immediately.  */
4723  check_inputs (files, nfiles);
4724
4725  /* Check output is writable, or exit immediately.  */
4726  check_output (outfile);
4727
4728  if (mergeonly)
4729    {
4730      struct sortfile *sortfiles = xcalloc (nfiles, sizeof *sortfiles);
4731      size_t i;
4732
4733      for (i = 0; i < nfiles; ++i)
4734        sortfiles[i].name = files[i];
4735
4736      merge (sortfiles, 0, nfiles, outfile);
4737      IF_LINT (free (sortfiles));
4738    }
4739  else
4740    {
4741      if (!nthreads)
4742        {
4743          unsigned long int np = num_processors (NPROC_CURRENT_OVERRIDABLE);
4744          nthreads = MIN (np, DEFAULT_MAX_THREADS);
4745        }
4746
4747      /* Avoid integer overflow later.  */
4748      size_t nthreads_max = SIZE_MAX / (2 * sizeof (struct merge_node));
4749      nthreads = MIN (nthreads, nthreads_max);
4750
4751      sort (files, nfiles, outfile, nthreads);
4752    }
4753
4754#ifdef lint
4755  if (files_from)
4756    readtokens0_free (&tok);
4757  else
4758    free (files);
4759#endif
4760
4761  if (have_read_stdin && fclose (stdin) == EOF)
4762    die (_("close failed"), "-");
4763
4764  return EXIT_SUCCESS;
4765}
4766