1/* Shuffle lines of text.
2
3   Copyright (C) 2006-2017 Free Software Foundation, Inc.
4
5   This program is free software: you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation, either version 3 of the License, or
8   (at your option) any later version.
9
10   This program is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14
15   You should have received a copy of the GNU General Public License
16   along with this program.  If not, see <http://www.gnu.org/licenses/>.
17
18   Written by Paul Eggert.  */
19
20#include <config.h>
21
22#include <sys/types.h>
23#include "system.h"
24
25#include "die.h"
26#include "error.h"
27#include "fadvise.h"
28#include "getopt.h"
29#include "linebuffer.h"
30#include "quote.h"
31#include "randint.h"
32#include "randperm.h"
33#include "read-file.h"
34#include "stdio--.h"
35#include "xdectoint.h"
36#include "xstrtol.h"
37
38/* The official name of this program (e.g., no 'g' prefix).  */
39#define PROGRAM_NAME "shuf"
40
41#define AUTHORS proper_name ("Paul Eggert")
42
43/* For reservoir-sampling, allocate the reservoir lines in batches.  */
44enum { RESERVOIR_LINES_INCREMENT = 1024 };
45
46/* reservoir-sampling introduces CPU overhead for small inputs.
47   So only enable it for inputs >= this limit.
48   This limit was determined using these commands:
49     $ for p in $(seq 7); do src/seq $((10**$p)) > 10p$p.in; done
50     $ for p in $(seq 7); do time shuf-nores -n10 10p$p.in >/dev/null; done
51     $ for p in $(seq 7); do time shuf -n10 10p$p.in >/dev/null; done  .*/
52enum { RESERVOIR_MIN_INPUT = 8192 * 1024 };
53
54
55void
56usage (int status)
57{
58  if (status != EXIT_SUCCESS)
59    emit_try_help ();
60  else
61    {
62      printf (_("\
63Usage: %s [OPTION]... [FILE]\n\
64  or:  %s -e [OPTION]... [ARG]...\n\
65  or:  %s -i LO-HI [OPTION]...\n\
66"),
67              program_name, program_name, program_name);
68      fputs (_("\
69Write a random permutation of the input lines to standard output.\n\
70"), stdout);
71
72      emit_stdin_note ();
73      emit_mandatory_arg_note ();
74
75      fputs (_("\
76  -e, --echo                treat each ARG as an input line\n\
77  -i, --input-range=LO-HI   treat each number LO through HI as an input line\n\
78  -n, --head-count=COUNT    output at most COUNT lines\n\
79  -o, --output=FILE         write result to FILE instead of standard output\n\
80      --random-source=FILE  get random bytes from FILE\n\
81  -r, --repeat              output lines can be repeated\n\
82"), stdout);
83      fputs (_("\
84  -z, --zero-terminated     line delimiter is NUL, not newline\n\
85"), stdout);
86      fputs (HELP_OPTION_DESCRIPTION, stdout);
87      fputs (VERSION_OPTION_DESCRIPTION, stdout);
88      emit_ancillary_info (PROGRAM_NAME);
89    }
90
91  exit (status);
92}
93
94/* For long options that have no equivalent short option, use a
95   non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
96enum
97{
98  RANDOM_SOURCE_OPTION = CHAR_MAX + 1
99};
100
101static struct option const long_opts[] =
102{
103  {"echo", no_argument, NULL, 'e'},
104  {"input-range", required_argument, NULL, 'i'},
105  {"head-count", required_argument, NULL, 'n'},
106  {"output", required_argument, NULL, 'o'},
107  {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
108  {"repeat", no_argument, NULL, 'r'},
109  {"zero-terminated", no_argument, NULL, 'z'},
110  {GETOPT_HELP_OPTION_DECL},
111  {GETOPT_VERSION_OPTION_DECL},
112  {0, 0, 0, 0},
113};
114
115static void
116input_from_argv (char **operand, int n_operands, char eolbyte)
117{
118  char *p;
119  size_t size = n_operands;
120  int i;
121
122  for (i = 0; i < n_operands; i++)
123    size += strlen (operand[i]);
124  p = xmalloc (size);
125
126  for (i = 0; i < n_operands; i++)
127    {
128      char *p1 = stpcpy (p, operand[i]);
129      operand[i] = p;
130      p = p1;
131      *p++ = eolbyte;
132    }
133
134  operand[n_operands] = p;
135}
136
137/* Return the start of the next line after LINE.  The current line
138   ends in EOLBYTE, and is guaranteed to end before LINE + N.  */
139
140static char *
141next_line (char *line, char eolbyte, size_t n)
142{
143  char *p = memchr (line, eolbyte, n);
144  return p + 1;
145}
146
147/* Return the size of the input if possible or OFF_T_MAX if not.  */
148
149static off_t
150input_size (void)
151{
152  off_t file_size;
153
154  struct stat stat_buf;
155  if (fstat (STDIN_FILENO, &stat_buf) != 0)
156    return OFF_T_MAX;
157  if (usable_st_size (&stat_buf))
158    file_size = stat_buf.st_size;
159  else
160    return OFF_T_MAX;
161
162  off_t input_offset = lseek (STDIN_FILENO, 0, SEEK_CUR);
163  if (input_offset < 0)
164    return OFF_T_MAX;
165
166  file_size -= input_offset;
167
168  return file_size;
169}
170
171/* Read all lines and store up to K permuted lines in *OUT_RSRV.
172   Return the number of lines read, up to a maximum of K.  */
173
174static size_t
175read_input_reservoir_sampling (FILE *in, char eolbyte, size_t k,
176                               struct randint_source *s,
177                               struct linebuffer **out_rsrv)
178{
179  randint n_lines = 0;
180  size_t n_alloc_lines = MIN (k, RESERVOIR_LINES_INCREMENT);
181  struct linebuffer *line = NULL;
182  struct linebuffer *rsrv;
183
184  rsrv = xcalloc (n_alloc_lines, sizeof (struct linebuffer));
185
186  /* Fill the first K lines, directly into the reservoir.  */
187  while (n_lines < k
188         && (line =
189             readlinebuffer_delim (&rsrv[n_lines], in, eolbyte)) != NULL)
190    {
191      n_lines++;
192
193      /* Enlarge reservoir.  */
194      if (n_lines >= n_alloc_lines)
195        {
196          n_alloc_lines += RESERVOIR_LINES_INCREMENT;
197          rsrv = xnrealloc (rsrv, n_alloc_lines, sizeof (struct linebuffer));
198          memset (&rsrv[n_lines], 0,
199                  RESERVOIR_LINES_INCREMENT * sizeof (struct linebuffer));
200        }
201    }
202
203  /* last line wasn't NULL - so there may be more lines to read.  */
204  if (line != NULL)
205    {
206      struct linebuffer dummy;
207      initbuffer (&dummy);  /* space for lines not put in reservoir.  */
208
209      /* Choose the fate of the next line, with decreasing probability (as
210         n_lines increases in size).
211
212         If the line will be used, store it directly in the reservoir.
213         Otherwise, store it in dummy space.
214
215         With 'struct linebuffer', storing into existing buffer will reduce
216         re-allocations (will only re-allocate if the new line is longer than
217         the currently allocated space).  */
218      do
219        {
220          randint j = randint_choose (s, n_lines + 1);  /* 0 .. n_lines.  */
221          line = (j < k) ? (&rsrv[j]) : (&dummy);
222        }
223      while (readlinebuffer_delim (line, in, eolbyte) != NULL && n_lines++);
224
225      if (! n_lines)
226        die (EXIT_FAILURE, EOVERFLOW, _("too many input lines"));
227
228      freebuffer (&dummy);
229    }
230
231  /* no more input lines, or an input error.  */
232  if (ferror (in))
233    die (EXIT_FAILURE, errno, _("read error"));
234
235  *out_rsrv = rsrv;
236  return MIN (k, n_lines);
237}
238
239static int
240write_permuted_output_reservoir (size_t n_lines, struct linebuffer *lines,
241                                 size_t const *permutation)
242{
243  for (size_t i = 0; i < n_lines; i++)
244    {
245      const struct linebuffer *p = &lines[permutation[i]];
246      if (fwrite (p->buffer, sizeof (char), p->length, stdout) != p->length)
247        return -1;
248    }
249
250  return 0;
251}
252
253/* Read data from file IN.  Input lines are delimited by EOLBYTE;
254   silently append a trailing EOLBYTE if the file ends in some other
255   byte.  Store a pointer to the resulting array of lines into *PLINE.
256   Return the number of lines read.  Report an error and exit on
257   failure.  */
258
259static size_t
260read_input (FILE *in, char eolbyte, char ***pline)
261{
262  char *p;
263  char *buf = NULL;
264  size_t used;
265  char *lim;
266  char **line;
267  size_t n_lines;
268
269  /* TODO: We should limit the amount of data read here,
270     to less than RESERVOIR_MIN_INPUT.  I.e., adjust fread_file() to support
271     taking a byte limit.  We'd then need to ensure we handle a line spanning
272     this boundary.  With that in place we could set use_reservoir_sampling
273     when used==RESERVOIR_MIN_INPUT, and have read_input_reservoir_sampling()
274     call a wrapper function to populate a linebuffer from the internal pline
275     or if none left, stdin.  Doing that would give better performance by
276     avoiding the reservoir CPU overhead when reading < RESERVOIR_MIN_INPUT
277     from a pipe, and allow us to dispense with the input_size() function.  */
278  if (!(buf = fread_file (in, &used)))
279    die (EXIT_FAILURE, errno, _("read error"));
280
281  if (used && buf[used - 1] != eolbyte)
282    buf[used++] = eolbyte;
283
284  lim = buf + used;
285
286  n_lines = 0;
287  for (p = buf; p < lim; p = next_line (p, eolbyte, lim - p))
288    n_lines++;
289
290  *pline = line = xnmalloc (n_lines + 1, sizeof *line);
291
292  line[0] = p = buf;
293  for (size_t i = 1; i <= n_lines; i++)
294    line[i] = p = next_line (p, eolbyte, lim - p);
295
296  return n_lines;
297}
298
299/* Output N_LINES lines to stdout from LINE array,
300   chosen by the indices in PERMUTATION.
301   PERMUTATION and LINE must have at least N_LINES elements.
302   Strings in LINE must include the line-terminator character.  */
303static int
304write_permuted_lines (size_t n_lines, char *const *line,
305                      size_t const *permutation)
306{
307  for (size_t i = 0; i < n_lines; i++)
308    {
309      char *const *p = line + permutation[i];
310      size_t len = p[1] - p[0];
311      if (fwrite (p[0], sizeof *p[0], len, stdout) != len)
312        return -1;
313    }
314
315  return 0;
316}
317
318/* Output N_LINES of numbers to stdout, from PERMUTATION array.
319   PERMUTATION must have at least N_LINES elements.  */
320static int
321write_permuted_numbers (size_t n_lines, size_t lo_input,
322                        size_t const *permutation, char eolbyte)
323{
324  for (size_t i = 0; i < n_lines; i++)
325    {
326      unsigned long int n = lo_input + permutation[i];
327      if (printf ("%lu%c", n, eolbyte) < 0)
328        return -1;
329    }
330
331  return 0;
332}
333
334/* Output COUNT numbers to stdout, chosen randomly from range
335   LO_INPUT through HI_INPUT.  */
336static int
337write_random_numbers (struct randint_source *s, size_t count,
338                      size_t lo_input, size_t hi_input, char eolbyte)
339{
340  const randint range = hi_input - lo_input + 1;
341
342  for (size_t i = 0; i < count; i++)
343    {
344      unsigned long int j = lo_input + randint_choose (s, range);
345      if (printf ("%lu%c", j, eolbyte) < 0)
346        return -1;
347    }
348
349  return 0;
350}
351
352/* Output COUNT lines to stdout from LINES array.
353   LINES must have at least N_LINES elements in it.
354   Strings in LINES_ must include the line-terminator character.  */
355static int
356write_random_lines (struct randint_source *s, size_t count,
357                    char *const *lines, size_t n_lines)
358{
359  for (size_t i = 0; i < count; i++)
360    {
361      const randint j = randint_choose (s, n_lines);
362      char *const *p = lines + j;
363      size_t len = p[1] - p[0];
364      if (fwrite (p[0], sizeof *p[0], len, stdout) != len)
365        return -1;
366    }
367
368  return 0;
369}
370
371int
372main (int argc, char **argv)
373{
374  bool echo = false;
375  bool input_range = false;
376  size_t lo_input = SIZE_MAX;
377  size_t hi_input = 0;
378  size_t head_lines = SIZE_MAX;
379  char const *outfile = NULL;
380  char *random_source = NULL;
381  char eolbyte = '\n';
382  char **input_lines = NULL;
383  bool use_reservoir_sampling = false;
384  bool repeat = false;
385
386  int optc;
387  int n_operands;
388  char **operand;
389  size_t n_lines;
390  char **line = NULL;
391  struct linebuffer *reservoir = NULL;
392  struct randint_source *randint_source;
393  size_t *permutation = NULL;
394  int i;
395
396  initialize_main (&argc, &argv);
397  set_program_name (argv[0]);
398  setlocale (LC_ALL, "");
399  bindtextdomain (PACKAGE, LOCALEDIR);
400  textdomain (PACKAGE);
401
402  atexit (close_stdout);
403
404  while ((optc = getopt_long (argc, argv, "ei:n:o:rz", long_opts, NULL)) != -1)
405    switch (optc)
406      {
407      case 'e':
408        echo = true;
409        break;
410
411      case 'i':
412        {
413          char *p = strchr (optarg, '-');
414          char const *hi_optarg = optarg;
415          bool invalid = !p;
416
417          if (input_range)
418            die (EXIT_FAILURE, 0, _("multiple -i options specified"));
419          input_range = true;
420
421          if (p)
422            {
423              *p = '\0';
424              lo_input = xdectoumax (optarg, 0, SIZE_MAX, "",
425                                     _("invalid input range"), 0);
426              *p = '-';
427              hi_optarg = p + 1;
428            }
429
430          hi_input = xdectoumax (hi_optarg, 0, SIZE_MAX, "",
431                                 _("invalid input range"), 0);
432
433          n_lines = hi_input - lo_input + 1;
434          invalid |= ((lo_input <= hi_input) == (n_lines == 0));
435          if (invalid)
436            die (EXIT_FAILURE, errno, "%s: %s", _("invalid input range"),
437                 quote (optarg));
438        }
439        break;
440
441      case 'n':
442        {
443          unsigned long int argval;
444          strtol_error e = xstrtoul (optarg, NULL, 10, &argval, NULL);
445
446          if (e == LONGINT_OK)
447            head_lines = MIN (head_lines, argval);
448          else if (e != LONGINT_OVERFLOW)
449            die (EXIT_FAILURE, 0, _("invalid line count: %s"),
450                 quote (optarg));
451        }
452        break;
453
454      case 'o':
455        if (outfile && !STREQ (outfile, optarg))
456          die (EXIT_FAILURE, 0, _("multiple output files specified"));
457        outfile = optarg;
458        break;
459
460      case RANDOM_SOURCE_OPTION:
461        if (random_source && !STREQ (random_source, optarg))
462          die (EXIT_FAILURE, 0, _("multiple random sources specified"));
463        random_source = optarg;
464        break;
465
466      case 'r':
467        repeat = true;
468        break;
469
470      case 'z':
471        eolbyte = '\0';
472        break;
473
474      case_GETOPT_HELP_CHAR;
475      case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
476      default:
477        usage (EXIT_FAILURE);
478      }
479
480  n_operands = argc - optind;
481  operand = argv + optind;
482
483  /* Check invalid usage.  */
484  if (echo && input_range)
485    {
486      error (0, 0, _("cannot combine -e and -i options"));
487      usage (EXIT_FAILURE);
488    }
489  if (input_range ? 0 < n_operands : !echo && 1 < n_operands)
490    {
491      error (0, 0, _("extra operand %s"), quote (operand[!input_range]));
492      usage (EXIT_FAILURE);
493    }
494
495  /* Prepare input.  */
496  if (echo)
497    {
498      input_from_argv (operand, n_operands, eolbyte);
499      n_lines = n_operands;
500      line = operand;
501    }
502  else if (input_range)
503    {
504      n_lines = hi_input - lo_input + 1;
505      line = NULL;
506    }
507  else
508    {
509      /* If an input file is specified, re-open it as stdin.  */
510      if (n_operands == 1)
511        if (! (STREQ (operand[0], "-") || ! head_lines
512               || freopen (operand[0], "r", stdin)))
513          die (EXIT_FAILURE, errno, "%s", quotef (operand[0]));
514
515      fadvise (stdin, FADVISE_SEQUENTIAL);
516
517      if (! repeat && head_lines != SIZE_MAX
518          && (! head_lines || input_size () > RESERVOIR_MIN_INPUT))
519        {
520          use_reservoir_sampling = true;
521          n_lines = SIZE_MAX;   /* unknown number of input lines, for now.  */
522        }
523      else
524        {
525          n_lines = read_input (stdin, eolbyte, &input_lines);
526          line = input_lines;
527        }
528    }
529
530  if (! repeat)
531    head_lines = MIN (head_lines, n_lines);
532
533  randint_source = randint_all_new (random_source,
534                                    (use_reservoir_sampling || repeat
535                                     ? SIZE_MAX
536                                     : randperm_bound (head_lines, n_lines)));
537  if (! randint_source)
538    die (EXIT_FAILURE, errno, "%s", quotef (random_source));
539
540  if (use_reservoir_sampling)
541    {
542      /* Instead of reading the entire file into 'line',
543         use reservoir-sampling to store just "head_lines" random lines.  */
544      n_lines = read_input_reservoir_sampling (stdin, eolbyte, head_lines,
545                                               randint_source, &reservoir);
546      head_lines = n_lines;
547    }
548
549  /* Close stdin now, rather than earlier, so that randint_all_new
550     doesn't have to worry about opening something other than
551     stdin.  */
552  if (! (echo || input_range)
553      && (fclose (stdin) != 0))
554    die (EXIT_FAILURE, errno, _("read error"));
555
556  if (!repeat)
557    permutation = randperm_new (randint_source, head_lines, n_lines);
558
559  if (outfile && ! freopen (outfile, "w", stdout))
560    die (EXIT_FAILURE, errno, "%s", quotef (outfile));
561
562  /* Generate output according to requested method */
563  if (repeat)
564    {
565      if (head_lines == 0)
566        i = 0;
567      else
568        {
569          if (n_lines == 0)
570            die (EXIT_FAILURE, 0, _("no lines to repeat"));
571          if (input_range)
572            i = write_random_numbers (randint_source, head_lines,
573                                      lo_input, hi_input, eolbyte);
574          else
575            i = write_random_lines (randint_source, head_lines, line, n_lines);
576        }
577    }
578  else
579    {
580      if (use_reservoir_sampling)
581        i = write_permuted_output_reservoir (n_lines, reservoir, permutation);
582      else if (input_range)
583        i = write_permuted_numbers (head_lines, lo_input,
584                                    permutation, eolbyte);
585      else
586        i = write_permuted_lines (head_lines, line, permutation);
587    }
588
589  if (i != 0)
590    die (EXIT_FAILURE, errno, _("write error"));
591
592#ifdef lint
593  free (permutation);
594  randint_all_free (randint_source);
595  if (input_lines)
596    {
597      free (input_lines[0]);
598      free (input_lines);
599    }
600  if (reservoir)
601    {
602      size_t j;
603      for (j = 0; j < n_lines; ++j)
604        freebuffer (&reservoir[j]);
605      free (reservoir);
606    }
607#endif
608
609  return EXIT_SUCCESS;
610}
611