1/* Shuffle lines of text.
2
3   Copyright (C) 2006-2014 Free Software Foundation, Inc.
4
5   This program is free software: you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation, either version 3 of the License, or
8   (at your option) any later version.
9
10   This program is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14
15   You should have received a copy of the GNU General Public License
16   along with this program.  If not, see <http://www.gnu.org/licenses/>.
17
18   Written by Paul Eggert.  */
19
20#include <config.h>
21
22#include <sys/types.h>
23#include "system.h"
24
25#include "error.h"
26#include "fadvise.h"
27#include "getopt.h"
28#include "linebuffer.h"
29#include "quote.h"
30#include "quotearg.h"
31#include "randint.h"
32#include "randperm.h"
33#include "read-file.h"
34#include "stdio--.h"
35#include "xdectoint.h"
36#include "xstrtol.h"
37
38/* The official name of this program (e.g., no 'g' prefix).  */
39#define PROGRAM_NAME "shuf"
40
41#define AUTHORS proper_name ("Paul Eggert")
42
43/* For reservoir-sampling, allocate the reservoir lines in batches.  */
44enum { RESERVOIR_LINES_INCREMENT = 1024 };
45
46/* reservoir-sampling introduces CPU overhead for small inputs.
47   So only enable it for inputs >= this limit.
48   This limit was determined using these commands:
49     $ for p in $(seq 7); do src/seq $((10**$p)) > 10p$p.in; done
50     $ for p in $(seq 7); do time shuf-nores -n10 10p$p.in >/dev/null; done
51     $ for p in $(seq 7); do time shuf -n10 10p$p.in >/dev/null; done  .*/
52enum { RESERVOIR_MIN_INPUT = 8192 * 1024 };
53
54
55void
56usage (int status)
57{
58  if (status != EXIT_SUCCESS)
59    emit_try_help ();
60  else
61    {
62      printf (_("\
63Usage: %s [OPTION]... [FILE]\n\
64  or:  %s -e [OPTION]... [ARG]...\n\
65  or:  %s -i LO-HI [OPTION]...\n\
66"),
67              program_name, program_name, program_name);
68      fputs (_("\
69Write a random permutation of the input lines to standard output.\n\
70"), stdout);
71
72      emit_mandatory_arg_note ();
73
74      fputs (_("\
75  -e, --echo                treat each ARG as an input line\n\
76  -i, --input-range=LO-HI   treat each number LO through HI as an input line\n\
77  -n, --head-count=COUNT    output at most COUNT lines\n\
78  -o, --output=FILE         write result to FILE instead of standard output\n\
79      --random-source=FILE  get random bytes from FILE\n\
80  -r, --repeat              output lines can be repeated\n\
81"), stdout);
82      fputs (_("\
83  -z, --zero-terminated     line delimiter is NUL, not newline\n\
84"), stdout);
85      fputs (HELP_OPTION_DESCRIPTION, stdout);
86      fputs (VERSION_OPTION_DESCRIPTION, stdout);
87      fputs (_("\
88\n\
89With no FILE, or when FILE is -, read standard input.\n\
90"), stdout);
91      emit_ancillary_info (PROGRAM_NAME);
92    }
93
94  exit (status);
95}
96
97/* For long options that have no equivalent short option, use a
98   non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
99enum
100{
101  RANDOM_SOURCE_OPTION = CHAR_MAX + 1
102};
103
104static struct option const long_opts[] =
105{
106  {"echo", no_argument, NULL, 'e'},
107  {"input-range", required_argument, NULL, 'i'},
108  {"head-count", required_argument, NULL, 'n'},
109  {"output", required_argument, NULL, 'o'},
110  {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
111  {"repeat", no_argument, NULL, 'r'},
112  {"zero-terminated", no_argument, NULL, 'z'},
113  {GETOPT_HELP_OPTION_DECL},
114  {GETOPT_VERSION_OPTION_DECL},
115  {0, 0, 0, 0},
116};
117
118static void
119input_from_argv (char **operand, int n_operands, char eolbyte)
120{
121  char *p;
122  size_t size = n_operands;
123  int i;
124
125  for (i = 0; i < n_operands; i++)
126    size += strlen (operand[i]);
127  p = xmalloc (size);
128
129  for (i = 0; i < n_operands; i++)
130    {
131      char *p1 = stpcpy (p, operand[i]);
132      operand[i] = p;
133      p = p1;
134      *p++ = eolbyte;
135    }
136
137  operand[n_operands] = p;
138}
139
140/* Return the start of the next line after LINE.  The current line
141   ends in EOLBYTE, and is guaranteed to end before LINE + N.  */
142
143static char *
144next_line (char *line, char eolbyte, size_t n)
145{
146  char *p = memchr (line, eolbyte, n);
147  return p + 1;
148}
149
150/* Return the size of the input if possible or OFF_T_MAX if not.  */
151
152static off_t
153input_size (void)
154{
155  off_t file_size;
156
157  struct stat stat_buf;
158  if (fstat (STDIN_FILENO, &stat_buf) != 0)
159    return OFF_T_MAX;
160  if (usable_st_size (&stat_buf))
161    file_size = stat_buf.st_size;
162  else
163    return OFF_T_MAX;
164
165  off_t input_offset = lseek (STDIN_FILENO, 0, SEEK_CUR);
166  if (input_offset < 0)
167    return OFF_T_MAX;
168
169  file_size -= input_offset;
170
171  return file_size;
172}
173
174/* Read all lines and store up to K permuted lines in *OUT_RSRV.
175   Return the number of lines read, up to a maximum of K.  */
176
177static size_t
178read_input_reservoir_sampling (FILE *in, char eolbyte, size_t k,
179                               struct randint_source *s,
180                               struct linebuffer **out_rsrv)
181{
182  randint n_lines = 0;
183  size_t n_alloc_lines = MIN (k, RESERVOIR_LINES_INCREMENT);
184  struct linebuffer *line = NULL;
185  struct linebuffer *rsrv;
186
187  rsrv = xcalloc (n_alloc_lines, sizeof (struct linebuffer));
188
189  /* Fill the first K lines, directly into the reservoir.  */
190  while (n_lines < k
191         && (line =
192             readlinebuffer_delim (&rsrv[n_lines], in, eolbyte)) != NULL)
193    {
194      n_lines++;
195
196      /* Enlarge reservoir.  */
197      if (n_lines >= n_alloc_lines)
198        {
199          n_alloc_lines += RESERVOIR_LINES_INCREMENT;
200          rsrv = xnrealloc (rsrv, n_alloc_lines, sizeof (struct linebuffer));
201          memset (&rsrv[n_lines], 0,
202                  RESERVOIR_LINES_INCREMENT * sizeof (struct linebuffer));
203        }
204    }
205
206  /* last line wasn't NULL - so there may be more lines to read.  */
207  if (line != NULL)
208    {
209      struct linebuffer dummy;
210      initbuffer (&dummy);  /* space for lines not put in reservoir.  */
211
212      /* Choose the fate of the next line, with decreasing probability (as
213         n_lines increases in size).
214
215         If the line will be used, store it directly in the reservoir.
216         Otherwise, store it in dummy space.
217
218         With 'struct linebuffer', storing into existing buffer will reduce
219         re-allocations (will only re-allocate if the new line is longer than
220         the currently allocated space).  */
221      do
222        {
223          randint j = randint_choose (s, n_lines + 1);  /* 0 .. n_lines.  */
224          line = (j < k) ? (&rsrv[j]) : (&dummy);
225        }
226      while (readlinebuffer_delim (line, in, eolbyte) != NULL && n_lines++);
227
228      if (! n_lines)
229        error (EXIT_FAILURE, EOVERFLOW, _("too many input lines"));
230
231      freebuffer (&dummy);
232    }
233
234  /* no more input lines, or an input error.  */
235  if (ferror (in))
236    error (EXIT_FAILURE, errno, _("read error"));
237
238  *out_rsrv = rsrv;
239  return MIN (k, n_lines);
240}
241
242static int
243write_permuted_output_reservoir (size_t n_lines, struct linebuffer *lines,
244                                 size_t const *permutation)
245{
246  size_t i;
247
248  for (i = 0; i < n_lines; i++)
249    {
250      const struct linebuffer *p = &lines[permutation[i]];
251      if (fwrite (p->buffer, sizeof (char), p->length, stdout) != p->length)
252        return -1;
253    }
254
255  return 0;
256}
257
258/* Read data from file IN.  Input lines are delimited by EOLBYTE;
259   silently append a trailing EOLBYTE if the file ends in some other
260   byte.  Store a pointer to the resulting array of lines into *PLINE.
261   Return the number of lines read.  Report an error and exit on
262   failure.  */
263
264static size_t
265read_input (FILE *in, char eolbyte, char ***pline)
266{
267  char *p;
268  char *buf = NULL;
269  size_t used;
270  char *lim;
271  char **line;
272  size_t i;
273  size_t n_lines;
274
275  /* TODO: We should limit the amount of data read here,
276     to less than RESERVOIR_MIN_INPUT.  I.E. adjust fread_file() to support
277     taking a byte limit.  We'd then need to ensure we handle a line spanning
278     this boundary.  With that in place we could set use_reservoir_sampling
279     when used==RESERVOIR_MIN_INPUT, and have read_input_reservoir_sampling()
280     call a wrapper function to populate a linebuffer from the internal pline
281     or if none left, stdin.  Doing that would give better performance by
282     avoiding the reservoir CPU overhead when reading < RESERVOIR_MIN_INPUT
283     from a pipe, and allow us to dispense with the input_size() function.  */
284  if (!(buf = fread_file (in, &used)))
285    error (EXIT_FAILURE, errno, _("read error"));
286
287  if (used && buf[used - 1] != eolbyte)
288    buf[used++] = eolbyte;
289
290  lim = buf + used;
291
292  n_lines = 0;
293  for (p = buf; p < lim; p = next_line (p, eolbyte, lim - p))
294    n_lines++;
295
296  *pline = line = xnmalloc (n_lines + 1, sizeof *line);
297
298  line[0] = p = buf;
299  for (i = 1; i <= n_lines; i++)
300    line[i] = p = next_line (p, eolbyte, lim - p);
301
302  return n_lines;
303}
304
305/* Output N_LINES lines to stdout from LINE array,
306   chosen by the indices in PERMUTATION.
307   PERMUTATION and LINE must have at least N_LINES elements.
308   Strings in LINE must include the line-terminator character.  */
309static int
310write_permuted_lines (size_t n_lines, char *const *line,
311                      size_t const *permutation)
312{
313  size_t i;
314
315  for (i = 0; i < n_lines; i++)
316    {
317      char *const *p = line + permutation[i];
318      size_t len = p[1] - p[0];
319      if (fwrite (p[0], sizeof *p[0], len, stdout) != len)
320        return -1;
321    }
322
323  return 0;
324}
325
326/* Output N_LINES of numbers to stdout, from PERMUTATION array.
327   PERMUTATION must have at least N_LINES elements.  */
328static int
329write_permuted_numbers (size_t n_lines, size_t lo_input,
330                        size_t const *permutation, char eolbyte)
331{
332  size_t i;
333
334  for (i = 0; i < n_lines; i++)
335    {
336      unsigned long int n = lo_input + permutation[i];
337      if (printf ("%lu%c", n, eolbyte) < 0)
338        return -1;
339    }
340
341  return 0;
342}
343
344/* Output COUNT numbers to stdout, chosen randomly from range
345   LO_INPUT through HI_INPUT.  */
346static int
347write_random_numbers (struct randint_source *s, size_t count,
348                      size_t lo_input, size_t hi_input, char eolbyte)
349{
350  size_t i;
351  const randint range = hi_input - lo_input + 1;
352
353  for (i = 0; i < count; i++)
354    {
355      unsigned long int j = lo_input + randint_choose (s, range);
356      if (printf ("%lu%c", j, eolbyte) < 0)
357        return -1;
358    }
359
360  return 0;
361}
362
363/* Output COUNT lines to stdout from LINES array.
364   LINES must have at least N_LINES elements in it.
365   Strings in LINES_ must include the line-terminator character.  */
366static int
367write_random_lines (struct randint_source *s, size_t count,
368                    char *const *lines, size_t n_lines)
369{
370  size_t i;
371
372  for (i = 0; i < count; i++)
373    {
374      const randint j = randint_choose (s, n_lines);
375      char *const *p = lines + j;
376      size_t len = p[1] - p[0];
377      if (fwrite (p[0], sizeof *p[0], len, stdout) != len)
378        return -1;
379    }
380
381  return 0;
382}
383
384int
385main (int argc, char **argv)
386{
387  bool echo = false;
388  bool input_range = false;
389  size_t lo_input = SIZE_MAX;
390  size_t hi_input = 0;
391  size_t head_lines = SIZE_MAX;
392  char const *outfile = NULL;
393  char *random_source = NULL;
394  char eolbyte = '\n';
395  char **input_lines = NULL;
396  bool use_reservoir_sampling = false;
397  bool repeat = false;
398
399  int optc;
400  int n_operands;
401  char **operand;
402  size_t n_lines;
403  char **line = NULL;
404  struct linebuffer *reservoir = NULL;
405  struct randint_source *randint_source;
406  size_t *permutation = NULL;
407  int i;
408
409  initialize_main (&argc, &argv);
410  set_program_name (argv[0]);
411  setlocale (LC_ALL, "");
412  bindtextdomain (PACKAGE, LOCALEDIR);
413  textdomain (PACKAGE);
414
415  atexit (close_stdout);
416
417  while ((optc = getopt_long (argc, argv, "ei:n:o:rz", long_opts, NULL)) != -1)
418    switch (optc)
419      {
420      case 'e':
421        echo = true;
422        break;
423
424      case 'i':
425        {
426          char *p = strchr (optarg, '-');
427          char const *hi_optarg = optarg;
428          bool invalid = !p;
429
430          if (input_range)
431            error (EXIT_FAILURE, 0, _("multiple -i options specified"));
432          input_range = true;
433
434          if (p)
435            {
436              *p = '\0';
437              lo_input = xdectoumax (optarg, 0, SIZE_MAX, "",
438                                     _("invalid input range"), 0);
439              *p = '-';
440              hi_optarg = p + 1;
441            }
442
443          hi_input = xdectoumax (hi_optarg, 0, SIZE_MAX, "",
444                                 _("invalid input range"), 0);
445
446          n_lines = hi_input - lo_input + 1;
447          invalid |= ((lo_input <= hi_input) == (n_lines == 0));
448          if (invalid)
449            error (EXIT_FAILURE, errno, "%s: %s", _("invalid input range"),
450                   quote (optarg));
451        }
452        break;
453
454      case 'n':
455        {
456          unsigned long int argval;
457          strtol_error e = xstrtoul (optarg, NULL, 10, &argval, NULL);
458
459          if (e == LONGINT_OK)
460            head_lines = MIN (head_lines, argval);
461          else if (e != LONGINT_OVERFLOW)
462            error (EXIT_FAILURE, 0, _("invalid line count: %s"),
463                   quote (optarg));
464        }
465        break;
466
467      case 'o':
468        if (outfile && !STREQ (outfile, optarg))
469          error (EXIT_FAILURE, 0, _("multiple output files specified"));
470        outfile = optarg;
471        break;
472
473      case RANDOM_SOURCE_OPTION:
474        if (random_source && !STREQ (random_source, optarg))
475          error (EXIT_FAILURE, 0, _("multiple random sources specified"));
476        random_source = optarg;
477        break;
478
479      case 'r':
480        repeat = true;
481        break;
482
483      case 'z':
484        eolbyte = '\0';
485        break;
486
487      case_GETOPT_HELP_CHAR;
488      case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
489      default:
490        usage (EXIT_FAILURE);
491      }
492
493  n_operands = argc - optind;
494  operand = argv + optind;
495
496  /* Check invalid usage.  */
497  if (echo && input_range)
498    {
499      error (0, 0, _("cannot combine -e and -i options"));
500      usage (EXIT_FAILURE);
501    }
502  if (input_range ? 0 < n_operands : !echo && 1 < n_operands)
503    {
504      error (0, 0, _("extra operand %s"), quote (operand[1]));
505      usage (EXIT_FAILURE);
506    }
507
508  /* Prepare input.  */
509  if (echo)
510    {
511      input_from_argv (operand, n_operands, eolbyte);
512      n_lines = n_operands;
513      line = operand;
514    }
515  else if (input_range)
516    {
517      n_lines = hi_input - lo_input + 1;
518      line = NULL;
519    }
520  else
521    {
522      /* If an input file is specified, re-open it as stdin.  */
523      if (n_operands == 1)
524        if (! (STREQ (operand[0], "-") || ! head_lines
525               || freopen (operand[0], "r", stdin)))
526          error (EXIT_FAILURE, errno, "%s", operand[0]);
527
528      fadvise (stdin, FADVISE_SEQUENTIAL);
529
530      if (! repeat && head_lines != SIZE_MAX
531          && (! head_lines || input_size () > RESERVOIR_MIN_INPUT))
532        {
533          use_reservoir_sampling = true;
534          n_lines = SIZE_MAX;   /* unknown number of input lines, for now.  */
535        }
536      else
537        {
538          n_lines = read_input (stdin, eolbyte, &input_lines);
539          line = input_lines;
540        }
541    }
542
543  if (! repeat)
544    head_lines = MIN (head_lines, n_lines);
545
546  randint_source = randint_all_new (random_source,
547                                    (use_reservoir_sampling || repeat
548                                     ? SIZE_MAX
549                                     : randperm_bound (head_lines, n_lines)));
550  if (! randint_source)
551    error (EXIT_FAILURE, errno, "%s", quotearg_colon (random_source));
552
553  if (use_reservoir_sampling)
554    {
555      /* Instead of reading the entire file into 'line',
556         use reservoir-sampling to store just "head_lines" random lines.  */
557      n_lines = read_input_reservoir_sampling (stdin, eolbyte, head_lines,
558                                               randint_source, &reservoir);
559      head_lines = n_lines;
560    }
561
562  /* Close stdin now, rather than earlier, so that randint_all_new
563     doesn't have to worry about opening something other than
564     stdin.  */
565  if (! (echo || input_range)
566      && (fclose (stdin) != 0))
567    error (EXIT_FAILURE, errno, _("read error"));
568
569  if (!repeat)
570    permutation = randperm_new (randint_source, head_lines, n_lines);
571
572  if (outfile && ! freopen (outfile, "w", stdout))
573    error (EXIT_FAILURE, errno, "%s", quotearg_colon (outfile));
574
575  /* Generate output according to requested method */
576  if (repeat)
577    {
578      if (head_lines == 0)
579        i = 0;
580      else
581        {
582          if (n_lines == 0)
583            error (EXIT_FAILURE, 0, _("no lines to repeat"));
584          if (input_range)
585            i = write_random_numbers (randint_source, head_lines,
586                                      lo_input, hi_input, eolbyte);
587          else
588            i = write_random_lines (randint_source, head_lines, line, n_lines);
589        }
590    }
591  else
592    {
593      if (use_reservoir_sampling)
594        i = write_permuted_output_reservoir (n_lines, reservoir, permutation);
595      else if (input_range)
596        i = write_permuted_numbers (head_lines, lo_input,
597                                    permutation, eolbyte);
598      else
599        i = write_permuted_lines (head_lines, line, permutation);
600    }
601
602  if (i != 0)
603    error (EXIT_FAILURE, errno, _("write error"));
604
605#ifdef lint
606  free (permutation);
607  randint_all_free (randint_source);
608  if (input_lines)
609    {
610      free (input_lines[0]);
611      free (input_lines);
612    }
613  if (reservoir)
614    {
615      size_t j;
616      for (j = 0; j < n_lines; ++j)
617        freebuffer (&reservoir[j]);
618      free (reservoir);
619    }
620#endif
621
622  return EXIT_SUCCESS;
623}
624