1/* Shuffle lines of text.
2
3   Copyright (C) 2006-2015 Free Software Foundation, Inc.
4
5   This program is free software: you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation, either version 3 of the License, or
8   (at your option) any later version.
9
10   This program is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14
15   You should have received a copy of the GNU General Public License
16   along with this program.  If not, see <http://www.gnu.org/licenses/>.
17
18   Written by Paul Eggert.  */
19
20#include <config.h>
21
22#include <sys/types.h>
23#include "system.h"
24
25#include "error.h"
26#include "fadvise.h"
27#include "getopt.h"
28#include "linebuffer.h"
29#include "quote.h"
30#include "quotearg.h"
31#include "randint.h"
32#include "randperm.h"
33#include "read-file.h"
34#include "stdio--.h"
35#include "xdectoint.h"
36#include "xstrtol.h"
37
38/* The official name of this program (e.g., no 'g' prefix).  */
39#define PROGRAM_NAME "shuf"
40
41#define AUTHORS proper_name ("Paul Eggert")
42
43/* For reservoir-sampling, allocate the reservoir lines in batches.  */
44enum { RESERVOIR_LINES_INCREMENT = 1024 };
45
46/* reservoir-sampling introduces CPU overhead for small inputs.
47   So only enable it for inputs >= this limit.
48   This limit was determined using these commands:
49     $ for p in $(seq 7); do src/seq $((10**$p)) > 10p$p.in; done
50     $ for p in $(seq 7); do time shuf-nores -n10 10p$p.in >/dev/null; done
51     $ for p in $(seq 7); do time shuf -n10 10p$p.in >/dev/null; done  .*/
52enum { RESERVOIR_MIN_INPUT = 8192 * 1024 };
53
54
55void
56usage (int status)
57{
58  if (status != EXIT_SUCCESS)
59    emit_try_help ();
60  else
61    {
62      printf (_("\
63Usage: %s [OPTION]... [FILE]\n\
64  or:  %s -e [OPTION]... [ARG]...\n\
65  or:  %s -i LO-HI [OPTION]...\n\
66"),
67              program_name, program_name, program_name);
68      fputs (_("\
69Write a random permutation of the input lines to standard output.\n\
70"), stdout);
71
72      emit_stdin_note ();
73      emit_mandatory_arg_note ();
74
75      fputs (_("\
76  -e, --echo                treat each ARG as an input line\n\
77  -i, --input-range=LO-HI   treat each number LO through HI as an input line\n\
78  -n, --head-count=COUNT    output at most COUNT lines\n\
79  -o, --output=FILE         write result to FILE instead of standard output\n\
80      --random-source=FILE  get random bytes from FILE\n\
81  -r, --repeat              output lines can be repeated\n\
82"), stdout);
83      fputs (_("\
84  -z, --zero-terminated     line delimiter is NUL, not newline\n\
85"), stdout);
86      fputs (HELP_OPTION_DESCRIPTION, stdout);
87      fputs (VERSION_OPTION_DESCRIPTION, stdout);
88      emit_ancillary_info (PROGRAM_NAME);
89    }
90
91  exit (status);
92}
93
94/* For long options that have no equivalent short option, use a
95   non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
96enum
97{
98  RANDOM_SOURCE_OPTION = CHAR_MAX + 1
99};
100
101static struct option const long_opts[] =
102{
103  {"echo", no_argument, NULL, 'e'},
104  {"input-range", required_argument, NULL, 'i'},
105  {"head-count", required_argument, NULL, 'n'},
106  {"output", required_argument, NULL, 'o'},
107  {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
108  {"repeat", no_argument, NULL, 'r'},
109  {"zero-terminated", no_argument, NULL, 'z'},
110  {GETOPT_HELP_OPTION_DECL},
111  {GETOPT_VERSION_OPTION_DECL},
112  {0, 0, 0, 0},
113};
114
115static void
116input_from_argv (char **operand, int n_operands, char eolbyte)
117{
118  char *p;
119  size_t size = n_operands;
120  int i;
121
122  for (i = 0; i < n_operands; i++)
123    size += strlen (operand[i]);
124  p = xmalloc (size);
125
126  for (i = 0; i < n_operands; i++)
127    {
128      char *p1 = stpcpy (p, operand[i]);
129      operand[i] = p;
130      p = p1;
131      *p++ = eolbyte;
132    }
133
134  operand[n_operands] = p;
135}
136
137/* Return the start of the next line after LINE.  The current line
138   ends in EOLBYTE, and is guaranteed to end before LINE + N.  */
139
140static char *
141next_line (char *line, char eolbyte, size_t n)
142{
143  char *p = memchr (line, eolbyte, n);
144  return p + 1;
145}
146
147/* Return the size of the input if possible or OFF_T_MAX if not.  */
148
149static off_t
150input_size (void)
151{
152  off_t file_size;
153
154  struct stat stat_buf;
155  if (fstat (STDIN_FILENO, &stat_buf) != 0)
156    return OFF_T_MAX;
157  if (usable_st_size (&stat_buf))
158    file_size = stat_buf.st_size;
159  else
160    return OFF_T_MAX;
161
162  off_t input_offset = lseek (STDIN_FILENO, 0, SEEK_CUR);
163  if (input_offset < 0)
164    return OFF_T_MAX;
165
166  file_size -= input_offset;
167
168  return file_size;
169}
170
171/* Read all lines and store up to K permuted lines in *OUT_RSRV.
172   Return the number of lines read, up to a maximum of K.  */
173
174static size_t
175read_input_reservoir_sampling (FILE *in, char eolbyte, size_t k,
176                               struct randint_source *s,
177                               struct linebuffer **out_rsrv)
178{
179  randint n_lines = 0;
180  size_t n_alloc_lines = MIN (k, RESERVOIR_LINES_INCREMENT);
181  struct linebuffer *line = NULL;
182  struct linebuffer *rsrv;
183
184  rsrv = xcalloc (n_alloc_lines, sizeof (struct linebuffer));
185
186  /* Fill the first K lines, directly into the reservoir.  */
187  while (n_lines < k
188         && (line =
189             readlinebuffer_delim (&rsrv[n_lines], in, eolbyte)) != NULL)
190    {
191      n_lines++;
192
193      /* Enlarge reservoir.  */
194      if (n_lines >= n_alloc_lines)
195        {
196          n_alloc_lines += RESERVOIR_LINES_INCREMENT;
197          rsrv = xnrealloc (rsrv, n_alloc_lines, sizeof (struct linebuffer));
198          memset (&rsrv[n_lines], 0,
199                  RESERVOIR_LINES_INCREMENT * sizeof (struct linebuffer));
200        }
201    }
202
203  /* last line wasn't NULL - so there may be more lines to read.  */
204  if (line != NULL)
205    {
206      struct linebuffer dummy;
207      initbuffer (&dummy);  /* space for lines not put in reservoir.  */
208
209      /* Choose the fate of the next line, with decreasing probability (as
210         n_lines increases in size).
211
212         If the line will be used, store it directly in the reservoir.
213         Otherwise, store it in dummy space.
214
215         With 'struct linebuffer', storing into existing buffer will reduce
216         re-allocations (will only re-allocate if the new line is longer than
217         the currently allocated space).  */
218      do
219        {
220          randint j = randint_choose (s, n_lines + 1);  /* 0 .. n_lines.  */
221          line = (j < k) ? (&rsrv[j]) : (&dummy);
222        }
223      while (readlinebuffer_delim (line, in, eolbyte) != NULL && n_lines++);
224
225      if (! n_lines)
226        error (EXIT_FAILURE, EOVERFLOW, _("too many input lines"));
227
228      freebuffer (&dummy);
229    }
230
231  /* no more input lines, or an input error.  */
232  if (ferror (in))
233    error (EXIT_FAILURE, errno, _("read error"));
234
235  *out_rsrv = rsrv;
236  return MIN (k, n_lines);
237}
238
239static int
240write_permuted_output_reservoir (size_t n_lines, struct linebuffer *lines,
241                                 size_t const *permutation)
242{
243  size_t i;
244
245  for (i = 0; i < n_lines; i++)
246    {
247      const struct linebuffer *p = &lines[permutation[i]];
248      if (fwrite (p->buffer, sizeof (char), p->length, stdout) != p->length)
249        return -1;
250    }
251
252  return 0;
253}
254
255/* Read data from file IN.  Input lines are delimited by EOLBYTE;
256   silently append a trailing EOLBYTE if the file ends in some other
257   byte.  Store a pointer to the resulting array of lines into *PLINE.
258   Return the number of lines read.  Report an error and exit on
259   failure.  */
260
261static size_t
262read_input (FILE *in, char eolbyte, char ***pline)
263{
264  char *p;
265  char *buf = NULL;
266  size_t used;
267  char *lim;
268  char **line;
269  size_t i;
270  size_t n_lines;
271
272  /* TODO: We should limit the amount of data read here,
273     to less than RESERVOIR_MIN_INPUT.  I.e., adjust fread_file() to support
274     taking a byte limit.  We'd then need to ensure we handle a line spanning
275     this boundary.  With that in place we could set use_reservoir_sampling
276     when used==RESERVOIR_MIN_INPUT, and have read_input_reservoir_sampling()
277     call a wrapper function to populate a linebuffer from the internal pline
278     or if none left, stdin.  Doing that would give better performance by
279     avoiding the reservoir CPU overhead when reading < RESERVOIR_MIN_INPUT
280     from a pipe, and allow us to dispense with the input_size() function.  */
281  if (!(buf = fread_file (in, &used)))
282    error (EXIT_FAILURE, errno, _("read error"));
283
284  if (used && buf[used - 1] != eolbyte)
285    buf[used++] = eolbyte;
286
287  lim = buf + used;
288
289  n_lines = 0;
290  for (p = buf; p < lim; p = next_line (p, eolbyte, lim - p))
291    n_lines++;
292
293  *pline = line = xnmalloc (n_lines + 1, sizeof *line);
294
295  line[0] = p = buf;
296  for (i = 1; i <= n_lines; i++)
297    line[i] = p = next_line (p, eolbyte, lim - p);
298
299  return n_lines;
300}
301
302/* Output N_LINES lines to stdout from LINE array,
303   chosen by the indices in PERMUTATION.
304   PERMUTATION and LINE must have at least N_LINES elements.
305   Strings in LINE must include the line-terminator character.  */
306static int
307write_permuted_lines (size_t n_lines, char *const *line,
308                      size_t const *permutation)
309{
310  size_t i;
311
312  for (i = 0; i < n_lines; i++)
313    {
314      char *const *p = line + permutation[i];
315      size_t len = p[1] - p[0];
316      if (fwrite (p[0], sizeof *p[0], len, stdout) != len)
317        return -1;
318    }
319
320  return 0;
321}
322
323/* Output N_LINES of numbers to stdout, from PERMUTATION array.
324   PERMUTATION must have at least N_LINES elements.  */
325static int
326write_permuted_numbers (size_t n_lines, size_t lo_input,
327                        size_t const *permutation, char eolbyte)
328{
329  size_t i;
330
331  for (i = 0; i < n_lines; i++)
332    {
333      unsigned long int n = lo_input + permutation[i];
334      if (printf ("%lu%c", n, eolbyte) < 0)
335        return -1;
336    }
337
338  return 0;
339}
340
341/* Output COUNT numbers to stdout, chosen randomly from range
342   LO_INPUT through HI_INPUT.  */
343static int
344write_random_numbers (struct randint_source *s, size_t count,
345                      size_t lo_input, size_t hi_input, char eolbyte)
346{
347  size_t i;
348  const randint range = hi_input - lo_input + 1;
349
350  for (i = 0; i < count; i++)
351    {
352      unsigned long int j = lo_input + randint_choose (s, range);
353      if (printf ("%lu%c", j, eolbyte) < 0)
354        return -1;
355    }
356
357  return 0;
358}
359
360/* Output COUNT lines to stdout from LINES array.
361   LINES must have at least N_LINES elements in it.
362   Strings in LINES_ must include the line-terminator character.  */
363static int
364write_random_lines (struct randint_source *s, size_t count,
365                    char *const *lines, size_t n_lines)
366{
367  size_t i;
368
369  for (i = 0; i < count; i++)
370    {
371      const randint j = randint_choose (s, n_lines);
372      char *const *p = lines + j;
373      size_t len = p[1] - p[0];
374      if (fwrite (p[0], sizeof *p[0], len, stdout) != len)
375        return -1;
376    }
377
378  return 0;
379}
380
381int
382main (int argc, char **argv)
383{
384  bool echo = false;
385  bool input_range = false;
386  size_t lo_input = SIZE_MAX;
387  size_t hi_input = 0;
388  size_t head_lines = SIZE_MAX;
389  char const *outfile = NULL;
390  char *random_source = NULL;
391  char eolbyte = '\n';
392  char **input_lines = NULL;
393  bool use_reservoir_sampling = false;
394  bool repeat = false;
395
396  int optc;
397  int n_operands;
398  char **operand;
399  size_t n_lines;
400  char **line = NULL;
401  struct linebuffer *reservoir = NULL;
402  struct randint_source *randint_source;
403  size_t *permutation = NULL;
404  int i;
405
406  initialize_main (&argc, &argv);
407  set_program_name (argv[0]);
408  setlocale (LC_ALL, "");
409  bindtextdomain (PACKAGE, LOCALEDIR);
410  textdomain (PACKAGE);
411
412  atexit (close_stdout);
413
414  while ((optc = getopt_long (argc, argv, "ei:n:o:rz", long_opts, NULL)) != -1)
415    switch (optc)
416      {
417      case 'e':
418        echo = true;
419        break;
420
421      case 'i':
422        {
423          char *p = strchr (optarg, '-');
424          char const *hi_optarg = optarg;
425          bool invalid = !p;
426
427          if (input_range)
428            error (EXIT_FAILURE, 0, _("multiple -i options specified"));
429          input_range = true;
430
431          if (p)
432            {
433              *p = '\0';
434              lo_input = xdectoumax (optarg, 0, SIZE_MAX, "",
435                                     _("invalid input range"), 0);
436              *p = '-';
437              hi_optarg = p + 1;
438            }
439
440          hi_input = xdectoumax (hi_optarg, 0, SIZE_MAX, "",
441                                 _("invalid input range"), 0);
442
443          n_lines = hi_input - lo_input + 1;
444          invalid |= ((lo_input <= hi_input) == (n_lines == 0));
445          if (invalid)
446            error (EXIT_FAILURE, errno, "%s: %s", _("invalid input range"),
447                   quote (optarg));
448        }
449        break;
450
451      case 'n':
452        {
453          unsigned long int argval;
454          strtol_error e = xstrtoul (optarg, NULL, 10, &argval, NULL);
455
456          if (e == LONGINT_OK)
457            head_lines = MIN (head_lines, argval);
458          else if (e != LONGINT_OVERFLOW)
459            error (EXIT_FAILURE, 0, _("invalid line count: %s"),
460                   quote (optarg));
461        }
462        break;
463
464      case 'o':
465        if (outfile && !STREQ (outfile, optarg))
466          error (EXIT_FAILURE, 0, _("multiple output files specified"));
467        outfile = optarg;
468        break;
469
470      case RANDOM_SOURCE_OPTION:
471        if (random_source && !STREQ (random_source, optarg))
472          error (EXIT_FAILURE, 0, _("multiple random sources specified"));
473        random_source = optarg;
474        break;
475
476      case 'r':
477        repeat = true;
478        break;
479
480      case 'z':
481        eolbyte = '\0';
482        break;
483
484      case_GETOPT_HELP_CHAR;
485      case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
486      default:
487        usage (EXIT_FAILURE);
488      }
489
490  n_operands = argc - optind;
491  operand = argv + optind;
492
493  /* Check invalid usage.  */
494  if (echo && input_range)
495    {
496      error (0, 0, _("cannot combine -e and -i options"));
497      usage (EXIT_FAILURE);
498    }
499  if (input_range ? 0 < n_operands : !echo && 1 < n_operands)
500    {
501      error (0, 0, _("extra operand %s"), quote (operand[!input_range]));
502      usage (EXIT_FAILURE);
503    }
504
505  /* Prepare input.  */
506  if (echo)
507    {
508      input_from_argv (operand, n_operands, eolbyte);
509      n_lines = n_operands;
510      line = operand;
511    }
512  else if (input_range)
513    {
514      n_lines = hi_input - lo_input + 1;
515      line = NULL;
516    }
517  else
518    {
519      /* If an input file is specified, re-open it as stdin.  */
520      if (n_operands == 1)
521        if (! (STREQ (operand[0], "-") || ! head_lines
522               || freopen (operand[0], "r", stdin)))
523          error (EXIT_FAILURE, errno, "%s", operand[0]);
524
525      fadvise (stdin, FADVISE_SEQUENTIAL);
526
527      if (! repeat && head_lines != SIZE_MAX
528          && (! head_lines || input_size () > RESERVOIR_MIN_INPUT))
529        {
530          use_reservoir_sampling = true;
531          n_lines = SIZE_MAX;   /* unknown number of input lines, for now.  */
532        }
533      else
534        {
535          n_lines = read_input (stdin, eolbyte, &input_lines);
536          line = input_lines;
537        }
538    }
539
540  if (! repeat)
541    head_lines = MIN (head_lines, n_lines);
542
543  randint_source = randint_all_new (random_source,
544                                    (use_reservoir_sampling || repeat
545                                     ? SIZE_MAX
546                                     : randperm_bound (head_lines, n_lines)));
547  if (! randint_source)
548    error (EXIT_FAILURE, errno, "%s", quotearg_colon (random_source));
549
550  if (use_reservoir_sampling)
551    {
552      /* Instead of reading the entire file into 'line',
553         use reservoir-sampling to store just "head_lines" random lines.  */
554      n_lines = read_input_reservoir_sampling (stdin, eolbyte, head_lines,
555                                               randint_source, &reservoir);
556      head_lines = n_lines;
557    }
558
559  /* Close stdin now, rather than earlier, so that randint_all_new
560     doesn't have to worry about opening something other than
561     stdin.  */
562  if (! (echo || input_range)
563      && (fclose (stdin) != 0))
564    error (EXIT_FAILURE, errno, _("read error"));
565
566  if (!repeat)
567    permutation = randperm_new (randint_source, head_lines, n_lines);
568
569  if (outfile && ! freopen (outfile, "w", stdout))
570    error (EXIT_FAILURE, errno, "%s", quotearg_colon (outfile));
571
572  /* Generate output according to requested method */
573  if (repeat)
574    {
575      if (head_lines == 0)
576        i = 0;
577      else
578        {
579          if (n_lines == 0)
580            error (EXIT_FAILURE, 0, _("no lines to repeat"));
581          if (input_range)
582            i = write_random_numbers (randint_source, head_lines,
583                                      lo_input, hi_input, eolbyte);
584          else
585            i = write_random_lines (randint_source, head_lines, line, n_lines);
586        }
587    }
588  else
589    {
590      if (use_reservoir_sampling)
591        i = write_permuted_output_reservoir (n_lines, reservoir, permutation);
592      else if (input_range)
593        i = write_permuted_numbers (head_lines, lo_input,
594                                    permutation, eolbyte);
595      else
596        i = write_permuted_lines (head_lines, line, permutation);
597    }
598
599  if (i != 0)
600    error (EXIT_FAILURE, errno, _("write error"));
601
602#ifdef lint
603  free (permutation);
604  randint_all_free (randint_source);
605  if (input_lines)
606    {
607      free (input_lines[0]);
608      free (input_lines);
609    }
610  if (reservoir)
611    {
612      size_t j;
613      for (j = 0; j < n_lines; ++j)
614        freebuffer (&reservoir[j]);
615      free (reservoir);
616    }
617#endif
618
619  return EXIT_SUCCESS;
620}
621