1/* Shuffle lines of text.
2
3   Copyright (C) 2006-2014 Free Software Foundation, Inc.
4
5   This program is free software: you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation, either version 3 of the License, or
8   (at your option) any later version.
9
10   This program is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14
15   You should have received a copy of the GNU General Public License
16   along with this program.  If not, see <http://www.gnu.org/licenses/>.
17
18   Written by Paul Eggert.  */
19
20#include <config.h>
21
22#include <sys/types.h>
23#include "system.h"
24
25#include "error.h"
26#include "fadvise.h"
27#include "getopt.h"
28#include "linebuffer.h"
29#include "quote.h"
30#include "quotearg.h"
31#include "randint.h"
32#include "randperm.h"
33#include "read-file.h"
34#include "stdio--.h"
35#include "xstrtol.h"
36
37/* The official name of this program (e.g., no 'g' prefix).  */
38#define PROGRAM_NAME "shuf"
39
40#define AUTHORS proper_name ("Paul Eggert")
41
42/* For reservoir-sampling, allocate the reservoir lines in batches.  */
43enum { RESERVOIR_LINES_INCREMENT = 1024 };
44
45/* reservoir-sampling introduces CPU overhead for small inputs.
46   So only enable it for inputs >= this limit.
47   This limit was determined using these commands:
48     $ for p in $(seq 7); do src/seq $((10**$p)) > 10p$p.in; done
49     $ for p in $(seq 7); do time shuf-nores -n10 10p$p.in >/dev/null; done
50     $ for p in $(seq 7); do time shuf -n10 10p$p.in >/dev/null; done  .*/
51enum { RESERVOIR_MIN_INPUT = 8192 * 1024 };
52
53
54void
55usage (int status)
56{
57  if (status != EXIT_SUCCESS)
58    emit_try_help ();
59  else
60    {
61      printf (_("\
62Usage: %s [OPTION]... [FILE]\n\
63  or:  %s -e [OPTION]... [ARG]...\n\
64  or:  %s -i LO-HI [OPTION]...\n\
65"),
66              program_name, program_name, program_name);
67      fputs (_("\
68Write a random permutation of the input lines to standard output.\n\
69"), stdout);
70
71      emit_mandatory_arg_note ();
72
73      fputs (_("\
74  -e, --echo                treat each ARG as an input line\n\
75  -i, --input-range=LO-HI   treat each number LO through HI as an input line\n\
76  -n, --head-count=COUNT    output at most COUNT lines\n\
77  -o, --output=FILE         write result to FILE instead of standard output\n\
78      --random-source=FILE  get random bytes from FILE\n\
79  -r, --repeat              output lines can be repeated\n\
80"), stdout);
81      fputs (_("\
82  -z, --zero-terminated     line delimiter is NUL, not newline\n\
83"), stdout);
84      fputs (HELP_OPTION_DESCRIPTION, stdout);
85      fputs (VERSION_OPTION_DESCRIPTION, stdout);
86      fputs (_("\
87\n\
88With no FILE, or when FILE is -, read standard input.\n\
89"), stdout);
90      emit_ancillary_info (PROGRAM_NAME);
91    }
92
93  exit (status);
94}
95
96/* For long options that have no equivalent short option, use a
97   non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
98enum
99{
100  RANDOM_SOURCE_OPTION = CHAR_MAX + 1
101};
102
103static struct option const long_opts[] =
104{
105  {"echo", no_argument, NULL, 'e'},
106  {"input-range", required_argument, NULL, 'i'},
107  {"head-count", required_argument, NULL, 'n'},
108  {"output", required_argument, NULL, 'o'},
109  {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
110  {"repeat", no_argument, NULL, 'r'},
111  {"zero-terminated", no_argument, NULL, 'z'},
112  {GETOPT_HELP_OPTION_DECL},
113  {GETOPT_VERSION_OPTION_DECL},
114  {0, 0, 0, 0},
115};
116
117static void
118input_from_argv (char **operand, int n_operands, char eolbyte)
119{
120  char *p;
121  size_t size = n_operands;
122  int i;
123
124  for (i = 0; i < n_operands; i++)
125    size += strlen (operand[i]);
126  p = xmalloc (size);
127
128  for (i = 0; i < n_operands; i++)
129    {
130      char *p1 = stpcpy (p, operand[i]);
131      operand[i] = p;
132      p = p1;
133      *p++ = eolbyte;
134    }
135
136  operand[n_operands] = p;
137}
138
139/* Return the start of the next line after LINE.  The current line
140   ends in EOLBYTE, and is guaranteed to end before LINE + N.  */
141
142static char *
143next_line (char *line, char eolbyte, size_t n)
144{
145  char *p = memchr (line, eolbyte, n);
146  return p + 1;
147}
148
149/* Return the size of the input if possible or OFF_T_MAX if not.  */
150
151static off_t
152input_size (void)
153{
154  off_t file_size;
155
156  struct stat stat_buf;
157  if (fstat (STDIN_FILENO, &stat_buf) != 0)
158    return OFF_T_MAX;
159  if (usable_st_size (&stat_buf))
160    file_size = stat_buf.st_size;
161  else
162    return OFF_T_MAX;
163
164  off_t input_offset = lseek (STDIN_FILENO, 0, SEEK_CUR);
165  if (input_offset < 0)
166    return OFF_T_MAX;
167
168  file_size -= input_offset;
169
170  return file_size;
171}
172
173/* Read all lines and store up to K permuted lines in *OUT_RSRV.
174   Return the number of lines read, up to a maximum of K.  */
175
176static size_t
177read_input_reservoir_sampling (FILE *in, char eolbyte, size_t k,
178                               struct randint_source *s,
179                               struct linebuffer **out_rsrv)
180{
181  randint n_lines = 0;
182  size_t n_alloc_lines = MIN (k, RESERVOIR_LINES_INCREMENT);
183  struct linebuffer *line = NULL;
184  struct linebuffer *rsrv;
185
186  rsrv = xcalloc (n_alloc_lines, sizeof (struct linebuffer));
187
188  /* Fill the first K lines, directly into the reservoir.  */
189  while (n_lines < k
190         && (line =
191             readlinebuffer_delim (&rsrv[n_lines], in, eolbyte)) != NULL)
192    {
193      n_lines++;
194
195      /* Enlarge reservoir.  */
196      if (n_lines >= n_alloc_lines)
197        {
198          n_alloc_lines += RESERVOIR_LINES_INCREMENT;
199          rsrv = xnrealloc (rsrv, n_alloc_lines, sizeof (struct linebuffer));
200          memset (&rsrv[n_lines], 0,
201                  RESERVOIR_LINES_INCREMENT * sizeof (struct linebuffer));
202        }
203    }
204
205  /* last line wasn't NULL - so there may be more lines to read.  */
206  if (line != NULL)
207    {
208      struct linebuffer dummy;
209      initbuffer (&dummy);  /* space for lines not put in reservoir.  */
210
211      /* Choose the fate of the next line, with decreasing probability (as
212         n_lines increases in size).
213
214         If the line will be used, store it directly in the reservoir.
215         Otherwise, store it in dummy space.
216
217         With 'struct linebuffer', storing into existing buffer will reduce
218         re-allocations (will only re-allocate if the new line is longer than
219         the currently allocated space).  */
220      do
221        {
222          randint j = randint_choose (s, n_lines + 1);  /* 0 .. n_lines.  */
223          line = (j < k) ? (&rsrv[j]) : (&dummy);
224        }
225      while (readlinebuffer_delim (line, in, eolbyte) != NULL && n_lines++);
226
227      if (! n_lines)
228        error (EXIT_FAILURE, EOVERFLOW, _("too many input lines"));
229
230      freebuffer (&dummy);
231    }
232
233  /* no more input lines, or an input error.  */
234  if (ferror (in))
235    error (EXIT_FAILURE, errno, _("read error"));
236
237  *out_rsrv = rsrv;
238  return MIN (k, n_lines);
239}
240
241static int
242write_permuted_output_reservoir (size_t n_lines, struct linebuffer *lines,
243                                 size_t const *permutation)
244{
245  size_t i;
246
247  for (i = 0; i < n_lines; i++)
248    {
249      const struct linebuffer *p = &lines[permutation[i]];
250      if (fwrite (p->buffer, sizeof (char), p->length, stdout) != p->length)
251        return -1;
252    }
253
254  return 0;
255}
256
257/* Read data from file IN.  Input lines are delimited by EOLBYTE;
258   silently append a trailing EOLBYTE if the file ends in some other
259   byte.  Store a pointer to the resulting array of lines into *PLINE.
260   Return the number of lines read.  Report an error and exit on
261   failure.  */
262
263static size_t
264read_input (FILE *in, char eolbyte, char ***pline)
265{
266  char *p;
267  char *buf = NULL;
268  size_t used;
269  char *lim;
270  char **line;
271  size_t i;
272  size_t n_lines;
273
274  /* TODO: We should limit the amount of data read here,
275     to less than RESERVOIR_MIN_INPUT.  I.E. adjust fread_file() to support
276     taking a byte limit.  We'd then need to ensure we handle a line spanning
277     this boundary.  With that in place we could set use_reservoir_sampling
278     when used==RESERVOIR_MIN_INPUT, and have read_input_reservoir_sampling()
279     call a wrapper function to populate a linebuffer from the internal pline
280     or if none left, stdin.  Doing that would give better performance by
281     avoiding the reservoir CPU overhead when reading < RESERVOIR_MIN_INPUT
282     from a pipe, and allow us to dispense with the input_size() function.  */
283  if (!(buf = fread_file (in, &used)))
284    error (EXIT_FAILURE, errno, _("read error"));
285
286  if (used && buf[used - 1] != eolbyte)
287    buf[used++] = eolbyte;
288
289  lim = buf + used;
290
291  n_lines = 0;
292  for (p = buf; p < lim; p = next_line (p, eolbyte, lim - p))
293    n_lines++;
294
295  *pline = line = xnmalloc (n_lines + 1, sizeof *line);
296
297  line[0] = p = buf;
298  for (i = 1; i <= n_lines; i++)
299    line[i] = p = next_line (p, eolbyte, lim - p);
300
301  return n_lines;
302}
303
304/* Output N_LINES lines to stdout from LINE array,
305   chosen by the indices in PERMUTATION.
306   PERMUTATION and LINE must have at least N_LINES elements.
307   Strings in LINE must include the line-terminator character.  */
308static int
309write_permuted_lines (size_t n_lines, char *const *line,
310                      size_t const *permutation)
311{
312  size_t i;
313
314  for (i = 0; i < n_lines; i++)
315    {
316      char *const *p = line + permutation[i];
317      size_t len = p[1] - p[0];
318      if (fwrite (p[0], sizeof *p[0], len, stdout) != len)
319        return -1;
320    }
321
322  return 0;
323}
324
325/* Output N_LINES of numbers to stdout, from PERMUTATION array.
326   PERMUTATION must have at least N_LINES elements.  */
327static int
328write_permuted_numbers (size_t n_lines, size_t lo_input,
329                        size_t const *permutation, char eolbyte)
330{
331  size_t i;
332
333  for (i = 0; i < n_lines; i++)
334    {
335      unsigned long int n = lo_input + permutation[i];
336      if (printf ("%lu%c", n, eolbyte) < 0)
337        return -1;
338    }
339
340  return 0;
341}
342
343/* Output COUNT numbers to stdout, chosen randomly from range
344   LO_INPUT through HI_INPUT.  */
345static int
346write_random_numbers (struct randint_source *s, size_t count,
347                      size_t lo_input, size_t hi_input, char eolbyte)
348{
349  size_t i;
350  const randint range = hi_input - lo_input + 1;
351
352  for (i = 0; i < count; i++)
353    {
354      unsigned long int j = lo_input + randint_choose (s, range);
355      if (printf ("%lu%c", j, eolbyte) < 0)
356        return -1;
357    }
358
359  return 0;
360}
361
362/* Output COUNT lines to stdout from LINES array.
363   LINES must have at least N_LINES elements in it.
364   Strings in LINES_ must include the line-terminator character.  */
365static int
366write_random_lines (struct randint_source *s, size_t count,
367                    char *const *lines, size_t n_lines)
368{
369  size_t i;
370
371  for (i = 0; i < count; i++)
372    {
373      const randint j = randint_choose (s, n_lines);
374      char *const *p = lines + j;
375      size_t len = p[1] - p[0];
376      if (fwrite (p[0], sizeof *p[0], len, stdout) != len)
377        return -1;
378    }
379
380  return 0;
381}
382
383int
384main (int argc, char **argv)
385{
386  bool echo = false;
387  bool input_range = false;
388  size_t lo_input = SIZE_MAX;
389  size_t hi_input = 0;
390  size_t head_lines = SIZE_MAX;
391  char const *outfile = NULL;
392  char *random_source = NULL;
393  char eolbyte = '\n';
394  char **input_lines = NULL;
395  bool use_reservoir_sampling = false;
396  bool repeat = false;
397
398  int optc;
399  int n_operands;
400  char **operand;
401  size_t n_lines;
402  char **line = NULL;
403  struct linebuffer *reservoir = NULL;
404  struct randint_source *randint_source;
405  size_t *permutation = NULL;
406  int i;
407
408  initialize_main (&argc, &argv);
409  set_program_name (argv[0]);
410  setlocale (LC_ALL, "");
411  bindtextdomain (PACKAGE, LOCALEDIR);
412  textdomain (PACKAGE);
413
414  atexit (close_stdout);
415
416  while ((optc = getopt_long (argc, argv, "ei:n:o:rz", long_opts, NULL)) != -1)
417    switch (optc)
418      {
419      case 'e':
420        echo = true;
421        break;
422
423      case 'i':
424        {
425          unsigned long int argval = 0;
426          char *p = strchr (optarg, '-');
427          char const *hi_optarg = optarg;
428          bool invalid = !p;
429
430          if (input_range)
431            error (EXIT_FAILURE, 0, _("multiple -i options specified"));
432          input_range = true;
433
434          if (p)
435            {
436              *p = '\0';
437              invalid = ((xstrtoul (optarg, NULL, 10, &argval, NULL)
438                          != LONGINT_OK)
439                         || SIZE_MAX < argval);
440              *p = '-';
441              lo_input = argval;
442              hi_optarg = p + 1;
443            }
444
445          invalid |= ((xstrtoul (hi_optarg, NULL, 10, &argval, NULL)
446                       != LONGINT_OK)
447                      || SIZE_MAX < argval);
448          hi_input = argval;
449          n_lines = hi_input - lo_input + 1;
450          invalid |= ((lo_input <= hi_input) == (n_lines == 0));
451          if (invalid)
452            error (EXIT_FAILURE, 0, _("invalid input range %s"),
453                   quote (optarg));
454        }
455        break;
456
457      case 'n':
458        {
459          unsigned long int argval;
460          strtol_error e = xstrtoul (optarg, NULL, 10, &argval, NULL);
461
462          if (e == LONGINT_OK)
463            head_lines = MIN (head_lines, argval);
464          else if (e != LONGINT_OVERFLOW)
465            error (EXIT_FAILURE, 0, _("invalid line count %s"),
466                   quote (optarg));
467        }
468        break;
469
470      case 'o':
471        if (outfile && !STREQ (outfile, optarg))
472          error (EXIT_FAILURE, 0, _("multiple output files specified"));
473        outfile = optarg;
474        break;
475
476      case RANDOM_SOURCE_OPTION:
477        if (random_source && !STREQ (random_source, optarg))
478          error (EXIT_FAILURE, 0, _("multiple random sources specified"));
479        random_source = optarg;
480        break;
481
482      case 'r':
483        repeat = true;
484        break;
485
486      case 'z':
487        eolbyte = '\0';
488        break;
489
490      case_GETOPT_HELP_CHAR;
491      case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
492      default:
493        usage (EXIT_FAILURE);
494      }
495
496  n_operands = argc - optind;
497  operand = argv + optind;
498
499  /* Check invalid usage.  */
500  if (echo && input_range)
501    {
502      error (0, 0, _("cannot combine -e and -i options"));
503      usage (EXIT_FAILURE);
504    }
505  if (input_range ? 0 < n_operands : !echo && 1 < n_operands)
506    {
507      error (0, 0, _("extra operand %s"), quote (operand[1]));
508      usage (EXIT_FAILURE);
509    }
510
511  /* Prepare input.  */
512  if (echo)
513    {
514      input_from_argv (operand, n_operands, eolbyte);
515      n_lines = n_operands;
516      line = operand;
517    }
518  else if (input_range)
519    {
520      n_lines = hi_input - lo_input + 1;
521      line = NULL;
522    }
523  else
524    {
525      /* If an input file is specified, re-open it as stdin.  */
526      if (n_operands == 1)
527        if (! (STREQ (operand[0], "-") || ! head_lines
528               || freopen (operand[0], "r", stdin)))
529          error (EXIT_FAILURE, errno, "%s", operand[0]);
530
531      fadvise (stdin, FADVISE_SEQUENTIAL);
532
533      if (! repeat && head_lines != SIZE_MAX
534          && (! head_lines || input_size () > RESERVOIR_MIN_INPUT))
535        {
536          use_reservoir_sampling = true;
537          n_lines = SIZE_MAX;   /* unknown number of input lines, for now.  */
538        }
539      else
540        {
541          n_lines = read_input (stdin, eolbyte, &input_lines);
542          line = input_lines;
543        }
544    }
545
546  if (! repeat)
547    head_lines = MIN (head_lines, n_lines);
548
549  randint_source = randint_all_new (random_source,
550                                    (use_reservoir_sampling || repeat
551                                     ? SIZE_MAX
552                                     : randperm_bound (head_lines, n_lines)));
553  if (! randint_source)
554    error (EXIT_FAILURE, errno, "%s", quotearg_colon (random_source));
555
556  if (use_reservoir_sampling)
557    {
558      /* Instead of reading the entire file into 'line',
559         use reservoir-sampling to store just "head_lines" random lines.  */
560      n_lines = read_input_reservoir_sampling (stdin, eolbyte, head_lines,
561                                               randint_source, &reservoir);
562      head_lines = n_lines;
563    }
564
565  /* Close stdin now, rather than earlier, so that randint_all_new
566     doesn't have to worry about opening something other than
567     stdin.  */
568  if (! (echo || input_range)
569      && (fclose (stdin) != 0))
570    error (EXIT_FAILURE, errno, _("read error"));
571
572  if (!repeat)
573    permutation = randperm_new (randint_source, head_lines, n_lines);
574
575  if (outfile && ! freopen (outfile, "w", stdout))
576    error (EXIT_FAILURE, errno, "%s", quotearg_colon (outfile));
577
578  /* Generate output according to requested method */
579  if (repeat)
580    {
581      if (head_lines == 0)
582        i = 0;
583      else
584        {
585          if (n_lines == 0)
586            error (EXIT_FAILURE, 0, _("no lines to repeat"));
587          if (input_range)
588            i = write_random_numbers (randint_source, head_lines,
589                                      lo_input, hi_input, eolbyte);
590          else
591            i = write_random_lines (randint_source, head_lines, line, n_lines);
592        }
593    }
594  else
595    {
596      if (use_reservoir_sampling)
597        i = write_permuted_output_reservoir (n_lines, reservoir, permutation);
598      else if (input_range)
599        i = write_permuted_numbers (head_lines, lo_input,
600                                    permutation, eolbyte);
601      else
602        i = write_permuted_lines (head_lines, line, permutation);
603    }
604
605  if (i != 0)
606    error (EXIT_FAILURE, errno, _("write error"));
607
608#ifdef lint
609  free (permutation);
610  randint_all_free (randint_source);
611  if (input_lines)
612    {
613      free (input_lines[0]);
614      free (input_lines);
615    }
616  if (reservoir)
617    {
618      size_t j;
619      for (j = 0; j < n_lines; ++j)
620        freebuffer (&reservoir[j]);
621      free (reservoir);
622    }
623#endif
624
625  return EXIT_SUCCESS;
626}
627