| /* `L' command implementation for GNU sed, based on GNU fmt 1.22. |
| Copyright (C) 1994, 1995, 1996, 2002, 2003 Free Software Foundation, Inc. |
| |
| This program is free software; you can redistribute it and/or modify |
| it under the terms of the GNU General Public License as published by |
| the Free Software Foundation; either version 3, or (at your option) |
| any later version. |
| |
| This program is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| GNU General Public License for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with this program; if not, write to the Free Software Foundation, |
| Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ |
| |
| /* GNU fmt was written by Ross Paterson <rap@doc.ic.ac.uk>. */ |
| |
| #include "sed.h" |
| |
| #include <stdio.h> |
| #include <string.h> |
| #include <ctype.h> |
| #include <sys/types.h> |
| |
| #if HAVE_LIMITS_H |
| # include <limits.h> |
| #endif |
| |
| #ifndef UINT_MAX |
| # define UINT_MAX ((unsigned int) ~(unsigned int) 0) |
| #endif |
| |
| #ifndef INT_MAX |
| # define INT_MAX ((int) (UINT_MAX >> 1)) |
| #endif |
| |
| /* The following parameters represent the program's idea of what is |
| "best". Adjust to taste, subject to the caveats given. */ |
| |
| /* Prefer lines to be LEEWAY % shorter than the maximum width, giving |
| room for optimization. */ |
| #define LEEWAY 7 |
| |
| /* Costs and bonuses are expressed as the equivalent departure from the |
| optimal line length, multiplied by 10. e.g. assigning something a |
| cost of 50 means that it is as bad as a line 5 characters too short |
| or too long. The definition of SHORT_COST(n) should not be changed. |
| However, EQUIV(n) may need tuning. */ |
| |
| typedef long COST; |
| |
| #define MAXCOST (~(((unsigned long) 1) << (8 * sizeof (COST) -1))) |
| |
| #define SQR(n) ((n) * (n)) |
| #define EQUIV(n) SQR ((COST) (n)) |
| |
| /* Cost of a filled line n chars longer or shorter than best_width. */ |
| #define SHORT_COST(n) EQUIV ((n) * 10) |
| |
| /* Cost of the difference between adjacent filled lines. */ |
| #define RAGGED_COST(n) (SHORT_COST (n) / 2) |
| |
| /* Basic cost per line. */ |
| #define LINE_COST EQUIV (70) |
| |
| /* Cost of breaking a line after the first word of a sentence, where |
| the length of the word is N. */ |
| #define WIDOW_COST(n) (EQUIV (200) / ((n) + 2)) |
| |
| /* Cost of breaking a line before the last word of a sentence, where |
| the length of the word is N. */ |
| #define ORPHAN_COST(n) (EQUIV (150) / ((n) + 2)) |
| |
| /* Bonus for breaking a line at the end of a sentence. */ |
| #define SENTENCE_BONUS EQUIV (50) |
| |
| /* Cost of breaking a line after a period not marking end of a sentence. |
| With the definition of sentence we are using (borrowed from emacs, see |
| get_line()) such a break would then look like a sentence break. Hence |
| we assign a very high cost -- it should be avoided unless things are |
| really bad. */ |
| #define NOBREAK_COST EQUIV (600) |
| |
| /* Bonus for breaking a line before open parenthesis. */ |
| #define PAREN_BONUS EQUIV (40) |
| |
| /* Bonus for breaking a line after other punctuation. */ |
| #define PUNCT_BONUS EQUIV(40) |
| |
| /* Credit for breaking a long paragraph one line later. */ |
| #define LINE_CREDIT EQUIV(3) |
| |
| /* Size of paragraph buffer in words. Longer paragraphs are handled |
| neatly (cf. flush_paragraph()), so there's little to gain by making |
| these larger. */ |
| #define MAXWORDS 1000 |
| |
| #define GETC() (parabuf == end_of_parabuf ? EOF : *parabuf++) |
| |
| /* Extra ctype(3)-style macros. */ |
| |
| #define isopen(c) (strchr ("([`'\"", (c)) != NULL) |
| #define isclose(c) (strchr (")]'\"", (c)) != NULL) |
| #define isperiod(c) (strchr (".?!", (c)) != NULL) |
| |
| /* Size of a tab stop, for expansion on input and re-introduction on |
| output. */ |
| #define TABWIDTH 8 |
| |
| /* Word descriptor structure. */ |
| |
| typedef struct Word WORD; |
| |
| struct Word |
| { |
| |
| /* Static attributes determined during input. */ |
| |
| const char *text; /* the text of the word */ |
| short length; /* length of this word */ |
| short space; /* the size of the following space */ |
| unsigned paren:1; /* starts with open paren */ |
| unsigned period:1; /* ends in [.?!])* */ |
| unsigned punct:1; /* ends in punctuation */ |
| unsigned final:1; /* end of sentence */ |
| |
| /* The remaining fields are computed during the optimization. */ |
| |
| short line_length; /* length of the best line starting here */ |
| COST best_cost; /* cost of best paragraph starting here */ |
| WORD *next_break; /* break which achieves best_cost */ |
| }; |
| |
| /* Forward declarations. */ |
| |
| static bool get_paragraph P_ ((void)); |
| static int get_line P_ ((int c)); |
| static int get_space P_ ((int c)); |
| static int copy_rest P_ ((int c)); |
| static bool same_para P_ ((int c)); |
| static void flush_paragraph P_ ((void)); |
| static void fmt_paragraph P_ ((void)); |
| static void check_punctuation P_ ((WORD *w)); |
| static COST base_cost P_ ((WORD *this)); |
| static COST line_cost P_ ((WORD *next, int len)); |
| static void put_paragraph P_ ((WORD *finish)); |
| static void put_line P_ ((WORD *w, int indent)); |
| static void put_word P_ ((WORD *w)); |
| static void put_space P_ ((int space)); |
| |
| /* Option values. */ |
| |
| /* User-supplied maximum line width (default WIDTH). The only output |
| lines |
| longer than this will each comprise a single word. */ |
| static int max_width; |
| |
| /* Space for the paragraph text. */ |
| static const char *parabuf; |
| |
| /* End of space for the paragraph text. */ |
| static const char *end_of_parabuf; |
| |
| /* The file on which we output */ |
| static FILE *outfile; |
| |
| /* Values derived from the option values. */ |
| |
| /* The preferred width of text lines, set to LEEWAY % less than max_width. */ |
| static int best_width; |
| |
| /* Dynamic variables. */ |
| |
| /* Start column of the character most recently read from the input file. */ |
| static int in_column; |
| |
| /* Start column of the next character to be written to stdout. */ |
| static int out_column; |
| |
| /* The words of a paragraph -- longer paragraphs are handled neatly |
| (cf. flush_paragraph()). */ |
| static WORD words[MAXWORDS]; |
| |
| /* A pointer into the above word array, indicating the first position |
| after the last complete word. Sometimes it will point at an incomplete |
| word. */ |
| static WORD *word_limit; |
| |
| /* Indentation of the first line of the current paragraph. */ |
| static int first_indent; |
| |
| /* Indentation of other lines of the current paragraph */ |
| static int other_indent; |
| |
| /* The last character read from the input file. */ |
| static int next_char; |
| |
| /* If nonzero, the length of the last line output in the current |
| paragraph, used to charge for raggedness at the split point for long |
| paragraphs chosen by fmt_paragraph(). */ |
| static int last_line_length; |
| |
| /* read file F and send formatted output to stdout. */ |
| |
| void |
| fmt (const char *line, const char *line_end, int max_length, FILE *output_file) |
| { |
| parabuf = line; |
| end_of_parabuf = line_end; |
| outfile = output_file; |
| |
| max_width = max_length; |
| best_width = max_width * (201 - 2 * LEEWAY) / 200; |
| |
| in_column = 0; |
| other_indent = 0; |
| next_char = GETC(); |
| while (get_paragraph ()) |
| { |
| fmt_paragraph (); |
| put_paragraph (word_limit); |
| } |
| } |
| |
| /* Read a paragraph from input file F. A paragraph consists of a |
| maximal number of non-blank (excluding any prefix) lines |
| with the same indent. |
| |
| Return false if end-of-file was encountered before the start of a |
| paragraph, else true. */ |
| |
| static bool |
| get_paragraph () |
| { |
| register int c; |
| |
| last_line_length = 0; |
| c = next_char; |
| |
| /* Scan (and copy) blank lines, and lines not introduced by the prefix. */ |
| |
| while (c == '\n' || c == EOF) |
| { |
| c = copy_rest (c); |
| if (c == EOF) |
| { |
| next_char = EOF; |
| return false; |
| } |
| putc ('\n', outfile); |
| c = GETC(); |
| } |
| |
| /* Got a suitable first line for a paragraph. */ |
| |
| first_indent = in_column; |
| word_limit = words; |
| c = get_line (c); |
| |
| /* Read rest of paragraph. */ |
| |
| other_indent = in_column; |
| while (same_para (c) && in_column == other_indent) |
| c = get_line (c); |
| |
| (word_limit - 1)->period = (word_limit - 1)->final = true; |
| next_char = c; |
| return true; |
| } |
| |
| /* Copy to the output a blank line. In the latter, C is \n or EOF. |
| Return the character (\n or EOF) ending the line. */ |
| |
| static int |
| copy_rest (register int c) |
| { |
| out_column = 0; |
| while (c != '\n' && c != EOF) |
| { |
| putc (c, outfile); |
| c = GETC(); |
| } |
| return c; |
| } |
| |
| /* Return true if a line whose first non-blank character after the |
| prefix (if any) is C could belong to the current paragraph, |
| otherwise false. */ |
| |
| static bool |
| same_para (register int c) |
| { |
| return (c != '\n' && c != EOF); |
| } |
| |
| /* Read a line from the input data given first non-blank character C |
| after the prefix, and the following indent, and break it into words. |
| A word is a maximal non-empty string of non-white characters. A word |
| ending in [.?!]["')\]]* and followed by end-of-line or at least two |
| spaces ends a sentence, as in emacs. |
| |
| Return the first non-blank character of the next line. */ |
| |
| static int |
| get_line (register int c) |
| { |
| int start; |
| register WORD *end_of_word; |
| |
| end_of_word = &words[MAXWORDS - 2]; |
| |
| do |
| { /* for each word in a line */ |
| |
| /* Scan word. */ |
| |
| word_limit->text = parabuf - 1; |
| do |
| c = GETC(); |
| while (c != EOF && !ISSPACE (c)); |
| word_limit->length = parabuf - word_limit->text - (c != EOF); |
| in_column += word_limit->length; |
| |
| check_punctuation (word_limit); |
| |
| /* Scan inter-word space. */ |
| |
| start = in_column; |
| c = get_space (c); |
| word_limit->space = in_column - start; |
| word_limit->final = (c == EOF |
| || (word_limit->period |
| && (c == '\n' || word_limit->space > 1))); |
| if (c == '\n' || c == EOF) |
| word_limit->space = word_limit->final ? 2 : 1; |
| if (word_limit == end_of_word) |
| flush_paragraph (); |
| word_limit++; |
| if (c == EOF) |
| { |
| in_column = first_indent; |
| return EOF; |
| } |
| } |
| while (c != '\n'); |
| |
| in_column = 0; |
| c = GETC(); |
| return get_space (c); |
| } |
| |
| /* Read blank characters from the input data, starting with C, and keeping |
| in_column up-to-date. Return first non-blank character. */ |
| |
| static int |
| get_space (register int c) |
| { |
| for (;;) |
| { |
| if (c == ' ') |
| in_column++; |
| else if (c == '\t') |
| in_column = (in_column / TABWIDTH + 1) * TABWIDTH; |
| else |
| return c; |
| c = GETC(); |
| } |
| } |
| |
| /* Set extra fields in word W describing any attached punctuation. */ |
| |
| static void |
| check_punctuation (register WORD *w) |
| { |
| register const char *start, *finish; |
| |
| start = w->text; |
| finish = start + (w->length - 1); |
| w->paren = isopen (*start); |
| w->punct = ISPUNCT (*finish); |
| while (isclose (*finish) && finish > start) |
| finish--; |
| w->period = isperiod (*finish); |
| } |
| |
| /* Flush part of the paragraph to make room. This function is called on |
| hitting the limit on the number of words or characters. */ |
| |
| static void |
| flush_paragraph (void) |
| { |
| WORD *split_point; |
| register WORD *w; |
| COST best_break; |
| |
| /* - format what you have so far as a paragraph, |
| - find a low-cost line break near the end, |
| - output to there, |
| - make that the start of the paragraph. */ |
| |
| fmt_paragraph (); |
| |
| /* Choose a good split point. */ |
| |
| split_point = word_limit; |
| best_break = MAXCOST; |
| for (w = words->next_break; w != word_limit; w = w->next_break) |
| { |
| if (w->best_cost - w->next_break->best_cost < best_break) |
| { |
| split_point = w; |
| best_break = w->best_cost - w->next_break->best_cost; |
| } |
| if (best_break <= MAXCOST - LINE_CREDIT) |
| best_break += LINE_CREDIT; |
| } |
| put_paragraph (split_point); |
| |
| /* Copy words from split_point down to word -- we use memmove because |
| the source and target may overlap. */ |
| |
| memmove ((char *) words, (char *) split_point, |
| (word_limit - split_point + 1) * sizeof (WORD)); |
| word_limit -= split_point - words; |
| } |
| |
| /* Compute the optimal formatting for the whole paragraph by computing |
| and remembering the optimal formatting for each suffix from the empty |
| one to the whole paragraph. */ |
| |
| static void |
| fmt_paragraph (void) |
| { |
| register WORD *start, *w; |
| register int len; |
| register COST wcost, best; |
| int saved_length; |
| |
| word_limit->best_cost = 0; |
| saved_length = word_limit->length; |
| word_limit->length = max_width; /* sentinel */ |
| |
| for (start = word_limit - 1; start >= words; start--) |
| { |
| best = MAXCOST; |
| len = start == words ? first_indent : other_indent; |
| |
| /* At least one word, however long, in the line. */ |
| |
| w = start; |
| len += w->length; |
| do |
| { |
| w++; |
| |
| /* Consider breaking before w. */ |
| |
| wcost = line_cost (w, len) + w->best_cost; |
| if (start == words && last_line_length > 0) |
| wcost += RAGGED_COST (len - last_line_length); |
| if (wcost < best) |
| { |
| best = wcost; |
| start->next_break = w; |
| start->line_length = len; |
| } |
| len += (w - 1)->space + w->length; /* w > start >= words */ |
| } |
| while (len < max_width); |
| start->best_cost = best + base_cost (start); |
| } |
| |
| word_limit->length = saved_length; |
| } |
| |
| /* Return the constant component of the cost of breaking before the |
| word THIS. */ |
| |
| static COST |
| base_cost (register WORD *this) |
| { |
| register COST cost; |
| |
| cost = LINE_COST; |
| |
| if (this > words) |
| { |
| if ((this - 1)->period) |
| { |
| if ((this - 1)->final) |
| cost -= SENTENCE_BONUS; |
| else |
| cost += NOBREAK_COST; |
| } |
| else if ((this - 1)->punct) |
| cost -= PUNCT_BONUS; |
| else if (this > words + 1 && (this - 2)->final) |
| cost += WIDOW_COST ((this - 1)->length); |
| } |
| |
| if (this->paren) |
| cost -= PAREN_BONUS; |
| else if (this->final) |
| cost += ORPHAN_COST (this->length); |
| |
| return cost; |
| } |
| |
| /* Return the component of the cost of breaking before word NEXT that |
| depends on LEN, the length of the line beginning there. */ |
| |
| static COST |
| line_cost (register WORD *next, register int len) |
| { |
| register int n; |
| register COST cost; |
| |
| if (next == word_limit) |
| return 0; |
| n = best_width - len; |
| cost = SHORT_COST (n); |
| if (next->next_break != word_limit) |
| { |
| n = len - next->line_length; |
| cost += RAGGED_COST (n); |
| } |
| return cost; |
| } |
| |
| /* Output to stdout a paragraph from word up to (but not including) |
| FINISH, which must be in the next_break chain from word. */ |
| |
| static void |
| put_paragraph (register WORD *finish) |
| { |
| register WORD *w; |
| |
| put_line (words, first_indent); |
| for (w = words->next_break; w != finish; w = w->next_break) |
| put_line (w, other_indent); |
| } |
| |
| /* Output to stdout the line beginning with word W, beginning in column |
| INDENT, including the prefix (if any). */ |
| |
| static void |
| put_line (register WORD *w, int indent) |
| { |
| register WORD *endline; |
| out_column = 0; |
| put_space (indent); |
| |
| endline = w->next_break - 1; |
| for (; w != endline; w++) |
| { |
| put_word (w); |
| put_space (w->space); |
| } |
| put_word (w); |
| last_line_length = out_column; |
| putc ('\n', outfile); |
| } |
| |
| /* Output to stdout the word W. */ |
| |
| static void |
| put_word (register WORD *w) |
| { |
| register const char *s; |
| register int n; |
| |
| s = w->text; |
| for (n = w->length; n != 0; n--) |
| putc (*s++, outfile); |
| out_column += w->length; |
| } |
| |
| /* Output to stdout SPACE spaces, or equivalent tabs. */ |
| |
| static void |
| put_space (int space) |
| { |
| out_column += space; |
| while (space--) |
| putc (' ', outfile); |
| } |