initial release of working fmt

Note: for now, this version does not use Knuth-Plass, but everything else is in place with "greedy" breaking. All options (should) work, and performance is nearly on par with GNU fmt. Squashed commit of the following local commits: commit ebc12f5e7d19d351ada9273ec0c42d66d3730431 commit 125fdabcb2a32de161c7a8b76c3e766a40ff9f76 commit dadd62acc093b5bd4bc94ad4f8a499d2663a7097 commit e436fdaade3876e92020c61a736eba54eb5ca0cf commit bbc4f4f6ad749753efe9b2df871ddb257f33de4b commit 12bc4ecb0c56c0d43515a111e9129a4bfaf36531 commit 2e693553ed9af59c53ee13026d19c9f82f2973fc commit 9b15a130148d62dd6a1d2765848ddc4daf30c649 commit ea335eb2869afcc94709345118fab3fb2e612954 Merge: ee92573 23cc41d commit 23cc41d188cb3134c04872fd77acb331d86a64ea commit 2fa7c48133001d86da39feda04d870ff67e88400 commit eb71558ee46654b568adf167f194cb854bbf7056 commit c8baabc0b86d831b5741fa496c312134db652c55 commit ee4fab44b216c1d9c7dcdcdc29ca587c76284834 commit c5444416a531ae1341dddbfd528e4a3ee5f106bf commit e1177d47941654b8834d18599c80065943a26159 commit c7fb30e2ff32313974f99d34ba4735be064b0cc5 commit 99a9406bc6fff33fc64c190356e48f443312a6c4 commit 3d244d62c9b60b579f2e5b723da6389a5dbc8805 commit 2d4f09cb2ff83664730edba209ec129abdcf1403 commit 947c32b72bff8d50e362555ec21a6b848d5fec9f commit 8556d2a3467651ee7833ad800876af35a7dd5db7 commit a2e4bc3dc45e5f39b402e6fdd3e19edcea6d3c34 Merge: 0308884 439e65d commit 03088844f1fd2faca6c3471230730136dd140f35 commit ac80d888649dd1311fdaa68400ea45d52b2e23ab commit c1d6b36acb7038e14d5b3e1fb6a44614a3351f96 commit 6539b102593aa9d9570df8be99ca1a1bf01ea1f4 commit 439e65d3331936e00fa89a4b2f88c343b9e28c5b commit fac27de7c4918bc5cf1a1ac1a43550236ba8af4d commit 365989c5bbe5c2289648f6efbc3c9388388e30a0 commit 3dd71364cce9aaaa773fc88eb206aba31aa61390
2024-07-22 02:14:38 +00:00 · 2014-06-18 20:43:26 -04:00 · 2014-06-18 20:43:26 -04:00 · ac216c3d77
parent 6039626490
commit ac216c3d77
3 changed files with 813 additions and 0 deletions
--- a/fmt/fmt.rs
+++ b/fmt/fmt.rs
@ -0,0 +1,250 @@
+#![crate_id(name="fmt", vers="0.0.1", author="kwantam")]
+/*
+ * This file is part of `fmt` from the uutils coreutils package.
+ *
+ * (c) kwantam <kwantam@gmail.com>
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+#![feature(macro_rules)]
+
+extern crate core;
+extern crate getopts;
+extern crate libc;
+
+use std::io::{BufferedReader, BufferedWriter, File, IoResult};
+use std::io::stdio::{stdin_raw, stdout_raw, stdout};
+use std::os;
+use linebreak::break_simple;
+use parasplit::{ParagraphStream, ParaWords};
+
+#[macro_export]
+macro_rules! silent_unwrap(     // evaluate $exp (a Result) and throw away the Ok value;
+    ($exp:expr) => (            // on Err, end the process with exit status 1
+        match $exp {            // without printing any diagnostic
+            Ok(_)   => (),
+            Err(_)  => unsafe { ::libc::exit(1) }
+        }
+    )
+)
+#[path = "../common/util.rs"]
+mod util;
+mod linebreak;
+mod parasplit;
+
+// program's NAME and VERSION are used for -V and -h
+static NAME: &'static str = "fmt";
+static VERSION: &'static str = "0.0.1";
+
+struct FmtOptions {
+    crown       : bool,     // -c: crown-margin mode
+    tagged      : bool,     // -t: tagged-paragraph mode (cleared again by -c or -s)
+    mail        : bool,     // -m: detect and preserve mail headers
+    split_only  : bool,     // -s: split long lines only, never join lines
+    use_prefix  : bool,     // true when -p PREFIX was given
+    prefix      : String,   // the PREFIX string from -p
+    xprefix     : bool,     // -x: PREFIX must match with no preceding whitespace
+    prefix_len  : uint,     // length of PREFIX in chars
+    use_anti_prefix : bool, // true when -P PSKIP was given
+    anti_prefix : String,   // the PSKIP string from -P
+    xanti_prefix: bool,     // -X: PSKIP must match with no preceding whitespace
+    uniform     : bool,     // -u: uniform spacing
+    width       : uint,     // -w: maximum output line width, default 78
+    goal        : uint,     // -g: goal width, default 72
+    tabwidth    : uint,     // -T: columns per tab when measuring lines, default 8, never less than 1
+}
+
+// program entry point: run uumain on the process arguments and report its
+// return value as the exit status
+#[allow(dead_code)]
+fn main() {
+    let status = uumain(os::args());
+    os::set_exit_status(status)
+}
+
+fn uumain(args: Vec<String>) -> int {  // parse the command line, then reflow every input file to stdout; the return value is the exit status
+
+    let opts = [
+        getopts::optflag("c", "crown-margin", "First and second line of paragraph may have different indentations, in which case the first line's indentation is preserved, and each subsequent line's indentation matches the second line."),
+        getopts::optflag("t", "tagged-paragraph", "Like -c, except that the first and second line of a paragraph *must* have different indentation or they are treated as separate paragraphs."),
+        getopts::optflag("m", "preserve-headers", "Attempt to detect and preserve mail headers in the input. Be careful when combining this flag with -p."),
+        getopts::optflag("s", "split-only", "Split lines only, do not reflow."),
+        getopts::optflag("u", "uniform-spacing", "Insert exactly one space between words, and two between sentences. Sentence breaks in the input are detected as [?!.] followed by two spaces or a newline; other punctuation is not interpreted as a sentence break."),
+
+        getopts::optopt("p", "prefix", "Reformat only lines beginning with PREFIX, reattaching PREFIX to reformatted lines. Unless -x is specified, leading whitespace will be ignored when matching PREFIX.", "PREFIX"),
+        getopts::optopt("P", "skip-prefix", "Do not reformat lines beginning with PSKIP. Unless -X is specified, leading whitespace will be ignored when matching PSKIP", "PSKIP"),
+
+        getopts::optflag("x", "exact-prefix", "PREFIX must match at the beginning of the line with no preceding whitespace."),
+        getopts::optflag("X", "exact-skip-prefix", "PSKIP must match at the beginning of the line with no preceding whitespace."),
+
+        getopts::optopt("w", "width", "Fill output lines up to a maximum of WIDTH columns, default 78.", "WIDTH"),
+        getopts::optopt("g", "goal", "Goal width, default ~0.92*WIDTH. Must be less than WIDTH.", "GOAL"),
+
+        getopts::optopt("T", "tab-width", "Treat tabs as TABWIDTH spaces for determining line length, default 8. Note that this is used only for calculating line lengths; tabs are preserved in the output.", "TABWIDTH"),
+
+        getopts::optflag("V", "version", "Output version information and exit."),
+        getopts::optflag("h", "help", "Display this help message and exit.")
+            ];
+
+    let matches = match getopts::getopts(args.tail(), opts.as_slice()) {
+        Ok(m) => m,
+        Err(f) => crash!(1, "{}\nTry `{} --help' for more information.", f, args.get(0))
+    };
+
+    if matches.opt_present("h") {
+        print_usage(args.get(0).as_slice(), opts.as_slice(), "");
+    }
+
+    if matches.opt_present("V") || matches.opt_present("h") {  // -h lands here too, so the help text is followed by the version line
+        println!("uutils {} v{}", NAME, VERSION);
+        return 0
+    }
+
+    let mut fmt_opts = FmtOptions { crown       : false
+                                  , tagged      : false
+                                  , mail        : false
+                                  , uniform     : false
+                                  , split_only  : false
+                                  , use_prefix  : false
+                                  , prefix      : String::new()
+                                  , xprefix     : false
+                                  , prefix_len  : 0
+                                  , use_anti_prefix : false
+                                  , anti_prefix : String::new()
+                                  , xanti_prefix: false
+                                  , width       : 78
+                                  , goal        : 72
+                                  , tabwidth    : 8
+                                  };
+    
+    if matches.opt_present("t") { fmt_opts.tagged       = true; }
+    if matches.opt_present("c") { fmt_opts.crown        = true; fmt_opts.tagged = false; }  // -c takes precedence over -t
+    if matches.opt_present("m") { fmt_opts.mail         = true; }
+    if matches.opt_present("u") { fmt_opts.uniform      = true; }
+    if matches.opt_present("s") { fmt_opts.split_only   = true; fmt_opts.crown = false; fmt_opts.tagged = false; }  // -s switches off both -c and -t
+    if matches.opt_present("x") { fmt_opts.xprefix      = true; }
+    if matches.opt_present("X") { fmt_opts.xanti_prefix = true; }
+
+    match matches.opt_str("p") {
+        Some(s) => { fmt_opts.prefix = s; fmt_opts.use_prefix = true; fmt_opts.prefix_len = fmt_opts.prefix.as_slice().char_len() },
+        None    => ()
+    };
+
+    match matches.opt_str("P") {
+        Some(s) => { fmt_opts.anti_prefix = s; fmt_opts.use_anti_prefix = true; },
+        None    => ()
+    };
+
+    match matches.opt_str("w") {
+        Some(s) => { fmt_opts.width = match from_str(s.as_slice()) {
+                                        Some(t) => t,
+                                        None    => { crash!(1, "Invalid WIDTH specification: `{}'", s); }
+                                      };
+                     fmt_opts.goal = std::cmp::min(fmt_opts.width * 92 / 100, fmt_opts.width - 4);  // re-derive the goal from the new width; -g below may override it
+                   },
+        None    => ()
+    };
+
+    match matches.opt_str("g") {
+        Some(s) => { fmt_opts.goal = match from_str(s.as_slice()) {
+                                        Some(t) => t,
+                                        None    => { crash!(1, "Invalid GOAL specification: `{}'", s); }
+                                     };
+                     if ! matches.opt_present("w") {
+                         fmt_opts.width = std::cmp::max(fmt_opts.goal * 100 / 92, fmt_opts.goal + 4);  // no -w given: derive the width from the goal
+                     } else if fmt_opts.goal > fmt_opts.width {
+                         crash!(1, "GOAL cannot be greater than WIDTH.");
+                     }
+                   },
+        None    => ()
+    };
+
+    match matches.opt_str("T") {
+        Some(s) => fmt_opts.tabwidth = match from_str(s.as_slice()) {
+                                        Some(t) => t,
+                                        None    => { crash!(1, "Invalid TABWIDTH specification: `{}'", s); }
+                                       },
+        None    => ()
+    };
+
+    if fmt_opts.tabwidth < 1 {  // a tab always counts for at least one column
+        fmt_opts.tabwidth = 1;
+    }
+
+    // the options are final from here on: rebind them immutably
+    let fmt_opts = fmt_opts;
+
+    let mut files = matches.free;
+    if files.is_empty() {
+        files.push("-".to_string());  // no file operands: "-" makes open_file read standard input
+    }
+
+    let mut ostream = box BufferedWriter::new(stdout_raw()) as Box<Writer>;
+
+    for i in files.iter().map(|x| x.as_slice()) {
+        let mut fp = match open_file(i) {
+                         Err(e) =>  { show_warning!("{}: {}",i,e);
+                                      continue;
+                                    }
+                         Ok(f)  => f
+                     };
+        let mut pStream = ParagraphStream::new(&fmt_opts, &mut fp);
+        for paraResult in pStream {
+            match paraResult {
+                Err(s)      => silent_unwrap!(ostream.write(s.as_bytes())),  // a line that is not to be formatted: copy it through unchanged
+                Ok(para)    => {
+                    // indent: prefix indent, then PREFIX, then the paragraph indent
+                    let pIndent = para.pfxind_str.clone().append(fmt_opts.prefix.as_slice()).append(para.indent_str.as_slice());
+                    let pIndentLen = para.pfxind_len + fmt_opts.prefix_len + para.indent_len;
+
+                    // words
+                    let pWords = ParaWords::new(&fmt_opts, &para);
+                    let mut pWords_words = pWords.words().map(|&x| x);
+
+                    // print the init, if it exists, and get its length
+                    let pInitLen =
+                        if fmt_opts.crown || fmt_opts.tagged {
+                            // handle "init" portion
+                            silent_unwrap!(ostream.write(para.init_str.as_bytes()));
+                            para.init_len
+                        } else if ! para.mail_header {
+                            // for non-(crown, tagged) that's the same as a normal indent
+                            silent_unwrap!(ostream.write(pIndent.as_bytes()));
+                            pIndentLen
+                        } else {
+                            // except that mail headers get no indent at all
+                            0
+                        };
+
+                    // does this paragraph require uniform spacing?
+                    let uniform = para.mail_header || fmt_opts.uniform;
+
+                    break_simple(&mut pWords_words, fmt_opts.width, pIndent.as_slice(), pIndentLen, pInitLen, uniform, &mut ostream);  // only width is passed on; goal is not used by this breaker
+                    silent_unwrap!(ostream.write("\n".as_bytes()));
+                }
+            }
+        }
+
+        // flush the output after each file
+        silent_unwrap!(ostream.flush());
+    }
+
+    0
+}
+
+// print the help text: the one-line synopsis wrapped to 64 columns with a
+// 7-space hanging indent, then the full option listing followed by errmsg
+fn print_usage(arg0: &str, opts: &[getopts::OptGroup], errmsg: &str) {
+    let synopsis = getopts::short_usage(arg0, opts);
+    {
+        // keep the writer in its own scope so that it is dropped before
+        // println! below writes through a different stdout handle
+        let mut words = synopsis.as_slice().words();
+        let mut out = box stdout() as Box<Writer>;
+        break_simple(&mut words, 64, "       ", 7, 0, true, &mut out);
+    }
+    let details = getopts::usage("Reformat paragraphs from input files (or stdin) to stdout.", opts);
+    println!("\n\n{}{}", details, errmsg);
+}
+
+// uniform interface for opening files
+// since we don't need seeking
+type FileOrStdReader = BufferedReader<Box<Reader>>;
+
+// open the named input for buffered reading; the name "-" selects standard
+// input. any error from opening a real file is handed back to the caller
+fn open_file(filename: &str) -> IoResult<FileOrStdReader> {
+    if filename == "-" {
+        return Ok(BufferedReader::new(box stdin_raw() as Box<Reader>));
+    }
+
+    match File::open(&Path::new(filename)) {
+        Err(err)    => Err(err),
+        Ok(file)    => Ok(BufferedReader::new(box file as Box<Reader>))
+    }
+}
--- a/fmt/linebreak.rs
+++ b/fmt/linebreak.rs
@ -0,0 +1,33 @@
+/*
+ * This file is part of `fmt` from the uutils coreutils package.
+ *
+ * (c) kwantam <kwantam@gmail.com>
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+// break_simple implements the "tight" breaking algorithm: print words until
+// maxlength would be exceeded, then print a linebreak and indent and continue.
+// Note that any first line indent should already have been printed before
+// calling this function, and the length of said indent should be passed as
+// init_len
+pub fn break_simple<'a, T: Iterator<&'a str>>(s: &'a mut T, maxlen: uint, indent_str: &'a str, indent_len: uint, init_len: uint, uniform: bool, ostream: &mut Box<Writer>) -> uint {
+    // length of the output line currently being filled; it starts at the
+    // length of whatever the caller has already printed on the first line
+    let mut cur_len = init_len;
+    loop {
+        match s.next() {
+            Some(word)  => cur_len = accum_words_simple(maxlen, indent_len, indent_str, ostream, uniform, cur_len, word),
+            None        => break
+        }
+    }
+    // the length of the last line written
+    cur_len
+}
+
+// write one word to ostream, first starting a new line (newline followed by
+// indent_str) when the word would push the current line past maxlen.
+//   l : length of the current output line so far
+//   w : the word to write
+// in uniform mode a single space is written after the word.
+// returns the new line length: the word's length plus one separator column.
+fn accum_words_simple(maxlen: uint, indent_len: uint, indent_str: &str, ostream: &mut Box<Writer>, uniform: bool, l: uint, w: &str) -> uint {
+    // measure the word in chars rather than bytes: the indent lengths it is
+    // added to are char counts, and a byte count would make words containing
+    // multi-byte UTF-8 sequences look wider than they are and break too early
+    let wlen = w.char_len();
+    let lnew =
+        if l + wlen > maxlen {
+            silent_unwrap!(ostream.write("\n".as_bytes()));
+            silent_unwrap!(ostream.write(indent_str.as_bytes()));
+            indent_len
+        } else {
+            l
+        };
+
+    silent_unwrap!(ostream.write(w.as_bytes()));
+    if uniform { silent_unwrap!(ostream.write(" ".as_bytes())); }
+    lnew + wlen + 1
+}
--- a/fmt/parasplit.rs
+++ b/fmt/parasplit.rs
@ -0,0 +1,530 @@
+/*
+ * This file is part of `fmt` from the uutils coreutils package.
+ *
+ * (c) kwantam <kwantam@gmail.com>
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+use core::iter::Peekable;
+use std::io::Lines;
+use std::slice::Items;
+use std::str::CharRange;
+use FileOrStdReader;
+use FmtOptions;
+
+// lines with PSKIP, lacking PREFIX, or which are entirely blank are
+// NoFormatLines; otherwise, they are FormatLines
+#[deriving(Show)]
+enum Line {
+    FormatLine(FileLine),           // a line to be reflowed, together with its indent and prefix offsets
+    NoFormatLine(String, bool)      // text to be emitted unchanged; the bool is true when the line was blank
+}
+
+impl Line {
+    // take the FileLine out of a FormatLine. the caller must already know the
+    // variant (ParagraphStream peeks first); a NoFormatLine here is a failure
+    fn get_fileline(self) -> FileLine {
+        match self {
+            NoFormatLine(..)        => fail!("Found NoFormatLine when expecting FormatLine"),
+            FormatLine(file_line)   => file_line
+        }
+    }
+
+    // take the (text, was-blank) pair out of a NoFormatLine. the caller must
+    // already know the variant; a FormatLine here is a failure
+    fn get_noformatline(self) -> (String, bool) {
+        match self {
+            FormatLine(..)              => fail!("Found FormatLine when expecting NoFormatLine"),
+            NoFormatLine(text, blank)   => (text, blank)
+        }
+    }
+}
+
+// each line's prefix has to be considered to know whether to merge it with
+// the next line or not
+#[deriving(Show)]
+struct FileLine {
+    line            : String,   // the text of the line, with a trailing newline replaced by a space
+    indent_end      : uint,     // byte offset of the end of the indent, always the start of the text
+    prefix_end      : uint,     // byte offset of the end of the PREFIX (0 when no PREFIX is in use)
+    pfxind_end      : uint,     // byte offset of the end of the PREFIX's indent, that is, the whitespace before the prefix
+    indent_len      : uint,     // display length of indent taking into account TABWIDTH
+    pfxind_len      : uint,     // PREFIX indent length taking into account TABWIDTH
+}
+
+// iterator that produces a stream of Lines from a file
+struct FileLines<'a> {
+    opts            : &'a FmtOptions,                   // options that decide prefix matching, tab width and uniform mode
+    lines           : Lines<'a, FileOrStdReader>,       // the underlying iterator over the raw input lines
+}
+
+impl<'a> FileLines<'a> {
+    // pair a raw line iterator with the options used to classify its lines
+    fn new<'a>(opts: &'a FmtOptions, lines: Lines<'a, FileOrStdReader>) -> FileLines<'a> {
+        FileLines { lines: lines, opts: opts }
+    }
+
+    // does this line carry PREFIX, i.e. should it be formatted? the second
+    // value is the byte offset at which PREFIX starts. when no PREFIX is
+    // configured, every line matches at offset 0
+    fn match_prefix(&self, line: &str) -> (bool, uint) {
+        if self.opts.use_prefix {
+            FileLines::match_prefix_generic(self.opts.prefix.as_slice(), line, self.opts.xprefix)
+        } else {
+            (true, 0u)
+        }
+    }
+
+    // true if this line should be formatted, that is, if it does NOT carry
+    // PSKIP (or if no PSKIP is configured at all)
+    fn match_anti_prefix(&self, line: &str) -> bool {
+        if ! self.opts.use_anti_prefix { return true; }
+
+        let (skip, _) = FileLines::match_prefix_generic(self.opts.anti_prefix.as_slice(), line, self.opts.xanti_prefix);
+        ! skip
+    }
+
+    // look for pfx at the very start of line or, unless exact is set, after
+    // any run of leading whitespace. returns (found, byte offset of pfx)
+    fn match_prefix_generic(pfx: &str, line: &str, exact: bool) -> (bool, uint) {
+        if line.starts_with(pfx) {
+            return (true, 0);
+        }
+        if exact {
+            return (false, 0);
+        }
+
+        // step over the leading whitespace one char at a time (not one byte at
+        // a time, so that unicode whitespace works) and retry after each step
+        let mut pos = 0u;
+        while (pos < line.len()) && line.char_at(pos).is_whitespace() {
+            pos = line.char_range_at(pos).next;
+            if line.slice_from(pos).starts_with(pfx) {
+                return (true, pos);
+            }
+        }
+
+        (false, 0)
+    }
+}
+
+impl<'a> Iterator<Line> for FileLines<'a> {
+    // read the next input line and classify it.
+    // returns None at end of input or on a read error; NoFormatLine for a
+    // blank line, a line lacking PREFIX or a line carrying PSKIP; otherwise a
+    // FormatLine holding the line and the offsets/lengths of its indentation
+    fn next(&mut self) -> Option<Line> {
+        let mut n =
+            match self.lines.next() {
+                Some(t) => match t {
+                    Ok(tt)   => tt,
+                    Err(_)   => return None
+                },
+                None    => return None
+            };
+
+        // if this line is entirely whitespace,
+        // emit a blank line
+        // the `true` in NoFormatLine(_, true) indicates that this was a blank line,
+        // which is important to know when detecting mail headers
+        if n.as_slice().is_whitespace() {
+            return Some(NoFormatLine("\n".to_string(), true));
+        }
+
+        // if this line does not match the prefix,
+        // emit the line unprocessed
+        let (pmatch, poffset) = self.match_prefix(n.as_slice());
+        if ! pmatch {
+            return Some(NoFormatLine(n, false));
+        }
+
+        // if this line matches the anti_prefix
+        // (NOTE definition of match_anti_prefix is TRUE if we should process)
+        if ! self.match_anti_prefix(n.as_slice()) {
+            return Some(NoFormatLine(n, false));
+        }
+
+        // replace trailing newline, if any, with space
+        let CharRange {ch, next: i} = n.as_slice().char_range_at_reverse(n.len());
+        if ch == '\n' {
+            // SAFETY: one ASCII byte ('\n') is overwritten with another ASCII
+            // byte (' '), so the string remains valid UTF-8
+            unsafe {
+                let nmut = n.as_mut_bytes();
+                nmut[i] = ' ' as u8;
+            }
+            if i > 0 {
+                // a sentence break is [?!.] followed by two spaces or by a
+                // newline; pad with a second space so that a line ending in
+                // any of the three looks the same as the two-space form
+                let CharRange {ch, next: _} = n.as_slice().char_range_at_reverse(i);
+                if ch == '.' || ch == '?' || ch == '!' {
+                    n.push_char(' ');
+                }
+            }
+        }
+
+        let nLen = n.len();
+        // figure out the indent, prefix, and prefixindent ending points
+        let (indEnd, pfxEnd, pfxIndEnd) =
+            if self.opts.use_prefix {
+                let pfxEnd = poffset + self.opts.prefix.len();
+                let nSlice = n.as_slice().slice_from(pfxEnd);
+                let nSlice2 = nSlice.trim_left();
+
+                (pfxEnd + nSlice.len() - nSlice2.len(), pfxEnd, poffset)
+            } else {
+                let nSlice = n.as_slice().trim_left();
+
+                (nLen - nSlice.len(), 0, 0)
+            };
+
+        // indent length
+        let indLen =
+            if indEnd > 0 {
+                let nSlice = n.as_slice().slice(pfxEnd, indEnd);
+                nSlice.char_len() + (self.opts.tabwidth - 1) * nSlice.chars().filter(|x| x == &'\t').count()
+            } else {
+                0
+            };
+
+        // prefix indent length
+        let pfxIndLen =
+            if pfxIndEnd > 0 {
+                let nSlice = n.as_slice().slice_to(pfxIndEnd);
+                nSlice.char_len() + (self.opts.tabwidth - 1) * nSlice.chars().filter(|x| x == &'\t').count()
+            } else {
+                0
+            };
+
+        // if we are in uniform mode, all tabs after the indent should be replaced by spaces.
+        // NOTE that in this implementation, [?!.]\t is NOT detected as a sentence break, but
+        // [?!.]\t\t is. We could expand tabs to two spaces to force detection of tab as
+        // sentence ending
+        if self.opts.uniform {
+            // these indices are relative to the slice that starts at indEnd
+            let tabinds: Vec<uint> = n.as_slice().slice_from(indEnd).char_indices().filter_map(|(i,c)| if c == '\t' { Some(i) } else { None }).collect();
+            // SAFETY: each overwritten byte is an ASCII tab and is replaced by
+            // an ASCII space, so the string remains valid UTF-8
+            unsafe {
+                let nmut = n.as_mut_bytes();
+                for i in tabinds.iter() {
+                    // add indEnd back to turn the slice-relative index into an
+                    // index into the whole line; without it the wrong bytes
+                    // are overwritten whenever the indent is non-empty
+                    nmut[indEnd + *i] = ' ' as u8;
+                }
+            }
+        }
+
+        Some(FormatLine(FileLine { line: n
+                                 , indent_end: indEnd
+                                 , prefix_end: pfxEnd
+                                 , pfxind_end: pfxIndEnd
+                                 , indent_len: indLen
+                                 , pfxind_len: pfxIndLen
+                                 }))
+    }
+}
+
+// a paragraph : a collection of FileLines that are to be formatted
+// plus info about the paragraph's indentation
+// (but we only retain the String from the FileLine; the other info
+// is only there to help us in deciding how to merge lines into Paragraphs
+#[deriving(Show)]
+pub struct Paragraph {
+    lines               : Vec<String>,  // the lines of the file that make up this paragraph
+    pub init_str        : String,       // string representing the init, that is, the first line's indent (only filled in for crown and tagged mode)
+    pub init_len        : uint,         // printable length of the init string considering TABWIDTH
+    init_end            : uint,         // byte location of end of init in first line String
+    pub indent_str      : String,       // string representing indent
+    pub indent_len      : uint,         // printable length of the indent string
+    indent_end          : uint,         // byte location of end of indent (in crown and tagged mode, only applies to 2nd line and onward)
+    pub pfxind_str      : String,       // string representing the prefix indent
+    pub pfxind_len      : uint,         // printable length of the prefix indent string
+    pub mail_header     : bool          // we need to know if this is a mail header because we do word splitting differently in that case
+}
+
+// an iterator producing a stream of paragraphs from a stream of lines
+// given a set of options.
+// NOTE as you iterate through the paragraphs, any NoFormatLines are
+// returned as Err(String) so that the caller can write them out immediately.
+pub struct ParagraphStream<'a> {
+    lines               : Peekable<Line,FileLines<'a>>,     // the classified input lines, with one line of lookahead
+    next_mail           : bool,                             // whether the next paragraph may be recognised as a mail header
+    opts                : &'a FmtOptions,                   // the formatting options in effect
+}
+
+impl<'a> ParagraphStream<'a> {
+    // build a paragraph stream over the lines of reader
+    pub fn new<'a>(opts: &'a FmtOptions, reader: &'a mut FileOrStdReader) -> ParagraphStream<'a> {
+        // next_mail starts out true: mail headers may appear at the very
+        // beginning of a file
+        ParagraphStream { lines: FileLines::new(opts, reader.lines()).peekable(), next_mail: true, opts: opts }
+    }
+
+    // detect RFC822 mail header: an unindented line that starts with "From "
+    // (envelope sender line), or whose text before the first colon is
+    // non-empty and made up only of printable ASCII chars (33 to 126)
+    fn is_mail_header(line: &FileLine) -> bool {
+        if line.indent_end > 0 {
+            return false;
+        }
+
+        let text = line.line.as_slice();
+        if text.starts_with("From ") {
+            return true;
+        }
+
+        match text.find(':') {
+            // no colon at all, or an empty field name in front of it
+            None | Some(0)  => false,
+            Some(colon)     => text.slice_to(colon).chars().all(|c| {
+                                   let code = c as uint;
+                                   code >= 33 && code <= 126
+                               })
+        }
+    }
+}
+
+impl<'a> Iterator<Result<Paragraph,String>> for ParagraphStream<'a> {
+    fn next(&mut self) -> Option<Result<Paragraph,String>> {  // yields Ok(Paragraph) for a run of lines to reflow, Err(text) for a line to pass through, None at end of input
+        // return a NoFormatLine in an Err; it should immediately be output
+        let noformat =
+            match self.lines.peek() {
+                None    => return None,
+                Some(l) => match l {
+                    &FormatLine(_)      => false,
+                    &NoFormatLine(_, _) => true
+                }
+            };
+
+        // found a NoFormatLine, immediately dump it out
+        if noformat {
+            let (s, nm) = self.lines.next().unwrap().get_noformatline();
+            self.next_mail = nm;  // nm is true only for a blank line, after which a mail header may follow
+            return Some(Err(s));
+        }
+
+        // found a FormatLine, now build a paragraph
+        let mut init_str = String::new();
+        let mut init_end = 0;
+        let mut init_len = 0;
+        let mut indent_str = String::new();
+        let mut indent_end = 0;
+        let mut indent_len = 0;
+        let mut pfxind_str = String::new();
+        let mut pfxind_len = 0;
+        let mut pLines = Vec::new();
+
+        let mut in_mail = false;
+        let mut second_done = false;    // for when we use crown or tagged mode
+        loop {  // each pass peeks at one line and either breaks (paragraph ends before it) or consumes it
+            {   // peek ahead
+            // need to explicitly force fl out of scope before we can call self.lines.next()
+                let fl =
+                    match self.lines.peek() {
+                        None    => break,
+                        Some(l) => {
+                            match l {
+                                &FormatLine(ref x)   => x,
+                                &NoFormatLine(..)    => break
+                            }
+                        }
+                    };
+
+                if pLines.len() == 0 {
+                    // first time through the loop, get things set up
+                    // detect mail header
+                    if self.opts.mail && self.next_mail && ParagraphStream::is_mail_header(fl) {
+                        in_mail = true;
+                        // there can't be any indent or pfxind because otherwise is_mail_header would fail
+                        // since there cannot be any whitespace before the colon in a valid header field
+                        indent_str.push_str("  ");
+                        indent_len = 2;
+                    } else {
+                        if self.opts.crown || self.opts.tagged {
+                            init_str.push_str(fl.line.as_slice().slice_to(fl.indent_end));
+                            init_len = fl.indent_len + fl.pfxind_len + self.opts.prefix_len;
+                            init_end = fl.indent_end;
+                        } 
+
+                        // these will be overwritten in the 2nd line of crown or tagged mode, but
+                        // we are not guaranteed to get to the 2nd line, e.g., if the next line
+                        // is a NoFormatLine or None. Thus, we set sane defaults the 1st time around
+                        indent_str.push_str(fl.line.as_slice().slice(fl.prefix_end,fl.indent_end));
+                        indent_len = fl.indent_len;
+                        indent_end = fl.indent_end;
+
+                        // in tagged mode, add 4 spaces of additional indenting by default
+                        // (gnu fmt's behavior is different: it seems to find the closest column to
+                        // indent_end that is divisible by 3. But honestly that behavior seems
+                        // pretty arbitrary.)
+                        // Perhaps a better default would be 1 TABWIDTH? But ugh that's so big.
+                        if self.opts.tagged {
+                            indent_str.push_str("    ");
+                            indent_len += 4;
+                        }
+
+                        if self.opts.use_prefix {
+                            pfxind_str.push_str(fl.line.as_slice().slice_to(fl.pfxind_end));
+                            pfxind_len = fl.pfxind_len;
+                        }
+                    }
+                } else if in_mail {
+                    // lines following mail headers must begin with spaces
+                    if (self.opts.use_prefix && fl.pfxind_end == 0) || (! self.opts.use_prefix && fl.indent_end == 0) {
+                        break;  // this line does not begin with spaces
+                    }
+                } else if ! second_done && (self.opts.crown || self.opts.tagged) {
+                    // now we have enough info to handle crown margin and tagged mode
+                    if pfxind_len != fl.pfxind_len {
+                        // in both crown and tagged modes we require that pfxind is the same
+                        break;
+                    } else if self.opts.tagged && (indent_end == fl.indent_end) {
+                        // in tagged mode, indent also has to be different
+                        break;
+                    } else {
+                        // this is part of the same paragraph, get the indent info from this line
+                        indent_str.clear();
+                        indent_str.push_str(fl.line.as_slice().slice(fl.prefix_end,fl.indent_end));
+                        indent_len = fl.indent_len;
+                        indent_end = fl.indent_end;
+                    }
+                    second_done = true;
+                } else {
+                    // detect mismatch: a change in indent or prefix indent starts a new paragraph
+                    if (indent_end != fl.indent_end) || (indent_len != fl.indent_len) || (pfxind_len != fl.pfxind_len) {
+                        break;
+                    }
+                }
+            }
+
+            pLines.push(self.lines.next().unwrap().get_fileline().line);  // the peeked line belongs to this paragraph: consume it, keeping only its text
+
+            // when we're in split-only mode, we never join lines, so stop here
+            if self.opts.split_only {
+                break;
+            }
+        }
+
+        // if this was a mail header, then the next line can be detected as one. Otherwise, it cannot.
+        // NOTE next_mail is true at ParagraphStream instantiation, and is set to true after a blank
+        // NoFormatLine.
+        self.next_mail = in_mail;
+
+        Some(Ok(Paragraph { lines: pLines
+                          , init_str: init_str
+                          , init_len: init_len
+                          , init_end: init_end
+                          , indent_str: indent_str
+                          , indent_len: indent_len
+                          , indent_end: indent_end
+                          , pfxind_str: pfxind_str
+                          , pfxind_len: pfxind_len
+                          , mail_header: in_mail
+                          }))
+    }
+}
+
+pub struct ParaWords<'a> {
+    opts    : &'a FmtOptions,   // the formatting options in effect
+    para    : &'a Paragraph,    // the paragraph whose lines are split into words
+    words   : Vec<&'a str>      // the words, as slices borrowed from the paragraph's lines
+}
+
+impl<'a> ParaWords<'a> {
+    // split para into words according to opts; the word list is built eagerly
+    pub fn new<'a>(opts: &'a FmtOptions, para: &'a Paragraph) -> ParaWords<'a> {
+        let mut para_words = ParaWords { words: Vec::new(), opts: opts, para: para };
+        para_words.create_words();
+        para_words
+    }
+
+    // fill self.words from the lines of the paragraph
+    fn create_words<'r>(&'r mut self) {
+        if self.para.mail_header {
+            // no extra spacing for mail headers; always exactly 1 space
+            // safe to trim_left on every line of a mail header, since the
+            // first line is guaranteed not to have any spaces
+            self.words.push_all_move(self.para.lines.iter().flat_map(|x| x.as_slice().trim_left().words()).collect());
+            return;
+        }
+
+        let uniform = self.opts.uniform;
+
+        // first line: crown and tagged mode have the "init" there, so the
+        // text starts after it; otherwise it starts after the indent
+        let first_start =
+            if self.opts.crown || self.opts.tagged {
+                self.para.init_end
+            } else {
+                self.para.indent_end
+            };
+        self.words.push_all_move(
+            WordSplit::new(uniform, self.para.lines.get(0).as_slice().slice_from(first_start)).collect());
+
+        // every further line starts after the paragraph indent
+        if self.para.lines.len() > 1 {
+            let rest_start = self.para.indent_end;
+            self.words.push_all_move(
+                self.para.lines.iter().skip(1)
+                .flat_map(|x| WordSplit::new(uniform, x.as_slice().slice_from(rest_start)))
+                .collect());
+        }
+    }
+
+    // iterate over the words of the paragraph
+    pub fn words(&'a self) -> Items<'a,&'a str> { self.words.iter() }
+}
+
+// Iterator state for splitting one piece of text into words.
+struct WordSplit<'a> {
+    // true: whitespace between words is dropped from the returned slices
+    // (except one character after a sentence end); false: each returned
+    // slice keeps the whitespace that follows the word
+    uniform     : bool,
+    // the text being split, with its leading whitespace removed
+    string      : &'a str,
+    // byte length of `string`; iteration ends once `position` reaches it
+    length      : uint,
+    // byte offset into `string` of the next word to return
+    position    : uint
+}
+
+impl<'a> WordSplit<'a> {
+    // Creates a splitter over `string`, skipping any leading whitespace.
+    fn new<'a>(uniform: bool, string: &'a str) -> WordSplit<'a> {
+        // wordsplits *must* start at a non-whitespace character
+        let trim_string = string.trim_left();
+        // `length` has to describe the trimmed text: every offset the
+        // iterator computes is relative to `trim_string`, so measuring the
+        // untrimmed input would overstate the end by the number of leading
+        // whitespace bytes (yielding an empty word for blank input and
+        // out-of-range slices when the text also ends in whitespace)
+        WordSplit { uniform: uniform, string: trim_string, length: trim_string.len(), position: 0 }
+    }
+
+    // Returns true for the characters that can end a sentence.
+    fn is_punctuation(c: char) -> bool {
+        match c {
+            '!' | '.' | '?' => true,
+            _               => false
+        }
+    }
+}
+
+impl<'a> Iterator<&'a str> for WordSplit<'a> {
+    // Returns the next word of the text, or None once the end is reached.
+    //
+    // In uniform mode the word is returned without the whitespace that
+    // follows it, except that one whitespace character is kept after the end
+    // of a sentence. In non-uniform mode the word is returned together with
+    // all of the whitespace that follows it.
+    fn next(&mut self) -> Option<&'a str> {
+        if self.position >= self.length {
+            return None
+        }
+
+        let old_position = self.position;
+
+        // find the start of the next whitespace segment
+        let ws_start =
+            match self.string.slice_from(old_position).find(|x: char| x.is_whitespace()) {
+                None    => self.length,
+                Some(s) => s + old_position
+            };
+
+        // no whitespace left: the rest of the text is the last word
+        if ws_start == self.length {
+            self.position = self.length;
+            return Some(self.string.slice_from(old_position));
+        }
+
+        // find the end of the next whitespace segment
+        // note that this preserves the invariant that self.position points to
+        // non-whitespace character OR end of string
+        self.position =
+            match self.string.slice_from(ws_start).find(|x: char| ! x.is_whitespace()) {
+                None    => self.length,
+                Some(s) => s + ws_start
+            };
+
+        // byte offset just past the first character of the whitespace segment;
+        // taken from the character itself so that a multi-byte whitespace
+        // character is never cut in half
+        let first_ws_end = self.string.char_range_at(ws_start).next;
+
+        // if the last non-whitespace character is a [?!.] and it is followed
+        // by two or more whitespace characters (i.e. the segment extends past
+        // its first character), this is the end of a sentence
+        let is_sentence_end = match self.string.char_range_at_reverse(ws_start) {
+            CharRange { ch, next: _ } if WordSplit::is_punctuation(ch)  => self.position > first_ws_end,
+                        _                                               => false
+        };
+
+        Some(
+        if self.uniform {
+            // at the end of a sentence keep one extra whitespace character,
+            // otherwise return just the word
+            if is_sentence_end {
+                self.string.slice(old_position, first_ws_end)
+            } else {
+                self.string.slice(old_position, ws_start)
+            }
+        } else {
+            // in non-uniform mode, we just keep the whole thing
+            // eventually we will want to annotate where the sentence boundaries are
+            // so that we can give preference to splitting lines appropriately
+            self.string.slice(old_position, self.position)
+        })
+    }
+}