initial release of working fmt

Note: for now, this version does not use Knuth-Plass,
but everything else is in place with "greedy" breaking.

All options (should) work, and performance is nearly
on par with GNU fmt.

Squashed commit of the following local commits:

commit ebc12f5e7d19d351ada9273ec0c42d66d3730431
commit 125fdabcb2a32de161c7a8b76c3e766a40ff9f76
commit dadd62acc093b5bd4bc94ad4f8a499d2663a7097
commit e436fdaade3876e92020c61a736eba54eb5ca0cf
commit bbc4f4f6ad749753efe9b2df871ddb257f33de4b
commit 12bc4ecb0c56c0d43515a111e9129a4bfaf36531
commit 2e693553ed9af59c53ee13026d19c9f82f2973fc
commit 9b15a130148d62dd6a1d2765848ddc4daf30c649
commit ea335eb2869afcc94709345118fab3fb2e612954
Merge: ee92573 23cc41d
commit 23cc41d188cb3134c04872fd77acb331d86a64ea
commit 2fa7c48133001d86da39feda04d870ff67e88400
commit eb71558ee46654b568adf167f194cb854bbf7056
commit c8baabc0b86d831b5741fa496c312134db652c55
commit ee4fab44b216c1d9c7dcdcdc29ca587c76284834
commit c5444416a531ae1341dddbfd528e4a3ee5f106bf
commit e1177d47941654b8834d18599c80065943a26159
commit c7fb30e2ff32313974f99d34ba4735be064b0cc5
commit 99a9406bc6fff33fc64c190356e48f443312a6c4
commit 3d244d62c9b60b579f2e5b723da6389a5dbc8805
commit 2d4f09cb2ff83664730edba209ec129abdcf1403
commit 947c32b72bff8d50e362555ec21a6b848d5fec9f
commit 8556d2a3467651ee7833ad800876af35a7dd5db7
commit a2e4bc3dc45e5f39b402e6fdd3e19edcea6d3c34
Merge: 0308884 439e65d
commit 03088844f1fd2faca6c3471230730136dd140f35
commit ac80d888649dd1311fdaa68400ea45d52b2e23ab
commit c1d6b36acb7038e14d5b3e1fb6a44614a3351f96
commit 6539b102593aa9d9570df8be99ca1a1bf01ea1f4
commit 439e65d3331936e00fa89a4b2f88c343b9e28c5b
commit fac27de7c4918bc5cf1a1ac1a43550236ba8af4d
commit 365989c5bbe5c2289648f6efbc3c9388388e30a0
commit 3dd71364cce9aaaa773fc88eb206aba31aa61390
This commit is contained in:
kwantam 2014-06-18 20:43:26 -04:00
parent 6039626490
commit ac216c3d77
3 changed files with 813 additions and 0 deletions

250
fmt/fmt.rs Normal file
View file

@ -0,0 +1,250 @@
#![crate_id(name="fmt", vers="0.0.1", author="kwantam")]
/*
* This file is part of `fmt` from the uutils coreutils package.
*
* (c) kwantam <kwantam@gmail.com>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
#![feature(macro_rules)]
extern crate core;
extern crate getopts;
extern crate libc;
use std::io::{BufferedReader, BufferedWriter, File, IoResult};
use std::io::stdio::{stdin_raw, stdout_raw, stdout};
use std::os;
use linebreak::break_simple;
use parasplit::{ParagraphStream, ParaWords};
// silent_unwrap!(expr): evaluates a Result-valued expression and throws away
// the success value. If the result is Err, the process is terminated at once
// through libc::exit with status 1 and no diagnostic is printed.
#[macro_export]
macro_rules! silent_unwrap(
($exp:expr) => (
match $exp {
// success: nothing to do, the value is discarded
Ok(_) => (),
// failure: exit quietly with status 1
Err(_) => unsafe { ::libc::exit(1) }
}
)
)
#[path = "../common/util.rs"]
mod util;
mod linebreak;
mod parasplit;
// program's NAME and VERSION are used for -V and -h
static NAME: &'static str = "fmt";
static VERSION: &'static str = "0.0.1";
// all of the formatting options, filled in by uumain from the command line
// and then shared read-only with the paragraph splitter and line breaker
struct FmtOptions {
crown : bool, // -c: crown-margin mode
tagged : bool, // -t: tagged-paragraph mode (cleared again when -c or -s is given)
mail : bool, // -m: try to detect and preserve mail headers
split_only : bool, // -s: split long lines only, never join lines (also clears crown and tagged)
use_prefix : bool, // true when -p PREFIX was given
prefix : String, // the PREFIX given with -p (empty otherwise)
xprefix : bool, // -x: PREFIX must match with no preceding whitespace
prefix_len : uint, // length of PREFIX in characters
use_anti_prefix : bool, // true when -P PSKIP was given
anti_prefix : String, // the PSKIP given with -P (empty otherwise)
xanti_prefix: bool, // -X: PSKIP must match with no preceding whitespace
uniform : bool, // -u: uniform spacing
width : uint, // -w: maximum output width, default 78
goal : uint, // -g: goal width, default 72
tabwidth : uint, // -T: columns per tab for length calculations, default 8, forced to at least 1
}
// binary entry point: run uumain on the process arguments and register the
// value it returns as the exit status of the process
#[allow(dead_code)]
fn main() {
    let status = uumain(os::args());
    os::set_exit_status(status)
}
// Entry point of the fmt utility.
//
// `args` is the full argument vector; args[0] is used as the program name in
// messages and the remaining elements are parsed as options and file names.
// Every named file ("-" meaning stdin; stdin is also used when no file is
// named) is split into paragraphs, each paragraph is re-broken with
// break_simple, and the result is written to stdout.
//
// Returns 0 after formatting, or after printing the help/version text.
// Invalid options are reported through crash! with status 1 (crash! comes
// from util.rs, which is not shown here).
fn uumain(args: Vec<String>) -> int {
let opts = [
getopts::optflag("c", "crown-margin", "First and second line of paragraph may have different indentations, in which case the first line's indentation is preserved, and each subsequent line's indentation matches the second line."),
getopts::optflag("t", "tagged-paragraph", "Like -c, except that the first and second line of a paragraph *must* have different indentation or they are treated as separate paragraphs."),
getopts::optflag("m", "preserve-headers", "Attempt to detect and preserve mail headers in the input. Be careful when combining this flag with -p."),
getopts::optflag("s", "split-only", "Split lines only, do not reflow."),
getopts::optflag("u", "uniform-spacing", "Insert exactly one space between words, and two between sentences. Sentence breaks in the input are detected as [?!.] followed by two spaces or a newline; other punctuation is not interpreted as a sentence break."),
getopts::optopt("p", "prefix", "Reformat only lines beginning with PREFIX, reattaching PREFIX to reformatted lines. Unless -x is specified, leading whitespace will be ignored when matching PREFIX.", "PREFIX"),
getopts::optopt("P", "skip-prefix", "Do not reformat lines beginning with PSKIP. Unless -X is specified, leading whitespace will be ignored when matching PSKIP", "PSKIP"),
getopts::optflag("x", "exact-prefix", "PREFIX must match at the beginning of the line with no preceding whitespace."),
getopts::optflag("X", "exact-skip-prefix", "PSKIP must match at the beginning of the line with no preceding whitespace."),
getopts::optopt("w", "width", "Fill output lines up to a maximum of WIDTH columns, default 78.", "WIDTH"),
getopts::optopt("g", "goal", "Goal width, default ~0.92*WIDTH. Must be less than WIDTH.", "GOAL"),
getopts::optopt("T", "tab-width", "Treat tabs as TABWIDTH spaces for determining line length, default 8. Note that this is used only for calculating line lengths; tabs are preserved in the output.", "TABWIDTH"),
getopts::optflag("V", "version", "Output version information and exit."),
getopts::optflag("h", "help", "Display this help message and exit.")
];
// args[0] is the program name, so only the tail is parsed
let matches = match getopts::getopts(args.tail(), opts.as_slice()) {
Ok(m) => m,
Err(f) => crash!(1, "{}\nTry `{} --help' for more information.", f, args.get(0))
};
// -h prints the usage text and then falls through to the version banner
if matches.opt_present("h") {
print_usage(args.get(0).as_slice(), opts.as_slice(), "");
}
if matches.opt_present("V") || matches.opt_present("h") {
println!("uutils {} v{}", NAME, VERSION);
return 0
}
// defaults; overwritten below by whatever options are present
let mut fmt_opts = FmtOptions { crown : false
, tagged : false
, mail : false
, uniform : false
, split_only : false
, use_prefix : false
, prefix : String::new()
, xprefix : false
, prefix_len : 0
, use_anti_prefix : false
, anti_prefix : String::new()
, xanti_prefix: false
, width : 78
, goal : 72
, tabwidth : 8
};
// the order of these checks sets the precedence between modes:
// -c overrides -t, and -s overrides both -c and -t
if matches.opt_present("t") { fmt_opts.tagged = true; }
if matches.opt_present("c") { fmt_opts.crown = true; fmt_opts.tagged = false; }
if matches.opt_present("m") { fmt_opts.mail = true; }
if matches.opt_present("u") { fmt_opts.uniform = true; }
if matches.opt_present("s") { fmt_opts.split_only = true; fmt_opts.crown = false; fmt_opts.tagged = false; }
if matches.opt_present("x") { fmt_opts.xprefix = true; }
if matches.opt_present("X") { fmt_opts.xanti_prefix = true; }
// the prefix length is counted in characters, not bytes
match matches.opt_str("p") {
Some(s) => { fmt_opts.prefix = s; fmt_opts.use_prefix = true; fmt_opts.prefix_len = fmt_opts.prefix.as_slice().char_len() },
None => ()
};
match matches.opt_str("P") {
Some(s) => { fmt_opts.anti_prefix = s; fmt_opts.use_anti_prefix = true; },
None => ()
};
// an explicit WIDTH also resets the goal to min(92% of WIDTH, WIDTH - 4)
// NOTE(review): `width - 4` is an unsigned subtraction; for WIDTH < 4 this
// relies on the min() picking the other operand -- confirm
match matches.opt_str("w") {
Some(s) => { fmt_opts.width = match from_str(s.as_slice()) {
Some(t) => t,
None => { crash!(1, "Invalid WIDTH specification: `{}'", s); }
};
fmt_opts.goal = std::cmp::min(fmt_opts.width * 92 / 100, fmt_opts.width - 4);
},
None => ()
};
// an explicit GOAL without WIDTH derives the width from the goal instead;
// with both given, the goal may not exceed the width
match matches.opt_str("g") {
Some(s) => { fmt_opts.goal = match from_str(s.as_slice()) {
Some(t) => t,
None => { crash!(1, "Invalid GOAL specification: `{}'", s); }
};
if ! matches.opt_present("w") {
fmt_opts.width = std::cmp::max(fmt_opts.goal * 100 / 92, fmt_opts.goal + 4);
} else if fmt_opts.goal > fmt_opts.width {
crash!(1, "GOAL cannot be greater than WIDTH.");
}
},
None => ()
};
match matches.opt_str("T") {
Some(s) => fmt_opts.tabwidth = match from_str(s.as_slice()) {
Some(t) => t,
None => { crash!(1, "Invalid TABWIDTH specification: `{}'", s); }
},
None => ()
};
// a tab width of 0 is bumped to 1 (the line parser computes tabwidth - 1)
if fmt_opts.tabwidth < 1 {
fmt_opts.tabwidth = 1;
}
// immutable now
let fmt_opts = fmt_opts;
// with no file arguments, read stdin
let mut files = matches.free;
if files.is_empty() {
files.push("-".to_string());
}
let mut ostream = box BufferedWriter::new(stdout_raw()) as Box<Writer>;
for i in files.iter().map(|x| x.as_slice()) {
// a file that cannot be opened is reported and skipped; the others are still processed
let mut fp = match open_file(i) {
Err(e) => { show_warning!("{}: {}",i,e);
continue;
}
Ok(f) => f
};
let mut pStream = ParagraphStream::new(&fmt_opts, &mut fp);
for paraResult in pStream {
match paraResult {
// Err carries a line that must not be reformatted: copy it through unchanged
Err(s) => silent_unwrap!(ostream.write(s.as_bytes())),
Ok(para) => {
// indent
let pIndent = para.pfxind_str.clone().append(fmt_opts.prefix.as_slice()).append(para.indent_str.as_slice());
let pIndentLen = para.pfxind_len + fmt_opts.prefix_len + para.indent_len;
// words
let pWords = ParaWords::new(&fmt_opts, &para);
let mut pWords_words = pWords.words().map(|&x| x);
// print the init, if it exists, and get its length
let pInitLen =
if fmt_opts.crown || fmt_opts.tagged {
// handle "init" portion
silent_unwrap!(ostream.write(para.init_str.as_bytes()));
para.init_len
} else if ! para.mail_header {
// for non-(crown, tagged) that's the same as a normal indent
silent_unwrap!(ostream.write(pIndent.as_bytes()));
pIndentLen
} else {
// except that mail headers get no indent at all
0
};
// does this paragraph require uniform spacing?
let uniform = para.mail_header || fmt_opts.uniform;
break_simple(&mut pWords_words, fmt_opts.width, pIndent.as_slice(), pIndentLen, pInitLen, uniform, &mut ostream);
silent_unwrap!(ostream.write("\n".as_bytes()));
}
}
}
// flush the output after each file
silent_unwrap!(ostream.flush());
}
0
}
// Prints the help text to stdout.
//
// The one-line synopsis from getopts::short_usage is wrapped by break_simple
// at 64 columns; continuation lines get a hanging indent of 7 spaces. After
// that the full option descriptions are printed, followed by `errmsg`.
//
//   arg0   - program name shown in the synopsis
//   opts   - the option table to describe
//   errmsg - extra text appended after the option descriptions (may be empty)
fn print_usage(arg0: &str, opts: &[getopts::OptGroup], errmsg: &str) {
    // the indent string must be exactly as wide as the indent length (7) that
    // break_simple uses for its column accounting, otherwise wrapped lines are
    // indented by less than the width that was reserved for them
    break_simple(&mut getopts::short_usage(arg0, opts).as_slice().words(), 64, "       ", 7, 0, true, &mut(box stdout() as Box<Writer>));
    println!("\n\n{}{}", getopts::usage("Reformat paragraphs from input files (or stdin) to stdout.", opts), errmsg);
}
// uniform interface for opening files
// since we don't need seeking
type FileOrStdReader = BufferedReader<Box<Reader>>;
fn open_file(filename: &str) -> IoResult<FileOrStdReader> {
if filename == "-" {
Ok(BufferedReader::new(box stdin_raw() as Box<Reader>))
} else {
match File::open(&Path::new(filename)) {
Ok(f) => Ok(BufferedReader::new(box f as Box<Reader>)),
Err(e) => return Err(e)
}
}
}

33
fmt/linebreak.rs Normal file
View file

@ -0,0 +1,33 @@
/*
* This file is part of `fmt` from the uutils coreutils package.
*
* (c) kwantam <kwantam@gmail.com>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
// break_simple implements the "tight" (greedy) breaking algorithm: print words
// until maxlen would be exceeded, then print a linebreak and indent and continue.
// Note that any first line indent should already have been printed before
// calling this function, and the length of said indent should be passed as
// init_len.
//
//   s          - iterator over the words to print
//   maxlen     - maximum line length
//   indent_str - string written after every linebreak this function emits
//   indent_len - length to account for indent_str
//   init_len   - length already used on the first line
//   uniform    - when true, a single space is written after every word
//   ostream    - where the output goes
//
// Returns the value accumulated over all words by accum_words_simple, that
// is, the running line length after the last word (init_len if there were no
// words). No trailing newline is written.
pub fn break_simple<'a, T: Iterator<&'a str>>(s: &'a mut T, maxlen: uint, indent_str: &'a str, indent_len: uint, init_len: uint, uniform: bool, ostream: &mut Box<Writer>) -> uint {
// the fold accumulator is the current line length
s.fold(init_len, |l, w| accum_words_simple(maxlen, indent_len, indent_str, ostream, uniform, l, w))
}
fn accum_words_simple(maxlen: uint, indent_len: uint, indent_str: &str, ostream: &mut Box<Writer>, uniform: bool, l: uint, w: &str) -> uint {
let wlen = w.len();
let lnew =
if l + wlen > maxlen {
silent_unwrap!(ostream.write("\n".as_bytes()));
silent_unwrap!(ostream.write(indent_str.as_bytes()));
indent_len
} else {
l
};
silent_unwrap!(ostream.write(w.as_bytes()));
if uniform { silent_unwrap!(ostream.write(" ".as_bytes())); }
lnew + wlen + 1
}

530
fmt/parasplit.rs Normal file
View file

@ -0,0 +1,530 @@
/*
* This file is part of `fmt` from the uutils coreutils package.
*
* (c) kwantam <kwantam@gmail.com>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
use core::iter::Peekable;
use std::io::Lines;
use std::slice::Items;
use std::str::CharRange;
use FileOrStdReader;
use FmtOptions;
// one input line, classified by FileLines:
// lines with PSKIP, lacking PREFIX, or which are entirely blank are
// NoFormatLines; otherwise, they are FormatLines
#[deriving(Show)]
enum Line {
// a line to be reformatted, together with its indent/prefix offsets
FormatLine(FileLine),
// text to be passed through unchanged; the bool is true when the line was
// blank, which ParagraphStream uses to re-enable mail header detection
NoFormatLine(String, bool)
}
impl Line {
    // unwraps the FileLine of a FormatLine; for use where the variant has
    // already been checked (as in the ParagraphStream iterator). Fails on a
    // NoFormatLine.
    fn get_fileline(self) -> FileLine {
        match self {
            NoFormatLine(..) => fail!("Found NoFormatLine when expecting FormatLine"),
            FormatLine(file_line) => file_line
        }
    }

    // unwraps the (text, was_blank) pair of a NoFormatLine; for use where the
    // variant has already been checked (as in the ParagraphStream iterator).
    // Fails on a FormatLine.
    fn get_noformatline(self) -> (String, bool) {
        match self {
            FormatLine(..) => fail!("Found FormatLine when expecting NoFormatLine"),
            NoFormatLine(text, was_blank) => (text, was_blank)
        }
    }
}
// a line that is to be reformatted, plus the offsets and display lengths of
// its prefix and indentation. Each line's prefix has to be considered to know
// whether to merge it with the next line or not.
// The *_end fields are byte offsets into `line`; the *_len fields are display
// lengths in which a tab counts as TABWIDTH columns.
#[deriving(Show)]
struct FileLine {
line : String,
indent_end : uint, // the end of the indent, always the start of the text
prefix_end : uint, // the end of the PREFIX
pfxind_end : uint, // the end of the PREFIX's indent, that is, the spaces before the prefix
indent_len : uint, // display length of indent taking into account TABWIDTH
pfxind_len : uint, // PREFIX indent length taking into account TABWIDTH
}
// iterator that produces a stream of Lines from a file: every line read from
// `lines` is classified as FormatLine or NoFormatLine according to `opts`
struct FileLines<'a> {
opts : &'a FmtOptions, // the formatting options (prefix, skip-prefix, tabwidth, uniform)
lines : Lines<'a, FileOrStdReader>, // the underlying line reader
}
impl<'a> FileLines<'a> {
    // wraps a line reader together with the options used to classify its lines
    fn new<'a>(opts: &'a FmtOptions, lines: Lines<'a, FileOrStdReader>) -> FileLines<'a> {
        FileLines { opts: opts, lines: lines }
    }

    // PREFIX check: the bool is true if this line should be formatted, the
    // uint is the byte offset at which PREFIX starts. Without -p every line
    // qualifies, at offset 0.
    fn match_prefix(&self, line: &str) -> (bool, uint) {
        if self.opts.use_prefix {
            FileLines::match_prefix_generic(self.opts.prefix.as_slice(), line, self.opts.xprefix)
        } else {
            (true, 0u)
        }
    }

    // PSKIP check: returns true if this line should be formatted, that is,
    // if -P is not in use or the line does NOT start with PSKIP
    fn match_anti_prefix(&self, line: &str) -> bool {
        if ! self.opts.use_anti_prefix { return true; }
        let (skip_hit, _) = FileLines::match_prefix_generic(self.opts.anti_prefix.as_slice(), line, self.opts.xanti_prefix);
        ! skip_hit
    }

    // looks for `pfx` at the start of `line`; unless `exact` is set, leading
    // whitespace in front of `pfx` is tolerated. Returns (true, byte offset of
    // the match) on success and (false, 0) otherwise.
    fn match_prefix_generic(pfx: &str, line: &str, exact: bool) -> (bool, uint) {
        if line.starts_with(pfx) {
            return (true, 0);
        }
        if exact {
            return (false, 0);
        }
        // step over the leading whitespace one character at a time (rather
        // than one byte at a time, so that unicode whitespace is supported)
        // and retry the match after each one
        let mut pos = 0u;
        while pos < line.len() {
            let CharRange { ch, next } = line.char_range_at(pos);
            if ! ch.is_whitespace() {
                break;
            }
            pos = next;
            if line.slice_from(pos).starts_with(pfx) {
                return (true, pos);
            }
        }
        (false, 0)
    }
}
impl<'a> Iterator<Line> for FileLines<'a> {
    // Reads and classifies the next input line.
    //
    // Returns None at end of input and also when reading a line fails.
    // Otherwise:
    //   - a line consisting only of whitespace yields NoFormatLine("\n", true)
    //   - a line that lacks PREFIX or starts with PSKIP yields
    //     NoFormatLine(line, false), with the line unmodified
    //   - any other line yields a FormatLine whose trailing newline has been
    //     replaced by a space and whose offsets/lengths have been computed
    fn next(&mut self) -> Option<Line> {
        let mut n =
            match self.lines.next() {
                Some(t) => match t {
                    Ok(tt) => tt,
                    Err(_) => return None
                },
                None => return None
            };

        // if this line is entirely whitespace,
        // emit a blank line
        // the `true` in NoFormatLine indicates that this was a linebreak,
        // which is important to know when detecting mail headers
        if n.as_slice().is_whitespace() {
            return Some(NoFormatLine("\n".to_string(), true));
        }

        // if this line does not match the prefix,
        // emit the line unprocessed and iterate again
        let (pmatch, poffset) = self.match_prefix(n.as_slice());
        if ! pmatch {
            return Some(NoFormatLine(n, false));
        }

        // if this line matches the anti_prefix
        // (NOTE definition of match_anti_prefix is TRUE if we should process)
        if ! self.match_anti_prefix(n.as_slice()) {
            return Some(NoFormatLine(n, false));
        }

        // replace trailing newline, if any, with space
        let CharRange {ch, next: i} = n.as_slice().char_range_at_reverse(n.len());
        if ch == '\n' {
            // SAFETY: '\n' is a single byte and is overwritten by the single
            // byte ' ', so the string stays valid UTF-8
            unsafe {
                let nmut = n.as_mut_bytes();
                nmut[i] = ' ' as u8;
            }
            // a line ending in '.' gets a second space so that the end of the
            // line reads as "period followed by two spaces"
            // NOTE(review): only '.' is handled here, although WordSplit also
            // treats '!' and '?' as sentence punctuation -- confirm intent
            if i > 0 {
                let CharRange {ch, next: _} = n.as_slice().char_range_at_reverse(i);
                if ch == '.' {
                    n.push_char(' ');
                }
            }
        }

        let nLen = n.len();
        // figure out the indent, prefix, and prefixindent ending points
        let (indEnd, pfxEnd, pfxIndEnd) =
            if self.opts.use_prefix {
                let pfxEnd = poffset + self.opts.prefix.len();
                let nSlice = n.as_slice().slice_from(pfxEnd);
                let nSlice2 = nSlice.trim_left();
                (pfxEnd + nSlice.len() - nSlice2.len(), pfxEnd, poffset)
            } else {
                let nSlice = n.as_slice().trim_left();
                (nLen - nSlice.len(), 0, 0)
            };

        // indent length: one column per character, plus the extra columns
        // that each tab occupies
        let indLen =
            if indEnd > 0 {
                let nSlice = n.as_slice().slice(pfxEnd, indEnd);
                nSlice.char_len() + (self.opts.tabwidth - 1) * nSlice.chars().filter(|x| x == &'\t').count()
            } else {
                0
            };

        // prefix indent length
        let pfxIndLen =
            if pfxIndEnd > 0 {
                let nSlice = n.as_slice().slice_to(pfxIndEnd);
                nSlice.char_len() + (self.opts.tabwidth - 1) * nSlice.chars().filter(|x| x == &'\t').count()
            } else {
                0
            };

        // if we are in uniform mode, all tabs after the indent should be replaced by spaces.
        // NOTE that in this implementation, [?!.]\t is NOT detected as a sentence break, but
        // [?!.]\t\t is. We could expand tabs to two spaces to force detection of tab as
        // sentence ending
        if self.opts.uniform {
            // char_indices() counts from the start of the slice it is called
            // on, so indEnd has to be added back to obtain offsets into the
            // whole line; without it the wrong bytes would be overwritten
            let tabinds: Vec<uint> = n.as_slice().slice_from(indEnd).char_indices().filter_map(|(i,c)| if c == '\t' { Some(i + indEnd) } else { None }).collect();
            // SAFETY: every index points at a '\t', a single byte, which is
            // overwritten by the single byte ' '; the string stays valid UTF-8
            unsafe {
                let nmut = n.as_mut_bytes();
                for i in tabinds.iter() {
                    nmut[*i] = ' ' as u8;
                }
            }
        }

        Some(FormatLine(FileLine { line: n
                                 , indent_end: indEnd
                                 , prefix_end: pfxEnd
                                 , pfxind_end: pfxIndEnd
                                 , indent_len: indLen
                                 , pfxind_len: pfxIndLen
                                 }))
    }
}
// a paragraph : a collection of FileLines that are to be formatted
// plus info about the paragraph's indentation
// (but we only retain the String from the FileLine; the other info
// is only there to help us in deciding how to merge lines into Paragraphs)
#[deriving(Show)]
pub struct Paragraph {
lines : Vec<String>, // the lines of the file
pub init_str : String, // string representing the init, that is, the first line's indent
pub init_len : uint, // printable length of the init string considering TABWIDTH
init_end : uint, // byte location of end of init in first line String
pub indent_str : String, // string representing indent
pub indent_len : uint, // length of above
indent_end : uint, // byte location of end of indent (in crown and tagged mode, only applies to 2nd line and onward)
pub pfxind_str : String, // string representing the prefix indent
pub pfxind_len : uint, // length of above
pub mail_header : bool // we need to know if this is a mail header because we do word splitting differently in that case
}
// an iterator producing a stream of paragraphs from a stream of lines
// given a set of options.
// NOTE as you iterate through the paragraphs, any NoFormatLines are yielded
// one at a time as Err(text); the consumer is expected to write that text
// out unchanged. Paragraphs to be formatted are yielded as Ok(Paragraph).
pub struct ParagraphStream<'a> {
lines : Peekable<Line,FileLines<'a>>, // classified input lines, with one line of lookahead
next_mail : bool, // true if the next paragraph may be detected as a mail header
opts : &'a FmtOptions, // the formatting options
}
impl<'a> ParagraphStream<'a> {
    // builds a paragraph stream over the lines of `reader`
    pub fn new<'a>(opts: &'a FmtOptions, reader: &'a mut FileOrStdReader) -> ParagraphStream<'a> {
        // at the beginning of the file, we might find mail headers,
        // so next_mail starts out true
        ParagraphStream {
            lines: FileLines::new(opts, reader.lines()).peekable(),
            next_mail: true,
            opts: opts
        }
    }

    // detect RFC822 mail header
    //
    // a mail header line has no indent and begins with either "From "
    // (envelope sender line) or with a nonempty sequence of printable ASCII
    // chars (33 to 126, inclusive, except colon) followed by a colon.
    fn is_mail_header(line: &FileLine) -> bool {
        if line.indent_end > 0 {
            return false;
        }
        let text = line.line.as_slice();
        if text.starts_with("From ") {
            return true;
        }
        match text.find(':') {
            // no colon at all, or an empty header field name
            None | Some(0) => false,
            // everything in front of the first colon must be printable ASCII
            Some(colon) => text.slice_to(colon).chars().all(|c| {
                let code = c as uint;
                code >= 33 && code <= 126
            })
        }
    }
}
impl<'a> Iterator<Result<Paragraph,String>> for ParagraphStream<'a> {
    // Produces the next item of the stream.
    //
    // Returns None when the input is exhausted. If the next line is a
    // NoFormatLine, its text is returned as Err so that the caller can output
    // it unchanged. Otherwise consecutive FormatLines are collected into one
    // Paragraph for as long as their prefix and indentation agree (subject to
    // the crown, tagged, mail and split-only options), returned as Ok.
    fn next(&mut self) -> Option<Result<Paragraph,String>> {
        // return a NoFormatLine in an Err; it should immediately be output
        let noformat =
            match self.lines.peek() {
                None => return None,
                Some(l) => match l {
                    &FormatLine(_) => false,
                    &NoFormatLine(_, _) => true
                }
            };

        // found a NoFormatLine, immediately dump it out
        if noformat {
            let (s, nm) = self.lines.next().unwrap().get_noformatline();
            self.next_mail = nm;
            return Some(Err(s));
        }

        // found a FormatLine, now build a paragraph
        let mut init_str = String::new();
        let mut init_end = 0;
        let mut init_len = 0;
        let mut indent_str = String::new();
        let mut indent_end = 0;
        let mut indent_len = 0;
        let mut pfxind_str = String::new();
        let mut pfxind_len = 0;
        let mut pLines = Vec::new();
        let mut in_mail = false;
        let mut second_done = false; // for when we use crown or tagged mode
        loop {
            { // peek ahead
                // need to explicitly force fl out of scope before we can call self.lines.next()
                let fl =
                    match self.lines.peek() {
                        None => break,
                        Some(l) => {
                            match l {
                                &FormatLine(ref x) => x,
                                &NoFormatLine(..) => break
                            }
                        }
                    };

                if pLines.len() == 0 {
                    // first time through the loop, get things set up
                    // detect mail header
                    if self.opts.mail && self.next_mail && ParagraphStream::is_mail_header(fl) {
                        in_mail = true;
                        // there can't be any indent or pfxind because otherwise is_mail_header would fail
                        // since there cannot be any whitespace before the colon in a valid header field
                        // (continuation lines of a header are indented by two
                        // spaces; the string must be as wide as indent_len)
                        indent_str.push_str("  ");
                        indent_len = 2;
                    } else {
                        if self.opts.crown || self.opts.tagged {
                            init_str.push_str(fl.line.as_slice().slice_to(fl.indent_end));
                            init_len = fl.indent_len + fl.pfxind_len + self.opts.prefix_len;
                            init_end = fl.indent_end;
                        }

                        // these will be overwritten in the 2nd line of crown or tagged mode, but
                        // we are not guaranteed to get to the 2nd line, e.g., if the next line
                        // is a NoFormatLine or None. Thus, we set sane defaults the 1st time around
                        indent_str.push_str(fl.line.as_slice().slice(fl.prefix_end,fl.indent_end));
                        indent_len = fl.indent_len;
                        indent_end = fl.indent_end;

                        // in tagged mode, add 4 spaces of additional indenting by default
                        // (gnu fmt's behavior is different: it seems to find the closest column to
                        // indent_end that is divisible by 3. But honestly that behavior seems
                        // pretty arbitrary.
                        // Perhaps a better default would be 1 TABWIDTH? But ugh that's so big.)
                        if self.opts.tagged {
                            // exactly four spaces, matching the 4 added to indent_len
                            indent_str.push_str("    ");
                            indent_len += 4;
                        }

                        if self.opts.use_prefix {
                            pfxind_str.push_str(fl.line.as_slice().slice_to(fl.pfxind_end));
                            pfxind_len = fl.pfxind_len;
                        }
                    }
                } else if in_mail {
                    // lines following mail headers must begin with spaces
                    if (self.opts.use_prefix && fl.pfxind_end == 0) || (! self.opts.use_prefix && fl.indent_end == 0) {
                        break; // this line does not begin with spaces
                    }
                } else if ! second_done && (self.opts.crown || self.opts.tagged) {
                    // now we have enough info to handle crown margin and tagged mode
                    if pfxind_len != fl.pfxind_len {
                        // in both crown and tagged modes we require that pfxind is the same
                        break;
                    } else if self.opts.tagged && (indent_end == fl.indent_end) {
                        // in tagged mode, indent also has to be different
                        break;
                    } else {
                        // this is part of the same paragraph, get the indent info from this line
                        indent_str.clear();
                        indent_str.push_str(fl.line.as_slice().slice(fl.prefix_end,fl.indent_end));
                        indent_len = fl.indent_len;
                        indent_end = fl.indent_end;
                    }
                    second_done = true;
                } else {
                    // detect mismatch
                    if (indent_end != fl.indent_end) || (indent_len != fl.indent_len) || (pfxind_len != fl.pfxind_len) {
                        break;
                    }
                }
            }

            // the peeked line belongs to this paragraph: consume it
            pLines.push(self.lines.next().unwrap().get_fileline().line);

            // when we're in split-only mode, we never join lines, so stop here
            if self.opts.split_only {
                break;
            }
        }

        // if this was a mail header, then the next line can be detected as one. Otherwise, it cannot.
        // NOTE next_mail is true at ParagraphStream instantiation, and is set to true after a blank
        // NoFormatLine.
        self.next_mail = in_mail;

        Some(Ok(Paragraph { lines: pLines
                          , init_str: init_str
                          , init_len: init_len
                          , init_end: init_end
                          , indent_str: indent_str
                          , indent_len: indent_len
                          , indent_end: indent_end
                          , pfxind_str: pfxind_str
                          , pfxind_len: pfxind_len
                          , mail_header: in_mail
                          }))
    }
}
// the words of one paragraph, as string slices borrowed from the paragraph's
// lines; the list is built once, when the ParaWords is constructed
pub struct ParaWords<'a> {
opts : &'a FmtOptions, // the formatting options (crown, tagged and uniform are consulted)
para : &'a Paragraph, // the paragraph the words are taken from
words : Vec<&'a str> // the words, in paragraph order
}
impl<'a> ParaWords<'a> {
    // splits `para` into words according to `opts`
    pub fn new<'a>(opts: &'a FmtOptions, para: &'a Paragraph) -> ParaWords<'a> {
        let mut pw = ParaWords { opts: opts, para: para, words: Vec::new() };
        pw.create_words();
        pw
    }

    // fills self.words from the lines of the paragraph
    fn create_words<'r>(&'r mut self) {
        let para = self.para;

        if para.mail_header {
            // no extra spacing for mail headers; always exactly 1 space
            // safe to trim_left on every line of a mail header, since the
            // first line is guaranteed not to have any spaces
            let header_words: Vec<&'a str> =
                para.lines.iter().flat_map(|ln| ln.as_slice().trim_left().words()).collect();
            self.words.push_all_move(header_words);
            return;
        }

        let uniform = self.opts.uniform;

        // first line: crown and tagged mode has the "init" in the first line,
        // so slice from there; otherwise we slice from the indent
        let first_start =
            if self.opts.crown || self.opts.tagged {
                para.init_end
            } else {
                para.indent_end
            };
        let first_words: Vec<&'a str> =
            WordSplit::new(uniform, para.lines.get(0).as_slice().slice_from(first_start)).collect();
        self.words.push_all_move(first_words);

        // every following line starts its text at the paragraph's indent_end
        if para.lines.len() > 1 {
            let rest_start = para.indent_end;
            let rest_words: Vec<&'a str> =
                para.lines.iter().skip(1)
                    .flat_map(|ln| WordSplit::new(uniform, ln.as_slice().slice_from(rest_start)))
                    .collect();
            self.words.push_all_move(rest_words);
        }
    }

    // iterator over the words of the paragraph
    pub fn words(&'a self) -> Items<'a,&'a str> { self.words.iter() }
}
// iterator that cuts a string into words; see its Iterator impl for how much
// of the whitespace following a word is kept with that word
struct WordSplit<'a> {
uniform : bool, // uniform spacing mode: whitespace after each word is cut off
string : &'a str, // the text being split, with leading whitespace trimmed
length : uint, // byte offset at which iteration stops
position : uint // byte offset of the start of the next word
}
impl<'a> WordSplit<'a> {
    // Creates a word splitter over `string`; `uniform` selects uniform
    // spacing mode. Leading whitespace of `string` is skipped.
    fn new<'a>(uniform: bool, string: &'a str) -> WordSplit<'a> {
        // wordsplits *must* start at a non-whitespace character
        let trim_string = string.trim_left();
        // the length has to be that of the trimmed string, since every offset
        // used during iteration indexes into the trimmed string; using the
        // untrimmed length would allow slicing past its end whenever leading
        // whitespace was removed
        WordSplit { uniform: uniform, string: trim_string, length: trim_string.len(), position: 0 }
    }

    // true for the characters that can end a sentence: '!', '.' and '?'
    fn is_punctuation(c: char) -> bool {
        match c {
            '!' | '.' | '?' => true,
            _ => false
        }
    }
}
impl<'a> Iterator<&'a str> for WordSplit<'a> {
    // Returns the next word, or None once the whole string has been consumed.
    //
    // In non-uniform mode the word is returned together with all of the
    // whitespace that follows it. In uniform mode the whitespace is cut off,
    // except that one whitespace character is kept after a word that ends a
    // sentence, i.e. one that ends in [?!.] and is followed by two or more
    // bytes of whitespace. The last word of the string is returned as-is.
    fn next(&mut self) -> Option<&'a str> {
        if self.position >= self.length {
            return None
        }

        let old_position = self.position;

        // find the start of the next whitespace segment
        let ws_start =
            match self.string.slice_from(old_position).find(|x: char| x.is_whitespace()) {
                None => self.length,
                Some(s) => s + old_position
            };

        // no whitespace left: the rest of the string is the final word
        if ws_start == self.length {
            self.position = self.length;
            return Some(self.string.slice_from(old_position));
        }

        // find the end of the next whitespace segment
        // note that this preserves the invariant that self.position points to
        // non-whitespace character OR end of string
        self.position =
            match self.string.slice_from(ws_start).find(|x: char| ! x.is_whitespace()) {
                None => self.length,
                Some(s) => s + ws_start
            };

        // a sentence ends where [?!.] is followed by two or more spaces; the
        // threshold is two (not three) so that "word.  next" and a '.' at the
        // end of an input line, which is padded to two spaces by the line
        // reader, are both recognized
        let is_sentence_end = match self.string.char_range_at_reverse(ws_start) {
            CharRange { ch, next: _ } if WordSplit::is_punctuation(ch) => self.position - ws_start >= 2,
            _ => false
        };

        Some(
            if self.uniform {
                // if the last non-whitespace character is a [?!.] and
                // there are two or more spaces, this is the end of a
                // sentence, so keep one extra space.
                if is_sentence_end {
                    // cut after the first whitespace *character*; adding 1 to
                    // the byte offset would split a multi-byte whitespace char
                    let CharRange { ch: _, next: ws_one } = self.string.char_range_at(ws_start);
                    self.string.slice(old_position, ws_one)
                } else {
                    self.string.slice(old_position, ws_start)
                }
            } else {
                // in non-uniform mode, we just keep the whole thing
                // eventually we will want to annotate where the sentence boundaries are
                // so that we can give preference to splitting lines appropriately
                self.string.slice(old_position, self.position)
            })
    }
}