From 2ab586459b0a9a34ce461fd9e268fd8b85c4188f Mon Sep 17 00:00:00 2001 From: polyphemus Date: Sun, 8 Jun 2014 23:51:22 +0200 Subject: [PATCH 1/6] Add initial cut support, only bytes cutting --- cut/cut.rs | 285 ++++++++++++++++++++++++++++++++++++++++++++++++++ cut/ranges.rs | 108 +++++++++++++++++++ 2 files changed, 393 insertions(+) create mode 100644 cut/cut.rs create mode 100644 cut/ranges.rs diff --git a/cut/cut.rs b/cut/cut.rs new file mode 100644 index 000000000..2055dee1b --- /dev/null +++ b/cut/cut.rs @@ -0,0 +1,285 @@ +#![crate_id(name="cut", vers="1.0.0", author="Rolf Morel")] +#![feature(macro_rules)] + +extern crate getopts; +extern crate libc; + +use std::os; +use std::io::{print,stdin,stdout,File,BufferedWriter,BufferedReader}; +use getopts::{optopt, optflag, getopts, usage}; + +use ranges::Range; + +#[path = "../common/util.rs"] +mod util; +mod ranges; + +static NAME: &'static str = "cut"; +static VERSION: &'static str = "1.0.0"; + +struct Options { + out_delim: Option, +} + +struct FieldOptions { + delimiter: char, + out_delimeter: String, + only_delimited: bool, +} + +enum Mode { + Bytes(Vec, Options), + Characters(Vec, Options), + Fields(Vec, FieldOptions), +} + +fn list_to_ranges(list: &str, complement: bool) -> Result, String> { + use std::uint; + + let mut range_vec = { + try!( + if complement { + Range::from_list(list).map(|r| ranges::complement(&r)) + } else { + Range::from_list(list) + } + ) + }; + + // add sentinel value for increased performance during cutting + range_vec.push(Range{ low: uint::MAX, high: uint::MAX }); + + Ok(range_vec) +} + +fn cut_bytes(files: Vec, ranges: Vec, opts: Options) -> int { + let mut out = BufferedWriter::new(std::io::stdio::stdout_raw()); + let (use_delim, out_delim) = match opts.out_delim { + Some(delim) => (true, delim), + None => (false, "".to_string()) + }; + + for filename in files.move_iter() { + let mut file = match open(&filename) { + Some(file) => file, + None => continue + }; + + let mut byte_pos = 
0; + let mut print_delim = false; + let mut range_pos = 0; + + loop { + let byte = match file.read_u8() { + Ok(byte) => byte, + Err(std::io::IoError{ kind: std::io::EndOfFile, ..}) => { + if byte_pos > 0 { + out.write_u8('\n' as u8); + } + break + } + _ => fail!(), + }; + + if byte == ('\n' as u8) { + out.write_u8('\n' as u8); + byte_pos = 0; + print_delim = false; + range_pos = 0; + } else { + byte_pos += 1; + + if byte_pos > ranges.get(range_pos).high { + range_pos += 1; + } + + let cur_range = *ranges.get(range_pos); + + if byte_pos >= cur_range.low { + if use_delim { + if print_delim && byte_pos == cur_range.low { + out.write_str(out_delim.as_slice()); + } + + print_delim = true; + } + + out.write_u8(byte); + } + } + } + } + + return 0; +} + +fn cut_charachters(files: Vec, ranges: Vec, + opts: Options) -> int { + return 0; +} + +fn cut_fields(files: Vec, ranges: Vec, + opts: FieldOptions) -> int { + for range in ranges.iter() { + println!("{}-{}", range.low, range.high); + } + + return 0; +} + +#[allow(dead_code)] +fn main() { os::set_exit_status(uumain(os::args())); } + +pub fn uumain(args: Vec) -> int { + let program = args.get(0).clone(); + let opts = [ + optopt("b", "bytes", "select only these bytes", "LIST"), + optopt("c", "characters", "select only these characters", "LIST"), + optopt("d", "delimiter", "use DELIM instead of TAB for field delimiter", "DELIM"), + optopt("f", "fields", "select only these fields; also print any line that contains no delimiter character, unless the -s option is specified", "LIST"), + optflag("n", "", "(ignored)"), + optflag("", "complement", "complement the set of selected bytes, characters or fields"), + optflag("s", "only-delimited", "do not print lines not containing delimiters"), + optopt("", "output-delimiter", "use STRING as the output delimiter the default is to use the input delimiter", "STRING"), + optflag("", "help", "display this help and exit"), + optflag("", "version", "output version information and exit"), + ]; 
+ + let mut matches = match getopts(args.tail(), opts) { + Ok(m) => m, + Err(f) => { + show_error!(1, "Invalid options\n{}", f.to_err_msg()) + return 1; + } + }; + + if matches.opt_present("help") { + println!("Usage:"); + println!(" {0:s} OPTION... [FILE]...", program); + println!(""); + print(usage("Print selected parts of lines from each FILE to standard output.", opts).as_slice()); + println!(""); + println!("Use one, and only one of -b, -c or -f. Each LIST is made up of one"); + println!("range, or many ranges separated by commas. Selected input is written"); + println!("in the same order that it is read, and is written exactly once."); + println!("Each range is one of:"); + println!(""); + println!(" N N'th byte, character or field, counted from 1"); + println!(" N- from N'th byte, character or field, to end of line"); + println!(" N-M from N'th to M'th (included) byte, character or field"); + println!(" -M from first to M'th (included) byte, character or field"); + println!(""); + println!("With no FILE, or when FILE is -, read standard input."); + return 0; + } + + if matches.opt_present("version") { + println!("{} {}", NAME, VERSION); + return 0; + } + + let complement = matches.opt_present("complement"); + let mut out_delim = matches.opt_str("output-delimiter"); + + let mode = match (matches.opt_str("bytes"), matches.opt_str("characters"), + matches.opt_str("fields")) { + (Some(byte_ranges), None, None) => { + match list_to_ranges(byte_ranges.as_slice(), complement) { + Ok(ranges) => Bytes(ranges, Options{ out_delim: out_delim }), + Err(msg) => { + show_error!(1, "{}", msg); + return 1; + } + } + } + (None ,Some(char_ranges), None) => { + match list_to_ranges(char_ranges.as_slice(), complement) { + Ok(ranges) => Characters(ranges, + Options{ out_delim: out_delim }), + Err(msg) => { + show_error!(1, "{}", msg); + return 1; + } + } + } + (None, None ,Some(field_ranges)) => { + match list_to_ranges(field_ranges.as_slice(), complement) { + Ok(ranges) => { + 
use std::str::from_char; + + let only_delimited = matches.opt_present("only-delimited"); + let delim = matches.opt_str("delimiter") + .filtered(|s| s.len() == 1) + .map(|s| s.as_slice().char_at(0)) + .unwrap_or('\t'); + if out_delim.is_none() { + out_delim = Some(from_char(delim)); + } + + Fields(ranges, + FieldOptions{ delimiter: delim, + out_delimeter: out_delim.unwrap(), + only_delimited: only_delimited }) + } + Err(msg) => { + show_error!(1, "{}", msg); + return 1; + } + } + } + (ref b, ref c, ref f) if b.is_some() || c.is_some() || f.is_some() => { + crash!(1, "only one type of list may be specified"); + } + _ => crash!(1, "you must specify a list of bytes, characters, or fields") + }; + + match mode { + Bytes(..) | Characters(..) => { + if matches.opt_present("delimiter") { + show_error!(1, "an input delimiter may be specified only when operating on fields"); + return 1; + } + if matches.opt_present("only-delimited") { + show_error!(1, "suppressing non-delimited lines makes sense only when operating on fields"); + return 1; + } + } + _ => () + } + + for filename in matches.free.iter() { + if ! 
(filename.as_slice() == "-" || + Path::new(filename.as_slice()).exists()) { + show_error!(1, "{}: No such file or directory", filename); + return 1; + } + } + + if matches.free.len() == 0 { matches.free.push("-".to_string()); } + + match mode { + Bytes(ranges, opts) => return cut_bytes(matches.free, ranges, opts), + Characters(ranges, opts) => return cut_charachters(matches.free, + ranges, opts), + Fields(ranges, opts) => return cut_fields(matches.free, ranges, opts), + } +} + +fn open(path: &String) -> Option>> { + if "-" == path.as_slice() { + let reader = box stdin() as Box; + return Some(BufferedReader::new(reader)); + } + + match File::open(&std::path::Path::new(path.as_slice())) { + Ok(fd) => { + let reader = box fd as Box; + return Some(BufferedReader::new(reader)); + }, + Err(e) => { + show_error!(1, "{0:s}: {1:s}", *path, e.desc.to_str()); + } + } + + None +} diff --git a/cut/ranges.rs b/cut/ranges.rs new file mode 100644 index 000000000..e0fed0a68 --- /dev/null +++ b/cut/ranges.rs @@ -0,0 +1,108 @@ +/* + * This file is part of the uutils coreutils package. + * + * (c) Rolf Morel + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. 
+ */ + +use std; + +#[deriving(PartialEq,Eq,PartialOrd,Ord,Show)] +pub struct Range { + pub low: uint, + pub high: uint, +} + +impl std::from_str::FromStr for Range { + fn from_str(s: &str) -> Option { + use std::uint::MAX; + + let mut parts = s.splitn('-', 1); + + match (parts.next(), parts.next()) { + (Some(nm), None) => { + from_str::(nm).filtered(|nm| *nm > 0) + .map(|nm| Range{ low: nm, high: nm }) + } + (Some(n), Some(m)) if m.len() == 0 => { + from_str::(n).filtered(|low| *low > 0) + .map(|low| Range{ low: low, high: MAX }) + } + (Some(n), Some(m)) if n.len() == 0 => { + from_str::(m).filtered(|high| *high >= 1) + .map(|high| Range{ low: 1, high: high }) + } + (Some(n), Some(m)) => { + match (from_str::(n), from_str::(m)) { + (Some(low), Some(high)) if low > 0 && low <= high => { + Some(Range{ low: low, high: high }) + } + _ => None + } + } + _ => unreachable!() + } + } +} + +impl Range { + pub fn from_list(list: &str) -> Result, String> { + use std::cmp::max; + + let mut ranges = vec!(); + + for item in list.split(',') { + match from_str::(item) { + Some(range_item) => ranges.push(range_item), + None => return Err(format!("range '{}' was invalid", item)) + } + } + + ranges.sort(); + + // merge overlapping ranges + for i in range(0, ranges.len()) { + let j = i + 1; + + while j < ranges.len() && ranges.get(j).low <= ranges.get(i).high { + let j_high = ranges.remove(j).unwrap().high; + ranges.get_mut(i).high = max(ranges.get(i).high, j_high); + } + } + + Ok(ranges) + } +} + +pub fn complement(ranges: &Vec) -> Vec { + use std::uint; + + let mut complements = Vec::with_capacity(ranges.len() + 1); + + if ranges.len() > 0 && ranges.get(0).low > 1 { + complements.push(Range{ low: 1, high: ranges.get(0).low - 1 }); + } + + let mut ranges_iter = ranges.iter().peekable(); + loop { + match (ranges_iter.next(), ranges_iter.peek()) { + (Some(left), Some(right)) => { + if left.high + 1 != right.low { + complements.push(Range{ low: left.high + 1, + high: right.low - 1 }); 
+ } + } + (Some(last), None) => { + if last.high < uint::MAX { + complements.push(Range{ low: last.high + 1, + high: uint::MAX }); + } + } + _ => break + } + } + + complements +} From 8b1ff08bd5be6b21bd30ae58737d6c29f1d0c637 Mon Sep 17 00:00:00 2001 From: polyphemus Date: Thu, 12 Jun 2014 23:04:01 +0200 Subject: [PATCH 2/6] Add cut_characters implementation, based on cut_bytes This implementation uses rust's concept of characters and fails if the input isn't valid utf-8. GNU cut implements '--characters' as an alias for '--bytes' and thus has different semantics, for this option, from this implementation. --- Makefile | 1 + cut/cut.rs | 323 ++++++++++++++++++++++++++++------------------- uutils/uutils.rs | 2 + 3 files changed, 196 insertions(+), 130 deletions(-) diff --git a/Makefile b/Makefile index c2c4a8b1d..b445daddf 100644 --- a/Makefile +++ b/Makefile @@ -13,6 +13,7 @@ PROGS := \ cksum \ comm \ cp \ + cut \ dirname \ echo \ env \ diff --git a/cut/cut.rs b/cut/cut.rs index 2055dee1b..deedda907 100644 --- a/cut/cut.rs +++ b/cut/cut.rs @@ -1,11 +1,21 @@ #![crate_id(name="cut", vers="1.0.0", author="Rolf Morel")] + +/* + * This file is part of the uutils coreutils package. + * + * (c) Rolf Morel + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. 
+ */ + #![feature(macro_rules)] extern crate getopts; extern crate libc; use std::os; -use std::io::{print,stdin,stdout,File,BufferedWriter,BufferedReader}; +use std::io::{print,File,BufferedWriter,BufferedReader,stdin}; use getopts::{optopt, optflag, getopts, usage}; use ranges::Range; @@ -52,79 +62,186 @@ fn list_to_ranges(list: &str, complement: bool) -> Result, String> { Ok(range_vec) } -fn cut_bytes(files: Vec, ranges: Vec, opts: Options) -> int { +fn cut_bytes(mut reader: BufferedReader, + ranges: &Vec, + opts: &Options) -> int { let mut out = BufferedWriter::new(std::io::stdio::stdout_raw()); - let (use_delim, out_delim) = match opts.out_delim { + let (use_delim, out_delim) = match opts.out_delim.clone() { Some(delim) => (true, delim), - None => (false, "".to_string()) + None => (false, "".to_str()) }; - for filename in files.move_iter() { - let mut file = match open(&filename) { - Some(file) => file, - None => continue - }; + let mut byte_pos = 0; + let mut print_delim = false; + let mut range_pos = 0; - let mut byte_pos = 0; - let mut print_delim = false; - let mut range_pos = 0; - - loop { - let byte = match file.read_u8() { - Ok(byte) => byte, - Err(std::io::IoError{ kind: std::io::EndOfFile, ..}) => { - if byte_pos > 0 { - out.write_u8('\n' as u8); - } - break + loop { + let mut byte = [0u8]; + match reader.read(byte) { + Ok(1) => (), + Err(std::io::IoError{ kind: std::io::EndOfFile, ..}) => { + if byte_pos > 0 { + out.write_u8('\n' as u8).unwrap(); } - _ => fail!(), - }; + break + } + _ => fail!(), + } + let byte = byte[0]; - if byte == ('\n' as u8) { - out.write_u8('\n' as u8); - byte_pos = 0; - print_delim = false; - range_pos = 0; - } else { - byte_pos += 1; + if byte == ('\n' as u8) { + out.write_u8('\n' as u8).unwrap(); + byte_pos = 0; + print_delim = false; + range_pos = 0; + } else { + byte_pos += 1; - if byte_pos > ranges.get(range_pos).high { - range_pos += 1; - } + if byte_pos > ranges.get(range_pos).high { + range_pos += 1; + } - let 
cur_range = *ranges.get(range_pos); + let cur_range = *ranges.get(range_pos); - if byte_pos >= cur_range.low { - if use_delim { - if print_delim && byte_pos == cur_range.low { - out.write_str(out_delim.as_slice()); - } - - print_delim = true; + if byte_pos >= cur_range.low { + if use_delim { + if print_delim && byte_pos == cur_range.low { + out.write_str(out_delim.as_slice()).unwrap(); } - out.write_u8(byte); + print_delim = true; } + + out.write_u8(byte).unwrap(); } } } - return 0; + 0 } -fn cut_charachters(files: Vec, ranges: Vec, - opts: Options) -> int { - return 0; +fn cut_characters(mut reader: BufferedReader, + ranges: &Vec, + opts: &Options) -> int { + let mut out = BufferedWriter::new(std::io::stdio::stdout_raw()); + let (use_delim, out_delim) = match opts.out_delim.clone() { + Some(delim) => (true, delim), + None => (false, "".to_str()) + }; + + let mut char_pos = 0; + let mut print_delim = false; + let mut range_pos = 0; + + loop { + let character = match reader.read_char() { + Ok(character) => character, + Err(std::io::IoError{ kind: std::io::EndOfFile, ..}) => { + if char_pos > 0 { + out.write_u8('\n' as u8).unwrap(); + } + break + } + Err(std::io::IoError{ kind: std::io::InvalidInput, ..}) => { + fail!("Invalid utf8"); + } + _ => fail!(), + }; + + if character == '\n' { + out.write_u8('\n' as u8).unwrap(); + char_pos = 0; + print_delim = false; + range_pos = 0; + } else { + char_pos += 1; + + if char_pos > ranges.get(range_pos).high { + range_pos += 1; + } + + let cur_range = *ranges.get(range_pos); + + if char_pos >= cur_range.low { + if use_delim { + if print_delim && char_pos == cur_range.low { + out.write_str(out_delim.as_slice()).unwrap(); + } + + print_delim = true; + } + + out.write_char(character).unwrap(); + } + } + } + + 0 } -fn cut_fields(files: Vec, ranges: Vec, - opts: FieldOptions) -> int { +fn cut_fields(reader: BufferedReader, + ranges: &Vec, + opts: &FieldOptions) -> int { for range in ranges.iter() { println!("{}-{}", range.low, 
range.high); } - return 0; + 0 +} + +fn cut_files(mut filenames: Vec, mode: Mode) -> int { + let mut stdin_read = false; + let mut exit_code = 0; + + if filenames.len() == 0 { filenames.push("-".to_str()); } + + for filename in filenames.iter() { + if filename.as_slice() == "-" { + if stdin_read { continue; } + + exit_code |= match mode { + Bytes(ref ranges, ref opts) => { + cut_bytes(stdin(), ranges, opts) + } + Characters(ref ranges, ref opts) => { + cut_characters(stdin(), ranges, opts) + } + Fields(ref ranges, ref opts) => { + cut_fields(stdin(), ranges, opts) + } + }; + + stdin_read = true; + } else { + let path = Path::new(filename.as_slice()); + + if ! path.exists() { + show_error!("{}: No such file or directory", filename); + continue; + } + + let buf_file = match File::open(&path) { + Ok(file) => BufferedReader::new(file), + Err(e) => { + show_error!("{0:s}: {1:s}", filename.as_slice(), + e.desc.to_str()); + continue + } + }; + + exit_code |= match mode { + Bytes(ref ranges, ref opts) => cut_bytes(buf_file, ranges, opts), + Characters(ref ranges, ref opts) => { + cut_characters(buf_file, ranges, opts) + } + Fields(ref ranges, ref opts) => { + cut_fields(buf_file, ranges, opts) + } + }; + } + } + + exit_code } #[allow(dead_code)] @@ -145,10 +262,10 @@ pub fn uumain(args: Vec) -> int { optflag("", "version", "output version information and exit"), ]; - let mut matches = match getopts(args.tail(), opts) { + let matches = match getopts(args.tail(), opts) { Ok(m) => m, Err(f) => { - show_error!(1, "Invalid options\n{}", f.to_err_msg()) + show_error!("Invalid options\n{}", f.to_err_msg()) return 1; } }; @@ -179,107 +296,53 @@ pub fn uumain(args: Vec) -> int { } let complement = matches.opt_present("complement"); - let mut out_delim = matches.opt_str("output-delimiter"); - let mode = match (matches.opt_str("bytes"), matches.opt_str("characters"), - matches.opt_str("fields")) { + let mode_parse = match (matches.opt_str("bytes"), + matches.opt_str("characters"), + 
matches.opt_str("fields")) { (Some(byte_ranges), None, None) => { - match list_to_ranges(byte_ranges.as_slice(), complement) { - Ok(ranges) => Bytes(ranges, Options{ out_delim: out_delim }), - Err(msg) => { - show_error!(1, "{}", msg); - return 1; - } - } + list_to_ranges(byte_ranges.as_slice(), complement).map(|ranges| + Bytes(ranges, + Options{ out_delim: matches.opt_str("output-delimiter") }) + ) } (None ,Some(char_ranges), None) => { - match list_to_ranges(char_ranges.as_slice(), complement) { - Ok(ranges) => Characters(ranges, - Options{ out_delim: out_delim }), - Err(msg) => { - show_error!(1, "{}", msg); - return 1; - } - } + list_to_ranges(char_ranges.as_slice(), complement).map(|ranges| + Characters(ranges, + Options{ out_delim: matches.opt_str("output-delimiter") }) + ) } (None, None ,Some(field_ranges)) => { - match list_to_ranges(field_ranges.as_slice(), complement) { - Ok(ranges) => { + list_to_ranges(field_ranges.as_slice(), complement).map(|ranges| + { use std::str::from_char; - let only_delimited = matches.opt_present("only-delimited"); let delim = matches.opt_str("delimiter") .filtered(|s| s.len() == 1) .map(|s| s.as_slice().char_at(0)) .unwrap_or('\t'); - if out_delim.is_none() { - out_delim = Some(from_char(delim)); - } + let out_delim = matches.opt_str("output-delimiter") + .unwrap_or(from_char(delim)); + let only_delimited = matches.opt_present("only-delimited"); Fields(ranges, FieldOptions{ delimiter: delim, - out_delimeter: out_delim.unwrap(), + out_delimeter: out_delim, only_delimited: only_delimited }) } - Err(msg) => { - show_error!(1, "{}", msg); - return 1; - } - } + ) } (ref b, ref c, ref f) if b.is_some() || c.is_some() || f.is_some() => { - crash!(1, "only one type of list may be specified"); + Err("only one type of list may be specified".to_str()) } - _ => crash!(1, "you must specify a list of bytes, characters, or fields") + _ => Err("you must specify a list of bytes, characters, or fields".to_str()) }; - match mode { - Bytes(..) 
| Characters(..) => { - if matches.opt_present("delimiter") { - show_error!(1, "an input delimiter may be specified only when operating on fields"); - return 1; - } - if matches.opt_present("only-delimited") { - show_error!(1, "suppressing non-delimited lines makes sense only when operating on fields"); - return 1; - } + match mode_parse { + Ok(mode) => cut_files(matches.free, mode), + Err(err_msg) => { + show_error!("{}", err_msg); + 1 } - _ => () - } - - for filename in matches.free.iter() { - if ! (filename.as_slice() == "-" || - Path::new(filename.as_slice()).exists()) { - show_error!(1, "{}: No such file or directory", filename); - return 1; - } - } - - if matches.free.len() == 0 { matches.free.push("-".to_string()); } - - match mode { - Bytes(ranges, opts) => return cut_bytes(matches.free, ranges, opts), - Characters(ranges, opts) => return cut_charachters(matches.free, - ranges, opts), - Fields(ranges, opts) => return cut_fields(matches.free, ranges, opts), } } - -fn open(path: &String) -> Option>> { - if "-" == path.as_slice() { - let reader = box stdin() as Box; - return Some(BufferedReader::new(reader)); - } - - match File::open(&std::path::Path::new(path.as_slice())) { - Ok(fd) => { - let reader = box fd as Box; - return Some(BufferedReader::new(reader)); - }, - Err(e) => { - show_error!(1, "{0:s}: {1:s}", *path, e.desc.to_str()); - } - } - - None -} diff --git a/uutils/uutils.rs b/uutils/uutils.rs index 680c581ff..fd9243904 100644 --- a/uutils/uutils.rs +++ b/uutils/uutils.rs @@ -18,6 +18,7 @@ extern crate chroot; extern crate cksum; extern crate comm; extern crate cp; +extern crate cut; extern crate dirname; extern crate du; extern crate echo; @@ -80,6 +81,7 @@ fn util_map() -> HashMap<&str, fn(Vec) -> int> { map.insert("cksum", cksum::uumain); map.insert("comm", comm::uumain); map.insert("cp", cp::uumain); + map.insert("cut", cut::uumain); map.insert("dirname", dirname::uumain); map.insert("du", du::uumain); map.insert("echo", echo::uumain); From 
b1c2d7ac7c100653e6e1056cd9b1f7ffe5cc34d9 Mon Sep 17 00:00:00 2001 From: polyphemus Date: Tue, 17 Jun 2014 14:22:44 +0200 Subject: [PATCH 3/6] Rewrite cut_bytes() No longer iterate over each byte and instead rely on the Buffer trait to find the newline for us. Iterate over the ranges to specify slices of the line which need to be printed out. This rewrite gives a significant performance increase: Old: 1.32s mahkoh: 0.90s New: 0.20s GNU: 0.15s --- cut/cut.rs | 64 ++++++++++++++++++++++++------------------------------ 1 file changed, 28 insertions(+), 36 deletions(-) diff --git a/cut/cut.rs b/cut/cut.rs index deedda907..1d3156913 100644 --- a/cut/cut.rs +++ b/cut/cut.rs @@ -71,50 +71,42 @@ fn cut_bytes(mut reader: BufferedReader, None => (false, "".to_str()) }; - let mut byte_pos = 0; - let mut print_delim = false; - let mut range_pos = 0; - - loop { - let mut byte = [0u8]; - match reader.read(byte) { - Ok(1) => (), - Err(std::io::IoError{ kind: std::io::EndOfFile, ..}) => { - if byte_pos > 0 { - out.write_u8('\n' as u8).unwrap(); - } - break - } + 'newline: loop { + let line = match reader.read_until(b'\n') { + Ok(line) => line, + Err(std::io::IoError{ kind: std::io::EndOfFile, ..}) => break, _ => fail!(), - } - let byte = byte[0]; + }; - if byte == ('\n' as u8) { - out.write_u8('\n' as u8).unwrap(); - byte_pos = 0; - print_delim = false; - range_pos = 0; - } else { - byte_pos += 1; + let line_len = line.len(); + let mut print_delim = false; - if byte_pos > ranges.get(range_pos).high { - range_pos += 1; - } + for &Range{ low: low, high: high } in ranges.iter() { + if low > line_len { break; } - let cur_range = *ranges.get(range_pos); - - if byte_pos >= cur_range.low { - if use_delim { - if print_delim && byte_pos == cur_range.low { - out.write_str(out_delim.as_slice()).unwrap(); - } - - print_delim = true; + if use_delim { + if print_delim { + out.write_str(out_delim.as_slice()); } + print_delim = true; + } - out.write_u8(byte).unwrap(); + if high >= line_len { + 
let segment = line.slice(low - 1, line_len); + + out.write(segment); + + if *line.get(line_len - 1) == b'\n' { + continue 'newline + } + } else { + let segment = line.slice(low - 1, high); + + out.write(segment); } } + + out.write(&[b'\n']); } 0 From 0e46d453b76989419faf704a46e98b08e09e270f Mon Sep 17 00:00:00 2001 From: polyphemus Date: Wed, 18 Jun 2014 21:25:58 +0200 Subject: [PATCH 4/6] Rewrite cut_characters This follows the cut_bytes() approach of letting read_line() create a buffer and find the newline. read_line() guarantees our buffer is a string of utf8 characters. When writing out the bytes segment we need to make sure we are cutting on utf8 boundaries; therefore we must iterate over the buffer from read_line(). This implementation is (or should be) efficient as it only iterates once over the buffer. The previous performance was about 4x as slow as cut_bytes() and now it is about 2x as slow as cut_bytes(). --- cut/cut.rs | 80 ++++++++++++++++++++++++++---------------------------- 1 file changed, 39 insertions(+), 41 deletions(-) diff --git a/cut/cut.rs b/cut/cut.rs index 1d3156913..909b54c2d 100644 --- a/cut/cut.rs +++ b/cut/cut.rs @@ -121,51 +121,51 @@ fn cut_characters(mut reader: BufferedReader, None => (false, "".to_str()) }; - let mut char_pos = 0; - let mut print_delim = false; - let mut range_pos = 0; - - loop { - let character = match reader.read_char() { - Ok(character) => character, - Err(std::io::IoError{ kind: std::io::EndOfFile, ..}) => { - if char_pos > 0 { - out.write_u8('\n' as u8).unwrap(); - } - break - } - Err(std::io::IoError{ kind: std::io::InvalidInput, ..}) => { - fail!("Invalid utf8"); - } + 'newline: loop { + let line = match reader.read_line() { + Ok(line) => line, + Err(std::io::IoError{ kind: std::io::EndOfFile, ..}) => break, _ => fail!(), }; - if character == '\n' { - out.write_u8('\n' as u8).unwrap(); - char_pos = 0; - print_delim = false; - range_pos = 0; - } else { - char_pos += 1; + let mut char_pos = 0; + let mut 
char_indices = line.as_slice().char_indices(); + let mut print_delim = false; - if char_pos > ranges.get(range_pos).high { - range_pos += 1; - } + for &Range{ low: low, high: high } in ranges.iter() { + let low_idx = match char_indices.nth(low - char_pos - 1) { + Some((low_idx, _)) => low_idx, + None => break + }; - let cur_range = *ranges.get(range_pos); - - if char_pos >= cur_range.low { - if use_delim { - if print_delim && char_pos == cur_range.low { - out.write_str(out_delim.as_slice()).unwrap(); - } - - print_delim = true; + if use_delim { + if print_delim { + out.write_str(out_delim.as_slice()); } - - out.write_char(character).unwrap(); + print_delim = true; } + + match char_indices.nth(high - low) { + Some((high_idx, _)) => { + let segment = line.as_bytes().slice(low_idx, high_idx); + + out.write(segment); + } + None => { + let bytes = line.as_bytes(); + let segment = bytes.slice(low_idx, bytes.len()); + + out.write(segment); + + if line.as_bytes()[bytes.len() - 1] == b'\n' { + continue 'newline + } + } + } + + char_pos = high + 1; } + out.write(&[b'\n']); } 0 @@ -215,8 +215,7 @@ fn cut_files(mut filenames: Vec, mode: Mode) -> int { let buf_file = match File::open(&path) { Ok(file) => BufferedReader::new(file), Err(e) => { - show_error!("{0:s}: {1:s}", filename.as_slice(), - e.desc.to_str()); + show_error!("{}: {}", filename, e.desc); continue } }; @@ -240,7 +239,6 @@ fn cut_files(mut filenames: Vec, mode: Mode) -> int { fn main() { os::set_exit_status(uumain(os::args())); } pub fn uumain(args: Vec) -> int { - let program = args.get(0).clone(); let opts = [ optopt("b", "bytes", "select only these bytes", "LIST"), optopt("c", "characters", "select only these characters", "LIST"), @@ -264,7 +262,7 @@ pub fn uumain(args: Vec) -> int { if matches.opt_present("help") { println!("Usage:"); - println!(" {0:s} OPTION... [FILE]...", program); + println!(" {0} OPTION... 
[FILE]...", args.get(0)); println!(""); print(usage("Print selected parts of lines from each FILE to standard output.", opts).as_slice()); println!(""); From 798af520774c30bd10f5b2240380b30084aeb6d3 Mon Sep 17 00:00:00 2001 From: polyphemus Date: Wed, 25 Jun 2014 19:17:38 +0200 Subject: [PATCH 5/6] Implement fields cutting Adds an implementation for cut_fields() and creates a separate function for the --output-delimiter, for performance reasons. This implementation relies on ::read_until() to find the newline for us but read_until() allocates a vector every time to return its result. This is not ideal and should be improved upon by passing a buffer to read(). This follows/implements the POSIX specification and all the GNU conventions. It is a drop-in replacement for GNU cut. One improvement to GNU is that the --delimiter option takes a character as UTF8 as opposed to single byte only for GNU. Performance is about two times slower than that of GNU cut. Remove ranges' sentinel value. All cut functions iterate over the ranges and therefore it only adds an extra iteration instead of improving performance. 
--- cut/cut.rs | 296 +++++++++++++++++++++++++++++++++++++++++--------- cut/ranges.rs | 22 ++-- 2 files changed, 256 insertions(+), 62 deletions(-) diff --git a/cut/cut.rs b/cut/cut.rs index 909b54c2d..16cc56636 100644 --- a/cut/cut.rs +++ b/cut/cut.rs @@ -15,7 +15,7 @@ extern crate getopts; extern crate libc; use std::os; -use std::io::{print,File,BufferedWriter,BufferedReader,stdin}; +use std::io::{File, BufferedWriter, BufferedReader, stdin, print}; use getopts::{optopt, optflag, getopts, usage}; use ranges::Range; @@ -32,8 +32,8 @@ struct Options { } struct FieldOptions { - delimiter: char, - out_delimeter: String, + delimiter: String, // one char long, String because of UTF8 representation + out_delimeter: Option, only_delimited: bool, } @@ -44,22 +44,11 @@ enum Mode { } fn list_to_ranges(list: &str, complement: bool) -> Result, String> { - use std::uint; - - let mut range_vec = { - try!( - if complement { - Range::from_list(list).map(|r| ranges::complement(&r)) - } else { - Range::from_list(list) - } - ) - }; - - // add sentinel value for increased performance during cutting - range_vec.push(Range{ low: uint::MAX, high: uint::MAX }); - - Ok(range_vec) + if complement { + Range::from_list(list).map(|r| ranges::complement(&r)) + } else { + Range::from_list(list) + } } fn cut_bytes(mut reader: BufferedReader, @@ -74,19 +63,19 @@ fn cut_bytes(mut reader: BufferedReader, 'newline: loop { let line = match reader.read_until(b'\n') { Ok(line) => line, - Err(std::io::IoError{ kind: std::io::EndOfFile, ..}) => break, + Err(std::io::IoError { kind: std::io::EndOfFile, .. 
}) => break, _ => fail!(), }; let line_len = line.len(); let mut print_delim = false; - for &Range{ low: low, high: high } in ranges.iter() { + for &Range { low: low, high: high } in ranges.iter() { if low > line_len { break; } if use_delim { if print_delim { - out.write_str(out_delim.as_slice()); + out.write_str(out_delim.as_slice()).unwrap(); } print_delim = true; } @@ -94,7 +83,7 @@ fn cut_bytes(mut reader: BufferedReader, if high >= line_len { let segment = line.slice(low - 1, line_len); - out.write(segment); + out.write(segment).unwrap(); if *line.get(line_len - 1) == b'\n' { continue 'newline @@ -102,11 +91,11 @@ fn cut_bytes(mut reader: BufferedReader, } else { let segment = line.slice(low - 1, high); - out.write(segment); + out.write(segment).unwrap(); } } - out.write(&[b'\n']); + out.write(&[b'\n']).unwrap(); } 0 @@ -124,7 +113,7 @@ fn cut_characters(mut reader: BufferedReader, 'newline: loop { let line = match reader.read_line() { Ok(line) => line, - Err(std::io::IoError{ kind: std::io::EndOfFile, ..}) => break, + Err(std::io::IoError { kind: std::io::EndOfFile, .. 
}) => break, _ => fail!(), }; @@ -132,7 +121,7 @@ fn cut_characters(mut reader: BufferedReader, let mut char_indices = line.as_slice().char_indices(); let mut print_delim = false; - for &Range{ low: low, high: high } in ranges.iter() { + for &Range { low: low, high: high } in ranges.iter() { let low_idx = match char_indices.nth(low - char_pos - 1) { Some((low_idx, _)) => low_idx, None => break @@ -140,7 +129,7 @@ fn cut_characters(mut reader: BufferedReader, if use_delim { if print_delim { - out.write_str(out_delim.as_slice()); + out.write_str(out_delim.as_slice()).unwrap(); } print_delim = true; } @@ -149,13 +138,13 @@ fn cut_characters(mut reader: BufferedReader, Some((high_idx, _)) => { let segment = line.as_bytes().slice(low_idx, high_idx); - out.write(segment); + out.write(segment).unwrap(); } None => { let bytes = line.as_bytes(); let segment = bytes.slice(low_idx, bytes.len()); - out.write(segment); + out.write(segment).unwrap(); if line.as_bytes()[bytes.len() - 1] == b'\n' { continue 'newline @@ -165,17 +154,207 @@ fn cut_characters(mut reader: BufferedReader, char_pos = high + 1; } - out.write(&[b'\n']); + out.write(&[b'\n']).unwrap(); } 0 } -fn cut_fields(reader: BufferedReader, +#[deriving(Clone)] +struct Searcher<'a> { + haystack: &'a [u8], + needle: &'a [u8], + position: uint +} + +impl<'a> Searcher<'a> { + fn new(haystack: &'a [u8], needle: &'a [u8]) -> Searcher<'a> { + Searcher { + haystack: haystack, + needle: needle, + position: 0 + } + } +} + +impl<'a> Iterator<(uint, uint)> for Searcher<'a> { + fn next(&mut self) -> Option<(uint, uint)> { + if self.needle.len() == 1 { + for offset in range(self.position, self.haystack.len()) { + if self.haystack[offset] == self.needle[0] { + self.position = offset + 1; + return Some((offset, offset + 1)); + } + } + + self.position = self.haystack.len(); + return None; + } + + while self.position + self.needle.len() <= self.haystack.len() { + if self.haystack.slice(self.position, + self.position + 
self.needle.len()) == self.needle { + let match_pos = self.position; + self.position += self.needle.len(); + return Some((match_pos, match_pos + self.needle.len())); + } else { + self.position += 1; + } + } + None + } +} + +fn cut_fields_delimiter(mut reader: BufferedReader, + ranges: &Vec, + delim: &String, + only_delimited: bool, + out_delim: &String) -> int { + let mut out = BufferedWriter::new(std::io::stdio::stdout_raw()); + + 'newline: loop { + let line = match reader.read_until(b'\n') { + Ok(line) => line, + Err(std::io::IoError { kind: std::io::EndOfFile, .. }) => break, + _ => fail!(), + }; + + let mut fields_pos = 1; + let mut low_idx = 0; + let mut delim_search = Searcher::new(line.as_slice(), + delim.as_bytes()).peekable(); + let mut print_delim = false; + + if delim_search.peek().is_none() { + if ! only_delimited { + out.write(line.as_slice()).unwrap(); + if *line.get(line.len() - 1) != b'\n' { + out.write([b'\n']).unwrap(); + } + } + + continue + } + + for &Range { low: low, high: high } in ranges.iter() { + if low - fields_pos > 0 { + low_idx = match delim_search.nth(low - fields_pos - 1) { + Some((_, beyond_delim)) => beyond_delim, + None => break + }; + } + + for _ in range(0, high - low + 1) { + if print_delim { + out.write_str(out_delim.as_slice()).unwrap(); + } + + match delim_search.next() { + Some((high_idx, next_low_idx)) => { + let segment = line.slice(low_idx, high_idx); + + out.write(segment).unwrap(); + + print_delim = true; + + low_idx = next_low_idx; + fields_pos = high + 1; + } + None => { + let segment = line.slice(low_idx, line.len()); + + out.write(segment).unwrap(); + + if *line.get(line.len() - 1) == b'\n' { + continue 'newline + } + break + } + } + } + } + + out.write(&[b'\n']).unwrap(); + } + + 0 +} + +fn cut_fields(mut reader: BufferedReader, ranges: &Vec, opts: &FieldOptions) -> int { - for range in ranges.iter() { - println!("{}-{}", range.low, range.high); + match opts.out_delimeter { + Some(ref delim) => { + return 
cut_fields_delimiter(reader, ranges, &opts.delimiter, + opts.only_delimited, delim); + } + None => () + } + + let mut out = BufferedWriter::new(std::io::stdio::stdout_raw()); + + 'newline: loop { + let line = match reader.read_until(b'\n') { + Ok(line) => line, + Err(std::io::IoError { kind: std::io::EndOfFile, .. }) => break, + _ => fail!(), + }; + + let mut fields_pos = 1; + let mut low_idx = 0; + let mut delim_search = Searcher::new(line.as_slice(), + opts.delimiter.as_bytes()).peekable(); + let mut print_delim = false; + + if delim_search.peek().is_none() { + if ! opts.only_delimited { + out.write(line.as_slice()).unwrap(); + if *line.get(line.len() - 1) != b'\n' { + out.write([b'\n']).unwrap(); + } + } + + continue + } + + for &Range { low: low, high: high } in ranges.iter() { + if low - fields_pos > 0 { + low_idx = match delim_search.nth(low - fields_pos - 1) { + Some((_, beyond_delim)) => beyond_delim, + None => break + }; + } + + if print_delim { + if low_idx >= opts.delimiter.as_bytes().len() { + low_idx -= opts.delimiter.as_bytes().len(); + } + } + + match delim_search.nth(high - low) { + Some((high_idx, next_low_idx)) => { + let segment = line.slice(low_idx, high_idx); + + out.write(segment).unwrap(); + + print_delim = true; + low_idx = next_low_idx; + fields_pos = high + 1; + } + None => { + let segment = line.slice(low_idx, line.len()); + + out.write(segment).unwrap(); + + if *line.get(line.len() - 1) == b'\n' { + continue 'newline + } + break + } + } + } + + out.write(&[b'\n']).unwrap(); } 0 @@ -255,7 +434,7 @@ pub fn uumain(args: Vec) -> int { let matches = match getopts(args.tail(), opts) { Ok(m) => m, Err(f) => { - show_error!("Invalid options\n{}", f.to_err_msg()) + show_error!("Invalid options\n{}", f) return 1; } }; @@ -293,32 +472,41 @@ pub fn uumain(args: Vec) -> int { (Some(byte_ranges), None, None) => { list_to_ranges(byte_ranges.as_slice(), complement).map(|ranges| Bytes(ranges, - Options{ out_delim: matches.opt_str("output-delimiter") }) + 
Options { out_delim: matches.opt_str("output-delimiter") }) ) } - (None ,Some(char_ranges), None) => { + (None, Some(char_ranges), None) => { list_to_ranges(char_ranges.as_slice(), complement).map(|ranges| Characters(ranges, - Options{ out_delim: matches.opt_str("output-delimiter") }) + Options { out_delim: matches.opt_str("output-delimiter") }) ) } - (None, None ,Some(field_ranges)) => { - list_to_ranges(field_ranges.as_slice(), complement).map(|ranges| + (None, None, Some(field_ranges)) => { + list_to_ranges(field_ranges.as_slice(), complement).and_then(|ranges| { - use std::str::from_char; - - let delim = matches.opt_str("delimiter") - .filtered(|s| s.len() == 1) - .map(|s| s.as_slice().char_at(0)) - .unwrap_or('\t'); - let out_delim = matches.opt_str("output-delimiter") - .unwrap_or(from_char(delim)); + let out_delim = matches.opt_str("output-delimiter"); let only_delimited = matches.opt_present("only-delimited"); - Fields(ranges, - FieldOptions{ delimiter: delim, - out_delimeter: out_delim, - only_delimited: only_delimited }) + match matches.opt_str("delimiter") { + Some(delim) => { + if delim.as_slice().char_len() != 1 { + Err("the delimiter must be a single character".to_str()) + } else { + Ok(Fields(ranges, + FieldOptions { + delimiter: delim, + out_delimeter: out_delim, + only_delimited: only_delimited + })) + } + } + None => Ok(Fields(ranges, + FieldOptions { + delimiter: "\t".to_str(), + out_delimeter: out_delim, + only_delimited: only_delimited + })) + } } ) } @@ -331,7 +519,9 @@ pub fn uumain(args: Vec) -> int { match mode_parse { Ok(mode) => cut_files(matches.free, mode), Err(err_msg) => { - show_error!("{}", err_msg); + show_error!("{}\n\ + Try '{} --help' for more information", + err_msg, args.get(0)); 1 } } diff --git a/cut/ranges.rs b/cut/ranges.rs index e0fed0a68..6089bcaac 100644 --- a/cut/ranges.rs +++ b/cut/ranges.rs @@ -24,20 +24,20 @@ impl std::from_str::FromStr for Range { match (parts.next(), parts.next()) { (Some(nm), None) => { 
from_str::(nm).filtered(|nm| *nm > 0) - .map(|nm| Range{ low: nm, high: nm }) + .map(|nm| Range { low: nm, high: nm }) } (Some(n), Some(m)) if m.len() == 0 => { from_str::(n).filtered(|low| *low > 0) - .map(|low| Range{ low: low, high: MAX }) + .map(|low| Range { low: low, high: MAX }) } (Some(n), Some(m)) if n.len() == 0 => { from_str::(m).filtered(|high| *high >= 1) - .map(|high| Range{ low: 1, high: high }) + .map(|high| Range { low: 1, high: high }) } (Some(n), Some(m)) => { match (from_str::(n), from_str::(m)) { (Some(low), Some(high)) if low > 0 && low <= high => { - Some(Range{ low: low, high: high }) + Some(Range { low: low, high: high }) } _ => None } @@ -82,7 +82,7 @@ pub fn complement(ranges: &Vec) -> Vec { let mut complements = Vec::with_capacity(ranges.len() + 1); if ranges.len() > 0 && ranges.get(0).low > 1 { - complements.push(Range{ low: 1, high: ranges.get(0).low - 1 }); + complements.push(Range { low: 1, high: ranges.get(0).low - 1 }); } let mut ranges_iter = ranges.iter().peekable(); @@ -90,14 +90,18 @@ pub fn complement(ranges: &Vec) -> Vec { match (ranges_iter.next(), ranges_iter.peek()) { (Some(left), Some(right)) => { if left.high + 1 != right.low { - complements.push(Range{ low: left.high + 1, - high: right.low - 1 }); + complements.push(Range { + low: left.high + 1, + high: right.low - 1 + }); } } (Some(last), None) => { if last.high < uint::MAX { - complements.push(Range{ low: last.high + 1, - high: uint::MAX }); + complements.push(Range { + low: last.high + 1, + high: uint::MAX + }); } } _ => break From a470c330e64977f3d9139ef441fc8f0808492a44 Mon Sep 17 00:00:00 2001 From: polyphemus Date: Thu, 26 Jun 2014 12:31:24 +0200 Subject: [PATCH 6/6] Add cut to Cargo.toml, remove cut from To Do list --- Cargo.toml | 4 ++++ README.md | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index f0ffe370b..fffbc9e84 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,6 +31,10 @@ path = "comm/comm.rs" name = "cp" path 
= "cp/cp.rs" +[[bin]] +name = "cut" +path = "cut/cut.rs" + [[bin]] name = "dirname" path = "dirname/dirname.rs" diff --git a/README.md b/README.md index ad96e82c6..3bd9e21d6 100644 --- a/README.md +++ b/README.md @@ -129,7 +129,6 @@ To do - copy - cp (not much done) - csplit -- cut - date - dd - df