diff --git a/Cargo.toml b/Cargo.toml index f0ffe370b..fffbc9e84 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,6 +31,10 @@ path = "comm/comm.rs" name = "cp" path = "cp/cp.rs" +[[bin]] +name = "cut" +path = "cut/cut.rs" + [[bin]] name = "dirname" path = "dirname/dirname.rs" diff --git a/Makefile b/Makefile index 898251825..25d383219 100644 --- a/Makefile +++ b/Makefile @@ -13,6 +13,7 @@ PROGS := \ cksum \ comm \ cp \ + cut \ dirname \ echo \ env \ diff --git a/README.md b/README.md index 7bb221e8c..37cd36cd6 100644 --- a/README.md +++ b/README.md @@ -114,7 +114,6 @@ To do - copy - cp (not much done) - csplit -- cut - date - dd - df diff --git a/cut/cut.rs b/cut/cut.rs new file mode 100644 index 000000000..16cc56636 --- /dev/null +++ b/cut/cut.rs @@ -0,0 +1,528 @@ +#![crate_id(name="cut", vers="1.0.0", author="Rolf Morel")] + +/* + * This file is part of the uutils coreutils package. + * + * (c) Rolf Morel + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +#![feature(macro_rules)] + +extern crate getopts; +extern crate libc; + +use std::os; +use std::io::{File, BufferedWriter, BufferedReader, stdin, print}; +use getopts::{optopt, optflag, getopts, usage}; + +use ranges::Range; + +#[path = "../common/util.rs"] +mod util; +mod ranges; + +static NAME: &'static str = "cut"; +static VERSION: &'static str = "1.0.0"; + +struct Options { + out_delim: Option, +} + +struct FieldOptions { + delimiter: String, // one char long, String because of UTF8 representation + out_delimeter: Option, + only_delimited: bool, +} + +enum Mode { + Bytes(Vec, Options), + Characters(Vec, Options), + Fields(Vec, FieldOptions), +} + +fn list_to_ranges(list: &str, complement: bool) -> Result, String> { + if complement { + Range::from_list(list).map(|r| ranges::complement(&r)) + } else { + Range::from_list(list) + } +} + +fn cut_bytes(mut reader: BufferedReader, + ranges: &Vec, + opts: &Options) -> int { + let mut out = BufferedWriter::new(std::io::stdio::stdout_raw()); + let (use_delim, out_delim) = match opts.out_delim.clone() { + Some(delim) => (true, delim), + None => (false, "".to_str()) + }; + + 'newline: loop { + let line = match reader.read_until(b'\n') { + Ok(line) => line, + Err(std::io::IoError { kind: std::io::EndOfFile, .. }) => break, + _ => fail!(), + }; + + let line_len = line.len(); + let mut print_delim = false; + + for &Range { low: low, high: high } in ranges.iter() { + if low > line_len { break; } + + if use_delim { + if print_delim { + out.write_str(out_delim.as_slice()).unwrap(); + } + print_delim = true; + } + + if high >= line_len { + let segment = line.slice(low - 1, line_len); + + out.write(segment).unwrap(); + + if *line.get(line_len - 1) == b'\n' { + continue 'newline + } + } else { + let segment = line.slice(low - 1, high); + + out.write(segment).unwrap(); + } + } + + out.write(&[b'\n']).unwrap(); + } + + 0 +} + +fn cut_characters(mut reader: BufferedReader, + ranges: &Vec, + opts: &Options) -> int { + let mut out = BufferedWriter::new(std::io::stdio::stdout_raw()); + let (use_delim, out_delim) = match opts.out_delim.clone() { + Some(delim) => (true, delim), + None => (false, "".to_str()) + }; + + 'newline: loop { + let line = match reader.read_line() { + Ok(line) => line, + Err(std::io::IoError { kind: std::io::EndOfFile, .. }) => break, + _ => fail!(), + }; + + let mut char_pos = 0; + let mut char_indices = line.as_slice().char_indices(); + let mut print_delim = false; + + for &Range { low: low, high: high } in ranges.iter() { + let low_idx = match char_indices.nth(low - char_pos - 1) { + Some((low_idx, _)) => low_idx, + None => break + }; + + if use_delim { + if print_delim { + out.write_str(out_delim.as_slice()).unwrap(); + } + print_delim = true; + } + + match char_indices.nth(high - low) { + Some((high_idx, _)) => { + let segment = line.as_bytes().slice(low_idx, high_idx); + + out.write(segment).unwrap(); + } + None => { + let bytes = line.as_bytes(); + let segment = bytes.slice(low_idx, bytes.len()); + + out.write(segment).unwrap(); + + if line.as_bytes()[bytes.len() - 1] == b'\n' { + continue 'newline + } + } + } + + char_pos = high + 1; + } + out.write(&[b'\n']).unwrap(); + } + + 0 +} + +#[deriving(Clone)] +struct Searcher<'a> { + haystack: &'a [u8], + needle: &'a [u8], + position: uint +} + +impl<'a> Searcher<'a> { + fn new(haystack: &'a [u8], needle: &'a [u8]) -> Searcher<'a> { + Searcher { + haystack: haystack, + needle: needle, + position: 0 + } + } +} + +impl<'a> Iterator<(uint, uint)> for Searcher<'a> { + fn next(&mut self) -> Option<(uint, uint)> { + if self.needle.len() == 1 { + for offset in range(self.position, self.haystack.len()) { + if self.haystack[offset] == self.needle[0] { + self.position = offset + 1; + return Some((offset, offset + 1)); + } + } + + self.position = self.haystack.len(); + return None; + } + + while self.position + self.needle.len() <= self.haystack.len() { + if self.haystack.slice(self.position, + self.position + self.needle.len()) == self.needle { + let match_pos = self.position; + self.position += self.needle.len(); + return Some((match_pos, match_pos + self.needle.len())); + } else { + self.position += 1; + } + } + None + } +} + +fn cut_fields_delimiter(mut reader: BufferedReader, + ranges: &Vec, + delim: &String, + only_delimited: bool, + out_delim: &String) -> int { + let mut out = BufferedWriter::new(std::io::stdio::stdout_raw()); + + 'newline: loop { + let line = match reader.read_until(b'\n') { + Ok(line) => line, + Err(std::io::IoError { kind: std::io::EndOfFile, .. }) => break, + _ => fail!(), + }; + + let mut fields_pos = 1; + let mut low_idx = 0; + let mut delim_search = Searcher::new(line.as_slice(), + delim.as_bytes()).peekable(); + let mut print_delim = false; + + if delim_search.peek().is_none() { + if ! only_delimited { + out.write(line.as_slice()).unwrap(); + if *line.get(line.len() - 1) != b'\n' { + out.write([b'\n']).unwrap(); + } + } + + continue + } + + for &Range { low: low, high: high } in ranges.iter() { + if low - fields_pos > 0 { + low_idx = match delim_search.nth(low - fields_pos - 1) { + Some((_, beyond_delim)) => beyond_delim, + None => break + }; + } + + for _ in range(0, high - low + 1) { + if print_delim { + out.write_str(out_delim.as_slice()).unwrap(); + } + + match delim_search.next() { + Some((high_idx, next_low_idx)) => { + let segment = line.slice(low_idx, high_idx); + + out.write(segment).unwrap(); + + print_delim = true; + + low_idx = next_low_idx; + fields_pos = high + 1; + } + None => { + let segment = line.slice(low_idx, line.len()); + + out.write(segment).unwrap(); + + if *line.get(line.len() - 1) == b'\n' { + continue 'newline + } + break + } + } + } + } + + out.write(&[b'\n']).unwrap(); + } + + 0 +} + +fn cut_fields(mut reader: BufferedReader, + ranges: &Vec, + opts: &FieldOptions) -> int { + match opts.out_delimeter { + Some(ref delim) => { + return cut_fields_delimiter(reader, ranges, &opts.delimiter, + opts.only_delimited, delim); + } + None => () + } + + let mut out = BufferedWriter::new(std::io::stdio::stdout_raw()); + + 'newline: loop { + let line = match reader.read_until(b'\n') { + Ok(line) => line, + Err(std::io::IoError { kind: std::io::EndOfFile, .. }) => break, + _ => fail!(), + }; + + let mut fields_pos = 1; + let mut low_idx = 0; + let mut delim_search = Searcher::new(line.as_slice(), + opts.delimiter.as_bytes()).peekable(); + let mut print_delim = false; + + if delim_search.peek().is_none() { + if ! opts.only_delimited { + out.write(line.as_slice()).unwrap(); + if *line.get(line.len() - 1) != b'\n' { + out.write([b'\n']).unwrap(); + } + } + + continue + } + + for &Range { low: low, high: high } in ranges.iter() { + if low - fields_pos > 0 { + low_idx = match delim_search.nth(low - fields_pos - 1) { + Some((_, beyond_delim)) => beyond_delim, + None => break + }; + } + + if print_delim { + if low_idx >= opts.delimiter.as_bytes().len() { + low_idx -= opts.delimiter.as_bytes().len(); + } + } + + match delim_search.nth(high - low) { + Some((high_idx, next_low_idx)) => { + let segment = line.slice(low_idx, high_idx); + + out.write(segment).unwrap(); + + print_delim = true; + low_idx = next_low_idx; + fields_pos = high + 1; + } + None => { + let segment = line.slice(low_idx, line.len()); + + out.write(segment).unwrap(); + + if *line.get(line.len() - 1) == b'\n' { + continue 'newline + } + break + } + } + } + + out.write(&[b'\n']).unwrap(); + } + + 0 +} + +fn cut_files(mut filenames: Vec, mode: Mode) -> int { + let mut stdin_read = false; + let mut exit_code = 0; + + if filenames.len() == 0 { filenames.push("-".to_str()); } + + for filename in filenames.iter() { + if filename.as_slice() == "-" { + if stdin_read { continue; } + + exit_code |= match mode { + Bytes(ref ranges, ref opts) => { + cut_bytes(stdin(), ranges, opts) + } + Characters(ref ranges, ref opts) => { + cut_characters(stdin(), ranges, opts) + } + Fields(ref ranges, ref opts) => { + cut_fields(stdin(), ranges, opts) + } + }; + + stdin_read = true; + } else { + let path = Path::new(filename.as_slice()); + + if ! path.exists() { + show_error!("{}: No such file or directory", filename); + continue; + } + + let buf_file = match File::open(&path) { + Ok(file) => BufferedReader::new(file), + Err(e) => { + show_error!("{}: {}", filename, e.desc); + continue + } + }; + + exit_code |= match mode { + Bytes(ref ranges, ref opts) => cut_bytes(buf_file, ranges, opts), + Characters(ref ranges, ref opts) => { + cut_characters(buf_file, ranges, opts) + } + Fields(ref ranges, ref opts) => { + cut_fields(buf_file, ranges, opts) + } + }; + } + } + + exit_code +} + +#[allow(dead_code)] +fn main() { os::set_exit_status(uumain(os::args())); } + +pub fn uumain(args: Vec) -> int { + let opts = [ + optopt("b", "bytes", "select only these bytes", "LIST"), + optopt("c", "characters", "select only these characters", "LIST"), + optopt("d", "delimiter", "use DELIM instead of TAB for field delimiter", "DELIM"), + optopt("f", "fields", "select only these fields; also print any line that contains no delimiter character, unless the -s option is specified", "LIST"), + optflag("n", "", "(ignored)"), + optflag("", "complement", "complement the set of selected bytes, characters or fields"), + optflag("s", "only-delimited", "do not print lines not containing delimiters"), + optopt("", "output-delimiter", "use STRING as the output delimiter the default is to use the input delimiter", "STRING"), + optflag("", "help", "display this help and exit"), + optflag("", "version", "output version information and exit"), + ]; + + let matches = match getopts(args.tail(), opts) { + Ok(m) => m, + Err(f) => { + show_error!("Invalid options\n{}", f) + return 1; + } + }; + + if matches.opt_present("help") { + println!("Usage:"); + println!(" {0} OPTION... [FILE]...", args.get(0)); + println!(""); + print(usage("Print selected parts of lines from each FILE to standard output.", opts).as_slice()); + println!(""); + println!("Use one, and only one of -b, -c or -f. Each LIST is made up of one"); + println!("range, or many ranges separated by commas. Selected input is written"); + println!("in the same order that it is read, and is written exactly once."); + println!("Each range is one of:"); + println!(""); + println!(" N N'th byte, character or field, counted from 1"); + println!(" N- from N'th byte, character or field, to end of line"); + println!(" N-M from N'th to M'th (included) byte, character or field"); + println!(" -M from first to M'th (included) byte, character or field"); + println!(""); + println!("With no FILE, or when FILE is -, read standard input."); + return 0; + } + + if matches.opt_present("version") { + println!("{} {}", NAME, VERSION); + return 0; + } + + let complement = matches.opt_present("complement"); + + let mode_parse = match (matches.opt_str("bytes"), + matches.opt_str("characters"), + matches.opt_str("fields")) { + (Some(byte_ranges), None, None) => { + list_to_ranges(byte_ranges.as_slice(), complement).map(|ranges| + Bytes(ranges, + Options { out_delim: matches.opt_str("output-delimiter") }) + ) + } + (None, Some(char_ranges), None) => { + list_to_ranges(char_ranges.as_slice(), complement).map(|ranges| + Characters(ranges, + Options { out_delim: matches.opt_str("output-delimiter") }) + ) + } + (None, None, Some(field_ranges)) => { + list_to_ranges(field_ranges.as_slice(), complement).and_then(|ranges| + { + let out_delim = matches.opt_str("output-delimiter"); + let only_delimited = matches.opt_present("only-delimited"); + + match matches.opt_str("delimiter") { + Some(delim) => { + if delim.as_slice().char_len() != 1 { + Err("the delimiter must be a single character".to_str()) + } else { + Ok(Fields(ranges, + FieldOptions { + delimiter: delim, + out_delimeter: out_delim, + only_delimited: only_delimited + })) + } + } + None => Ok(Fields(ranges, + FieldOptions { + delimiter: "\t".to_str(), + out_delimeter: out_delim, + only_delimited: only_delimited + })) + } + } + ) + } + (ref b, ref c, ref f) if b.is_some() || c.is_some() || f.is_some() => { + Err("only one type of list may be specified".to_str()) + } + _ => Err("you must specify a list of bytes, characters, or fields".to_str()) + }; + + match mode_parse { + Ok(mode) => cut_files(matches.free, mode), + Err(err_msg) => { + show_error!("{}\n\ + Try '{} --help' for more information", + err_msg, args.get(0)); + 1 + } + } +} diff --git a/cut/ranges.rs b/cut/ranges.rs new file mode 100644 index 000000000..6089bcaac --- /dev/null +++ b/cut/ranges.rs @@ -0,0 +1,112 @@ +/* + * This file is part of the uutils coreutils package. + * + * (c) Rolf Morel + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +use std; + +#[deriving(PartialEq,Eq,PartialOrd,Ord,Show)] +pub struct Range { + pub low: uint, + pub high: uint, +} + +impl std::from_str::FromStr for Range { + fn from_str(s: &str) -> Option { + use std::uint::MAX; + + let mut parts = s.splitn('-', 1); + + match (parts.next(), parts.next()) { + (Some(nm), None) => { + from_str::(nm).filtered(|nm| *nm > 0) + .map(|nm| Range { low: nm, high: nm }) + } + (Some(n), Some(m)) if m.len() == 0 => { + from_str::(n).filtered(|low| *low > 0) + .map(|low| Range { low: low, high: MAX }) + } + (Some(n), Some(m)) if n.len() == 0 => { + from_str::(m).filtered(|high| *high >= 1) + .map(|high| Range { low: 1, high: high }) + } + (Some(n), Some(m)) => { + match (from_str::(n), from_str::(m)) { + (Some(low), Some(high)) if low > 0 && low <= high => { + Some(Range { low: low, high: high }) + } + _ => None + } + } + _ => unreachable!() + } + } +} + +impl Range { + pub fn from_list(list: &str) -> Result, String> { + use std::cmp::max; + + let mut ranges = vec!(); + + for item in list.split(',') { + match from_str::(item) { + Some(range_item) => ranges.push(range_item), + None => return Err(format!("range '{}' was invalid", item)) + } + } + + ranges.sort(); + + // merge overlapping ranges + for i in range(0, ranges.len()) { + let j = i + 1; + + while j < ranges.len() && ranges.get(j).low <= ranges.get(i).high { + let j_high = ranges.remove(j).unwrap().high; + ranges.get_mut(i).high = max(ranges.get(i).high, j_high); + } + } + + Ok(ranges) + } +} + +pub fn complement(ranges: &Vec) -> Vec { + use std::uint; + + let mut complements = Vec::with_capacity(ranges.len() + 1); + + if ranges.len() > 0 && ranges.get(0).low > 1 { + complements.push(Range { low: 1, high: ranges.get(0).low - 1 }); + } + + let mut ranges_iter = ranges.iter().peekable(); + loop { + match (ranges_iter.next(), ranges_iter.peek()) { + (Some(left), Some(right)) => { + if left.high + 1 != right.low { + complements.push(Range { + low: left.high + 1, + high: right.low - 1 + }); + } + } + (Some(last), None) => { + if last.high < uint::MAX { + complements.push(Range { + low: last.high + 1, + high: uint::MAX + }); + } + } + _ => break + } + } + + complements +} diff --git a/uutils/uutils.rs b/uutils/uutils.rs index 680c581ff..fd9243904 100644 --- a/uutils/uutils.rs +++ b/uutils/uutils.rs @@ -18,6 +18,7 @@ extern crate chroot; extern crate cksum; extern crate comm; extern crate cp; +extern crate cut; extern crate dirname; extern crate du; extern crate echo; @@ -80,6 +81,7 @@ fn util_map() -> HashMap<&str, fn(Vec) -> int> { map.insert("cksum", cksum::uumain); map.insert("comm", comm::uumain); map.insert("cp", cp::uumain); + map.insert("cut", cut::uumain); map.insert("dirname", dirname::uumain); map.insert("du", du::uumain); map.insert("echo", echo::uumain);