csplit: refresh of the previous PR

This commit is contained in:
Stéphane Campinas 2020-12-27 15:24:02 +01:00 committed by Sylvestre Ledru
parent da362ced71
commit 89bf7a726e
10 changed files with 2963 additions and 0 deletions

View file

@ -37,6 +37,7 @@ feat_common_core = [
"cksum",
"comm",
"cp",
"csplit",
"cut",
"date",
"df",
@ -241,6 +242,7 @@ chroot = { optional=true, version="0.0.1", package="uu_chroot", path="src/uu/c
cksum = { optional=true, version="0.0.1", package="uu_cksum", path="src/uu/cksum" }
comm = { optional=true, version="0.0.1", package="uu_comm", path="src/uu/comm" }
cp = { optional=true, version="0.0.1", package="uu_cp", path="src/uu/cp" }
csplit = { optional=true, version="0.0.1", package="uu_csplit", path="src/uu/csplit" }
cut = { optional=true, version="0.0.1", package="uu_cut", path="src/uu/cut" }
date = { optional=true, version="0.0.1", package="uu_date", path="src/uu/date" }
df = { optional=true, version="0.0.1", package="uu_df", path="src/uu/df" }
@ -332,6 +334,7 @@ pin_winapi-util = { version="0.1.2, < 0.1.3", package="winapi-util" } ## winapi-
[dev-dependencies]
conv = "0.3"
filetime = "0.2"
glob = "0.3.0"
libc = "0.2"
rand = "0.7"
regex = "1.0"

View file

@ -53,6 +53,7 @@ PROGS := \
cksum \
comm \
cp \
csplit \
cut \
df \
dircolors \
@ -160,6 +161,7 @@ TEST_PROGS := \
cksum \
comm \
cp \
csplit \
cut \
dircolors \
dirname \

27
src/uu/csplit/Cargo.toml Normal file
View file

@ -0,0 +1,27 @@
[package]
name = "uu_csplit"
version = "0.0.1"
authors = ["uutils developers"]
license = "MIT"
description = "csplit ~ (uutils) Output pieces of FILE separated by PATTERN(s) to files 'xx00', 'xx01', ..., and output byte counts of each piece to standard output"
homepage = "https://github.com/uutils/coreutils"
repository = "https://github.com/uutils/coreutils/tree/master/src/uu/csplit"
keywords = ["coreutils", "uutils", "cross-platform", "cli", "utility"]
categories = ["command-line-utilities"]
edition = "2018"
[lib]
path = "src/csplit.rs"
[dependencies]
getopts = "0.2.17"
failure = "0.1.1"
failure_derive = "0.1.1"
regex = "1.0.0"
glob = "0.2.11"
uucore = { version=">=0.0.4", package="uucore", path="../../uucore", features=["entries", "fs"] }
# provides the `uucore_procs::main!` macro invoked by src/main.rs
uucore_procs = { version=">=0.0.4", package="uucore_procs", path="../../uucore_procs" }
[[bin]]
name = "csplit"
path = "src/main.rs"

760
src/uu/csplit/src/csplit.rs Normal file
View file

@ -0,0 +1,760 @@
#![crate_name = "uu_csplit"]
#[macro_use]
extern crate failure;
#[macro_use]
extern crate uucore;
extern crate getopts;
extern crate regex;
use std::{fs::{File, remove_file}, io::{BufRead, BufWriter, Write}};
use std::io::{self, BufReader};
use getopts::Matches;
use regex::Regex;
/*
mod split_name;
mod patterns;
*/
mod splitname;
mod patterns;
mod csplitError;
use crate::splitname::SplitName;
use crate::csplitError::CsplitError;
//mod split_name;
//mod csplit;
/// Usage synopsis handed to `app!` in `uumain`.
static SYNTAX: &str = "[OPTION]... FILE PATTERN...";
/// One-line summary handed to `app!` in `uumain`.
static SUMMARY: &str = "split a file into sections determined by context lines";
/// Long description handed to `app!` in `uumain`.
static LONG_HELP: &str = "Output pieces of FILE separated by PATTERN(s) to files 'xx00', 'xx01', ..., and output byte counts of each piece to standard output.";
/// Long name of the `-b` option: format of the suffix of the split names.
static SUFFIX_FORMAT_OPT: &str = "suffix-format";
/// Long name of the flag suppressing the lines that match a pattern.
static SUPPRESS_MATCHED_OPT: &str = "suppress-matched";
/// Long name of the `-n` option: number of digits of the split number.
static DIGITS_OPT: &str = "digits";
/// Long name of the `-f` option: prefix of the split names.
static PREFIX_OPT: &str = "prefix";
/// Long name of the `-k` flag: keep the output files on errors.
static KEEP_FILES_OPT: &str = "keep-files";
/// Long name of the `-s` flag: do not print the size of the splits.
static QUIET_OPT: &str = "quiet";
/// Long name of the `-z` flag: remove the empty output files.
static ELIDE_EMPTY_FILES_OPT: &str = "elide-empty-files";
/// Command line options for csplit.
///
/// Built once from the parsed command line and then only read by the splitting code.
pub struct CsplitOptions {
/// generator of the file name of each split, built from the prefix, suffix-format and
/// digits options
split_name: crate::SplitName,
/// when set, the splits already written are not deleted if the splitting fails
keep_files: bool,
/// when set, the size of a completed split is not printed
quiet: bool,
/// when set, a split in which nothing was written is not kept
elide_empty_files: bool,
/// when set, the line selected by a pattern is not carried over to the next split
suppress_matched: bool,
}
impl CsplitOptions {
    /// Builds the options from the parsed command line.
    ///
    /// The boolean switches are read first, then the split-name generator is created from
    /// the prefix, suffix-format and digits options. A failure to create the generator is
    /// handed to `crash_if_err!` with the code 1.
    fn new(matches: &Matches) -> CsplitOptions {
        let is_set = |name: &str| matches.opt_present(name);
        let (keep_files, quiet) = (is_set(KEEP_FILES_OPT), is_set(QUIET_OPT));
        let (elide_empty_files, suppress_matched) =
            (is_set(ELIDE_EMPTY_FILES_OPT), is_set(SUPPRESS_MATCHED_OPT));
        let split_name = crash_if_err!(
            1,
            SplitName::new(
                matches.opt_str(PREFIX_OPT),
                matches.opt_str(SUFFIX_FORMAT_OPT),
                matches.opt_str(DIGITS_OPT)
            )
        );
        CsplitOptions {
            split_name,
            keep_files,
            quiet,
            elide_empty_files,
            suppress_matched,
        }
    }
}
/// Splits the input into several files, as directed by the command line patterns.
///
/// The patterns are applied in order; the lines left over once every pattern has been
/// applied are written to one last split. When the patterns could not all be applied and
/// the keep-files option is off, the splits created so far are deleted.
///
/// # Errors
///
/// - [`io::Error`] (wrapped in [`CsplitError::IoError`]) if reading the input or writing a
///   split fails.
/// - [`CsplitError::LineOutOfRange`] if a line number is larger than the number of input
///   lines.
/// - [`CsplitError::LineOutOfRangeOnRepetition`], same as above but while repeating the
///   pattern.
/// - [`CsplitError::MatchNotFound`] if no line matched a regular expression.
/// - [`CsplitError::MatchNotFoundOnRepetition`], same as above but while repeating the
///   pattern.
pub fn csplit<T>(
    options: &CsplitOptions,
    patterns: Vec<patterns::Pattern>,
    input: T,
) -> Result<(), CsplitError>
where
    T: BufRead,
{
    let mut lines = InputSplitter::new(input.lines().enumerate());
    let mut writer = SplitWriter::new(options)?;
    let outcome = do_csplit(&mut writer, patterns, &mut lines);

    // Replay the buffered lines first, then copy whatever the patterns did not consume
    // into a final split. This is done even when `outcome` is an error.
    lines.rewind_buffer();
    if let Some((_, first)) = lines.next() {
        writer.new_writer()?;
        writer.writeln(first?)?;
        for (_, remaining) in lines {
            writer.writeln(remaining?)?;
        }
        writer.finish_split()?;
    }

    // by default, nothing is left behind after a failure
    let failed = outcome.is_err();
    if failed && !options.keep_files {
        writer.delete_all_splits()?;
    }
    outcome
}
/// Applies every pattern, in order, to the input.
///
/// A line-number pattern creates one split per repetition, the target line growing by the
/// line number each time. A regex pattern either creates a split (`/regex/`) or discards
/// the lines (`%regex%`) for each repetition.
///
/// # Errors
///
/// Errors of [`SplitWriter::do_to_line`] and [`SplitWriter::do_to_match`] are returned,
/// except that:
/// - a missing match is not an error for a pattern repeated as many times as possible;
/// - an out-of-range line or a missing match met after the first repetition is reported
///   with the `...OnRepetition` variant carrying the number of completed repetitions.
fn do_csplit<I>(
    split_writer: &mut SplitWriter,
    patterns: Vec<patterns::Pattern>,
    input_iter: &mut InputSplitter<I>,
) -> Result<(), CsplitError>
where
    I: Iterator<Item = (usize, io::Result<String>)>,
{
    for pattern in patterns {
        // textual form of the pattern, used in error messages
        let shown = pattern.to_string();
        let skipping = match pattern {
            patterns::Pattern::SkipToMatch(..) => true,
            _ => false,
        };
        match pattern {
            patterns::Pattern::UpToLine(step, repetitions) => {
                let mut target = step;
                for (_, round) in repetitions.iter() {
                    split_writer.new_writer()?;
                    if let Err(err) = split_writer.do_to_line(&shown, target, input_iter) {
                        return Err(match err {
                            // the failure happened while repeating the pattern
                            CsplitError::LineOutOfRange(_) if round != 1 => {
                                CsplitError::LineOutOfRangeOnRepetition(
                                    shown.to_string(),
                                    round - 1,
                                )
                            }
                            other => other,
                        });
                    }
                    target += step;
                }
            }
            patterns::Pattern::UpToMatch(regex, offset, repetitions)
            | patterns::Pattern::SkipToMatch(regex, offset, repetitions) => {
                for (max, round) in repetitions.iter() {
                    if skipping {
                        // skipped lines do not go to any file
                        split_writer.as_dev_null();
                    } else {
                        split_writer.new_writer()?;
                    }
                    match split_writer.do_to_match(&shown, &regex, offset, input_iter) {
                        // keep on splitting
                        Ok(()) => (),
                        // an unbounded repetition simply ends when nothing matches anymore
                        Err(CsplitError::MatchNotFound(_)) if max.is_none() => return Ok(()),
                        // the failure happened while repeating the pattern
                        Err(CsplitError::MatchNotFound(_)) if max != Some(1) && round != 1 => {
                            return Err(CsplitError::MatchNotFoundOnRepetition(
                                shown.to_string(),
                                round - 1,
                            ));
                        }
                        Err(err) => return Err(err),
                    }
                }
            }
        }
    }
    Ok(())
}
/// Writes portions of the input into splits, whose file names are derived from an
/// incrementing counter.
struct SplitWriter<'a> {
/// the options set through the command line
options: &'a CsplitOptions,
/// the number handed to the name generator for the next split; incremented each time a
/// split is created, decremented when an empty split is elided
counter: usize,
/// the writer to the current split, `None` until the first split is created
current_writer: Option<BufWriter<File>>,
/// the size in bytes of the current split, newline characters included
size: usize,
/// flag to indicate that the lines read must be discarded instead of written to a split
dev_null: bool,
}
impl<'a> Drop for SplitWriter<'a> {
fn drop(&mut self) {
if self.options.elide_empty_files && self.size == 0 {
let file_name = self.options.split_name.get(self.counter);
remove_file(file_name).expect("Failed to elide split");
}
}
}
impl<'a> SplitWriter<'a> {
    /// Creates a writer which has not created any split yet.
    ///
    /// # Errors
    ///
    /// This implementation never fails; the [`io::Result`] is part of the signature the
    /// callers rely on.
    fn new(options: &CsplitOptions) -> io::Result<SplitWriter> {
        Ok(SplitWriter {
            options,
            counter: 0,
            current_writer: None,
            size: 0,
            dev_null: false,
        })
    }

    /// Creates the file of the next split and makes it the current one: the counter is
    /// incremented, the size is reset and lines are no longer discarded.
    ///
    /// # Errors
    ///
    /// The creation of the split file may fail with some [`io::Error`].
    fn new_writer(&mut self) -> io::Result<()> {
        let file_name = self.options.split_name.get(self.counter);
        let file = File::create(&file_name)?;
        self.current_writer = Some(BufWriter::new(file));
        self.counter += 1;
        self.size = 0;
        self.dev_null = false;
        Ok(())
    }

    /// The current split will not keep any of the read input lines.
    fn as_dev_null(&mut self) {
        self.dev_null = true;
    }

    /// Writes the line to the current split, appending a newline character.
    /// If [`dev_null`] is true, then the line is discarded.
    ///
    /// # Errors
    ///
    /// Some [`io::Error`] may occur when attempting to write the line.
    ///
    /// # Panics
    ///
    /// If lines are not discarded and no split was created beforehand.
    fn writeln(&mut self, line: String) -> io::Result<()> {
        if !self.dev_null {
            match self.current_writer {
                Some(ref mut current_writer) => {
                    let bytes = line.as_bytes();
                    current_writer.write_all(bytes)?;
                    // `write` is allowed to write nothing at all; `write_all` guarantees
                    // the newline accounted for in `size` really is written
                    current_writer.write_all(b"\n")?;
                    self.size += bytes.len() + 1;
                }
                None => panic!("trying to write to a split that was not created"),
            }
        }
        Ok(())
    }

    /// Performs some operations after completing a split, unless lines were being
    /// discarded:
    /// - the pending content is flushed to the file;
    /// - if the split is empty and empty files are elided, the counter is decremented so
    ///   that the next split reuses the file name (the file itself is not removed here);
    /// - otherwise the size of the split is printed, unless the quiet option is set.
    ///
    /// # Errors
    ///
    /// Some [`io::Error`] if the buffered content could not be written to the split.
    fn finish_split(&mut self) -> io::Result<()> {
        if !self.dev_null {
            // report write errors now: dropping a `BufWriter` silently ignores them
            if let Some(ref mut current_writer) = self.current_writer {
                current_writer.flush()?;
            }
            if self.options.elide_empty_files && self.size == 0 {
                self.counter -= 1;
            } else if !self.options.quiet {
                println!("{}", self.size);
            }
        }
        Ok(())
    }

    /// Removes all the split files that were created.
    ///
    /// Every file is attempted, even after a failure.
    ///
    /// # Errors
    ///
    /// Returns the last [`io::Error`] met if there was a problem removing a split.
    fn delete_all_splits(&self) -> io::Result<()> {
        let mut ret = Ok(());
        for ith in 0..self.counter {
            let file_name = self.options.split_name.get(ith);
            if let Err(err) = remove_file(file_name) {
                ret = Err(err);
            }
        }
        ret
    }

    /// Split the input stream up to the line number `n`.
    ///
    /// If the line number `n` is smaller than the current position in the input, then an empty
    /// split is created.
    ///
    /// # Errors
    ///
    /// In addition to errors reading/writing from/to a file, if the line number
    /// `n` is greater than the total available lines, then a
    /// [`CsplitError::LineOutOfRange`] error is returned.
    fn do_to_line<I>(
        &mut self,
        pattern_as_str: &str,
        n: usize,
        input_iter: &mut InputSplitter<I>,
    ) -> Result<(), CsplitError>
    where
        I: Iterator<Item = (usize, io::Result<String>)>,
    {
        // lines kept back by the previous pattern are read again first
        input_iter.rewind_buffer();
        input_iter.set_size_of_buffer(1);
        let mut ret = Err(CsplitError::LineOutOfRange(pattern_as_str.to_string()));
        while let Some((ln, line)) = input_iter.next() {
            let l = line?;
            // `ln` is zero-based whereas `n` is one-based
            if ln + 1 > n {
                // already past the requested line: keep the line for the next split
                if input_iter.add_line_to_buffer(ln, l).is_some() {
                    panic!("the buffer is big enough to contain 1 line");
                }
                ret = Ok(());
                break;
            } else if ln + 1 == n {
                // the requested line starts the next split, unless it is suppressed
                if !self.options.suppress_matched {
                    if input_iter.add_line_to_buffer(ln, l).is_some() {
                        panic!("the buffer is big enough to contain 1 line");
                    }
                }
                ret = Ok(());
                break;
            }
            self.writeln(l)?;
        }
        self.finish_split()?;
        ret
    }

    /// Read lines up to the line matching a [`Regex`]. With a non-zero offset,
    /// the block of relevant lines can be extended (if positive), or reduced
    /// (if negative).
    ///
    /// # Errors
    ///
    /// In addition to errors reading/writing from/to a file, the following errors may be returned:
    /// - if no line matched, a [`CsplitError::MatchNotFound`].
    /// - if there are not enough lines to accommodate the offset, a
    ///   [`CsplitError::LineOutOfRange`].
    fn do_to_match<I>(
        &mut self,
        pattern_as_str: &str,
        regex: &Regex,
        mut offset: i32,
        input_iter: &mut InputSplitter<I>,
    ) -> Result<(), CsplitError>
    where
        I: Iterator<Item = (usize, io::Result<String>)>,
    {
        if offset >= 0 {
            // The offset is zero or positive, no need for a buffer on the lines read.
            // NOTE: drain the buffer of input_iter, no match should be done within.
            for line in input_iter.drain_buffer() {
                self.writeln(line)?;
            }
            // retain the matching line
            input_iter.set_size_of_buffer(1);
            while let Some((ln, line)) = input_iter.next() {
                let l = line?;
                if regex.is_match(&l) {
                    match (self.options.suppress_matched, offset) {
                        // no offset, add the line to the next split
                        (false, 0) => {
                            if input_iter.add_line_to_buffer(ln, l).is_some() {
                                panic!("the buffer is big enough to contain 1 line");
                            }
                        }
                        // a positive offset, some more lines need to be added to the current split
                        (false, _) => self.writeln(l)?,
                        _ => (),
                    };
                    offset -= 1;
                    // write the extra lines required by the offset
                    while offset > 0 {
                        match input_iter.next() {
                            Some((_, line)) => {
                                self.writeln(line?)?;
                            }
                            None => {
                                self.finish_split()?;
                                return Err(CsplitError::LineOutOfRange(
                                    pattern_as_str.to_string(),
                                ));
                            }
                        };
                        offset -= 1;
                    }
                    self.finish_split()?;
                    return Ok(());
                }
                self.writeln(l)?;
            }
        } else {
            // With a negative offset we use a buffer to keep the lines within the offset.
            // NOTE: do not drain the buffer of input_iter, in case of a LineOutOfRange error
            // but do not rewind it either since no match should be done within.
            // The consequence is that the buffer may already be full with lines from a previous
            // split, which is taken care of when calling `shrink_buffer_to_size`.
            let offset_usize = -offset as usize;
            input_iter.set_size_of_buffer(offset_usize);
            while let Some((ln, line)) = input_iter.next() {
                let l = line?;
                if regex.is_match(&l) {
                    for line in input_iter.shrink_buffer_to_size() {
                        self.writeln(line)?;
                    }
                    if !self.options.suppress_matched {
                        // add 1 to the buffer size to make place for the matched line
                        input_iter.set_size_of_buffer(offset_usize + 1);
                        if input_iter.add_line_to_buffer(ln, l).is_some() {
                            panic!("should be big enough to hold every lines");
                        }
                    }
                    self.finish_split()?;
                    if input_iter.buffer_len() < offset_usize {
                        return Err(CsplitError::LineOutOfRange(pattern_as_str.to_string()));
                    }
                    return Ok(());
                }
                if let Some(line) = input_iter.add_line_to_buffer(ln, l) {
                    self.writeln(line)?;
                }
            }
            // no match, drain the buffer into the current split
            for line in input_iter.drain_buffer() {
                self.writeln(line)?;
            }
        }
        self.finish_split()?;
        Err(CsplitError::MatchNotFound(pattern_as_str.to_string()))
    }
}
/// An iterator which can output items from a buffer filled externally.
/// This is used to pass matching lines to the next split and to support patterns with a negative offset.
struct InputSplitter<I>
where
    I: Iterator<Item = (usize, io::Result<String>)>,
{
    /// the wrapped iterator over the numbered input lines
    iter: I,
    /// the lines kept back, oldest first; a double-ended queue because lines are taken
    /// from, and put back at, its front
    buffer: std::collections::VecDeque<<I as Iterator>::Item>,
    /// the number of elements the buffer may hold
    size: usize,
    /// flag to indicate content off the buffer should be returned instead of off the wrapped
    /// iterator
    rewind: bool,
}

impl<I> InputSplitter<I>
where
    I: Iterator<Item = (usize, io::Result<String>)>,
{
    /// Wraps `iter` with an empty buffer that may hold one line.
    fn new(iter: I) -> InputSplitter<I> {
        InputSplitter {
            iter,
            buffer: std::collections::VecDeque::new(),
            rewind: false,
            size: 1,
        }
    }

    /// Rewind the iteration by outputting the buffer's content.
    fn rewind_buffer(&mut self) {
        self.rewind = true;
    }

    /// Shrink the buffer so that its length is equal to the set size, returning an iterator for
    /// the elements that were too much, oldest first.
    fn shrink_buffer_to_size<'a>(&'a mut self) -> impl Iterator<Item = String> + 'a {
        let shrink_offset = self.buffer.len().saturating_sub(self.size);
        self.buffer
            .drain(..shrink_offset)
            .map(|(_, line)| line.expect("only successfully read lines are buffered"))
    }

    /// Drain the content of the buffer, oldest line first.
    fn drain_buffer<'a>(&'a mut self) -> impl Iterator<Item = String> + 'a {
        self.buffer
            .drain(..)
            .map(|(_, line)| line.expect("only successfully read lines are buffered"))
    }

    /// Set the maximum number of lines to keep.
    fn set_size_of_buffer(&mut self, size: usize) {
        self.size = size;
    }

    /// Add a line to the buffer. If the buffer has [`size`] elements, then its head is removed and
    /// the new line is pushed to the buffer. The removed head is then available in the returned
    /// option.
    ///
    /// While rewinding, the line is instead put back at the front of the buffer, whatever
    /// the size of the buffer, so that it is the next line returned.
    fn add_line_to_buffer(&mut self, ln: usize, line: String) -> Option<String> {
        if self.rewind {
            self.buffer.push_front((ln, Ok(line)));
            None
        } else if self.buffer.len() >= self.size {
            let head = self.buffer.pop_front();
            self.buffer.push_back((ln, Ok(line)));
            head.map(|(_, head_line)| {
                head_line.expect("only successfully read lines are buffered")
            })
        } else {
            self.buffer.push_back((ln, Ok(line)));
            None
        }
    }

    /// Returns the number of lines stored in the buffer
    fn buffer_len(&self) -> usize {
        self.buffer.len()
    }
}

impl<I> Iterator for InputSplitter<I>
where
    I: Iterator<Item = (usize, io::Result<String>)>,
{
    type Item = <I as Iterator>::Item;

    /// Returns the buffered lines first while rewinding, then the lines of the wrapped
    /// iterator. The rewind ends as soon as the buffer is found empty.
    fn next(&mut self) -> Option<Self::Item> {
        if self.rewind {
            if let Some(buffered) = self.buffer.pop_front() {
                return Some(buffered);
            }
            self.rewind = false;
        }
        self.iter.next()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a splitter over the lines "aaa", "bbb", "ccc" and "ddd".
    fn four_lines() -> InputSplitter<impl Iterator<Item = (usize, io::Result<String>)>> {
        let input: Vec<io::Result<String>> = vec![
            Ok(String::from("aaa")),
            Ok(String::from("bbb")),
            Ok(String::from("ccc")),
            Ok(String::from("ddd")),
        ];
        InputSplitter::new(input.into_iter().enumerate())
    }

    /// Reads the next item, checks its line number and its content, and returns the line.
    fn next_line<I>(splitter: &mut InputSplitter<I>, ln: usize, content: &str) -> String
    where
        I: Iterator<Item = (usize, io::Result<String>)>,
    {
        match splitter.next() {
            Some((number, Ok(line))) => {
                assert_eq!(number, ln);
                assert_eq!(line, String::from(content));
                line
            }
            item => panic!("wrong item: {:?}", item),
        }
    }

    #[test]
    fn input_splitter() {
        let mut input_splitter = four_lines();
        input_splitter.set_size_of_buffer(2);
        assert_eq!(input_splitter.buffer_len(), 0);

        let line = next_line(&mut input_splitter, 0, "aaa");
        assert_eq!(input_splitter.add_line_to_buffer(0, line), None);
        assert_eq!(input_splitter.buffer_len(), 1);

        let line = next_line(&mut input_splitter, 1, "bbb");
        assert_eq!(input_splitter.add_line_to_buffer(1, line), None);
        assert_eq!(input_splitter.buffer_len(), 2);

        // the buffer is full: the oldest line is handed back
        let line = next_line(&mut input_splitter, 2, "ccc");
        assert_eq!(
            input_splitter.add_line_to_buffer(2, line),
            Some(String::from("aaa"))
        );
        assert_eq!(input_splitter.buffer_len(), 2);

        input_splitter.rewind_buffer();

        next_line(&mut input_splitter, 1, "bbb");
        assert_eq!(input_splitter.buffer_len(), 1);

        next_line(&mut input_splitter, 2, "ccc");
        assert_eq!(input_splitter.buffer_len(), 0);

        next_line(&mut input_splitter, 3, "ddd");
        assert_eq!(input_splitter.buffer_len(), 0);

        assert!(input_splitter.next().is_none());
    }

    #[test]
    fn input_splitter_interrupt_rewind() {
        let mut input_splitter = four_lines();
        input_splitter.set_size_of_buffer(3);
        assert_eq!(input_splitter.buffer_len(), 0);

        let line = next_line(&mut input_splitter, 0, "aaa");
        assert_eq!(input_splitter.add_line_to_buffer(0, line), None);
        assert_eq!(input_splitter.buffer_len(), 1);

        let line = next_line(&mut input_splitter, 1, "bbb");
        assert_eq!(input_splitter.add_line_to_buffer(1, line), None);
        assert_eq!(input_splitter.buffer_len(), 2);

        let line = next_line(&mut input_splitter, 2, "ccc");
        assert_eq!(input_splitter.add_line_to_buffer(2, line), None);
        assert_eq!(input_splitter.buffer_len(), 3);

        input_splitter.rewind_buffer();

        // a line put back while rewinding is the next one to be read
        let line = next_line(&mut input_splitter, 0, "aaa");
        assert_eq!(input_splitter.add_line_to_buffer(0, line), None);
        assert_eq!(input_splitter.buffer_len(), 3);

        next_line(&mut input_splitter, 0, "aaa");
        assert_eq!(input_splitter.buffer_len(), 2);

        next_line(&mut input_splitter, 1, "bbb");
        assert_eq!(input_splitter.buffer_len(), 1);

        next_line(&mut input_splitter, 2, "ccc");
        assert_eq!(input_splitter.buffer_len(), 0);

        next_line(&mut input_splitter, 3, "ddd");
        assert_eq!(input_splitter.buffer_len(), 0);

        assert!(input_splitter.next().is_none());
    }
}
/// Entry point of the `csplit` utility.
///
/// Parses the command line, requires a FILE operand followed by at least one PATTERN, and
/// runs [`csplit`] on the file, or on the standard input when FILE is `-`. A FILE that is
/// not a regular file is rejected.
///
/// Returns 0 when the end of the function is reached. Every failure is routed through the
/// `exit!`, `crash!`, `crash_if_err!` and `return_if_err!` macros with the code 1
/// (NOTE(review): their exact behavior is defined in uucore, not in this file).
pub fn uumain(args: impl uucore::Args) -> i32 {
let args = args.collect_str();
let matches = app!(SYNTAX, SUMMARY, LONG_HELP)
.optopt(
"b",
SUFFIX_FORMAT_OPT,
"use sprintf FORMAT instead of %02d",
"FORMAT",
)
.optopt("f", PREFIX_OPT, "use PREFIX instead of 'xx'", "PREFIX")
.optflag("k", KEEP_FILES_OPT, "do not remove output files on errors")
.optflag(
"",
SUPPRESS_MATCHED_OPT,
"suppress the lines matching PATTERN",
)
.optopt(
"n",
DIGITS_OPT,
"use specified number of digits instead of 2",
"DIGITS",
)
.optflag("s", QUIET_OPT, "do not print counts of output file sizes")
.optflag("z", ELIDE_EMPTY_FILES_OPT, "remove empty output files")
.parse(args);
// check for mandatory arguments: the first free argument is the FILE, the following ones
// are the PATTERNs
if matches.free.is_empty() {
show_error!("missing operand");
exit!(1);
}
if matches.free.len() == 1 {
show_error!("missing operand after '{}'", matches.free[0]);
exit!(1);
}
// get the patterns to split on
let patterns = return_if_err!(1, patterns::get_patterns(&matches.free[1..]));
// get the file to split
let file_name: &str = &matches.free[0];
let options = CsplitOptions::new(&matches);
if file_name == "-" {
// `-` stands for the standard input, which is locked for the whole split
let stdin = io::stdin();
crash_if_err!(1, csplit(&options, patterns, stdin.lock()));
} else {
let file = return_if_err!(1, File::open(file_name));
let file_metadata = return_if_err!(1, file.metadata());
// only regular files are split
if !file_metadata.is_file() {
crash!(1, "'{}' is not a regular file", file_name);
}
crash_if_err!(1, csplit(&options, patterns, BufReader::new(file)));
};
0
}

View file

@ -0,0 +1,34 @@
use std::io;
/// Errors thrown by the csplit command
///
/// The message of each variant is given by its `#[fail(display = ...)]` attribute.
#[derive(Debug, Fail)]
pub enum CsplitError {
/// An error met while reading the input or while writing or removing a split.
#[fail(display = "IO error: {}", _0)]
IoError(io::Error),
/// The input ended before the requested line was reached; holds the pattern as text.
#[fail(display = "'{}': line number out of range", _0)]
LineOutOfRange(String),
/// Same as `LineOutOfRange`, but met while repeating the pattern; holds the pattern as
/// text and the number of the repetition.
#[fail(display = "'{}': line number out of range on repetition {}", _0, _1)]
LineOutOfRangeOnRepetition(String, usize),
/// No line matched the regular expression; holds the pattern as text.
#[fail(display = "'{}': match not found", _0)]
MatchNotFound(String),
/// Same as `MatchNotFound`, but met while repeating the pattern; holds the pattern as
/// text and the number of the repetition.
#[fail(display = "'{}': match not found on repetition {}", _0, _1)]
MatchNotFoundOnRepetition(String, usize),
/// A line-number pattern was zero.
#[fail(display = "line number must be greater than zero")]
LineNumberIsZero,
/// A line-number pattern is smaller than a preceding one; holds the offending line number
/// followed by the preceding one.
#[fail(display = "line number '{}' is smaller than preceding line number, {}", _0, _1)]
LineNumberSmallerThanPrevious(usize, usize),
/// A pattern given on the command line could not be parsed; holds the argument.
#[fail(display = "invalid pattern: {}", _0)]
InvalidPattern(String),
/// A number could not be parsed; holds the text of the number.
#[fail(display = "invalid number: '{}'", _0)]
InvalidNumber(String),
/// The suffix format holds an incorrect conversion specification.
#[fail(display = "incorrect conversion specification in suffix")]
SuffixFormatIncorrect,
/// The suffix format holds more than one `%` conversion specification.
#[fail(display = "too many % conversion specifications in suffix")]
SuffixFormatTooManyPercents,
}
impl From<io::Error> for CsplitError {
fn from(error: io::Error) -> Self {
CsplitError::IoError(error)
}
}

View file

@ -0,0 +1,2 @@
// NOTE(review): presumably expands to the `main` of the binary, delegating to
// `uu_csplit::uumain` — confirm against the `uucore_procs` crate.
uucore_procs::main!(uu_csplit); // spell-checker:ignore procs uucore

View file

@ -0,0 +1,353 @@
use regex::Regex;
use crate::csplitError::CsplitError;
/// The definition of a pattern to match on a line.
///
/// On the command line, a pattern is either an integer, `/REGEX/[OFFSET]` or
/// `%REGEX%[OFFSET]`, where the optional OFFSET is a signed integer.
#[derive(Debug)]
pub enum Pattern {
/// Copy the file's content to a split up to, not including, the given line number. The number
/// of times the pattern is executed is detailed in [`ExecutePattern`].
UpToLine(usize, ExecutePattern),
/// Copy the file's content to a split up to, not including, the line matching the regex. The
/// integer is an offset relative to the matched line of what to include (if positive) or
/// to exclude (if negative). The number of times the pattern is executed is detailed in
/// [`ExecutePattern`].
UpToMatch(Regex, i32, ExecutePattern),
/// Skip the file's content up to, not including, the line matching the regex: the skipped
/// lines are not written to any split. The integer is an offset relative to the matched line
/// of what to include (if positive) or to exclude (if negative). The number of times the
/// pattern is executed is detailed in [`ExecutePattern`].
SkipToMatch(Regex, i32, ExecutePattern),
}
/// Formats the pattern the way it is written on the command line, without its repetition:
/// the line number, `/regex/` or `%regex%`, followed by the signed offset when it is not
/// zero.
///
/// `Display` is implemented rather than `ToString`: `to_string()` remains available through
/// the blanket implementation of the standard library.
impl std::fmt::Display for Pattern {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Pattern::UpToLine(n, _) => write!(f, "{}", n),
            Pattern::UpToMatch(regex, 0, _) => write!(f, "/{}/", regex.as_str()),
            Pattern::UpToMatch(regex, offset, _) => {
                write!(f, "/{}/{:+}", regex.as_str(), offset)
            }
            Pattern::SkipToMatch(regex, 0, _) => write!(f, "%{}%", regex.as_str()),
            Pattern::SkipToMatch(regex, offset, _) => {
                write!(f, "%{}%{:+}", regex.as_str(), offset)
            }
        }
    }
}
/// The number of times a pattern can be used.
///
/// On the command line, this is the optional argument following a pattern: `{*}` or
/// `{N}`, the latter meaning N times in addition to the first one.
#[derive(Debug)]
pub enum ExecutePattern {
/// Execute the pattern as many times as possible
Always,
/// Execute the pattern a fixed number of times
Times(usize),
}
impl ExecutePattern {
    /// Returns an iterator yielding one item per execution of the pattern: bounded for
    /// [`ExecutePattern::Times`], endless for [`ExecutePattern::Always`].
    pub fn iter(&self) -> ExecutePatternIter {
        let limit = match *self {
            ExecutePattern::Always => None,
            ExecutePattern::Times(n) => Some(n),
        };
        ExecutePatternIter::new(limit)
    }
}
/// Counts the executions of a pattern, starting at 1.
///
/// Each item is made of the maximum number of executions, if any, and of the number of the
/// current execution.
pub struct ExecutePatternIter {
    /// the maximum number of executions, `None` when there is no limit
    max: Option<usize>,
    /// the number of executions yielded so far
    cur: usize,
}

impl ExecutePatternIter {
    /// Creates an iterator which has not yielded any execution yet.
    fn new(max: Option<usize>) -> ExecutePatternIter {
        Self { max, cur: 0 }
    }
}

impl Iterator for ExecutePatternIter {
    type Item = (Option<usize>, usize);

    fn next(&mut self) -> Option<Self::Item> {
        // the limit, when there is one, has been reached
        if self.max == Some(self.cur) {
            return None;
        }
        self.cur += 1;
        Some((self.max, self.cur))
    }
}
/// Turns the pattern arguments of the command line into a list of [`Pattern`]s, and checks
/// that the line numbers they hold are consistent.
///
/// # Errors
///
/// - [`CsplitError::InvalidPattern`] if an argument is not a valid pattern, e.g., because
///   of an invalid regular expression.
/// - The errors of the line number validation: a line number that is zero, or smaller
///   than a preceding one.
pub fn get_patterns(args: &[String]) -> Result<Vec<Pattern>, CsplitError> {
    extract_patterns(args)
        .and_then(|parsed| validate_line_numbers(&parsed).map(|()| parsed))
}
/// Parses the arguments into [`Pattern`]s.
///
/// An argument is either a line number, `/REGEX/[OFFSET]` or `%REGEX%[OFFSET]`. It may be
/// followed by a repetition argument, `{N}` or `{*}`, which is consumed along with it.
///
/// # Errors
///
/// - [`CsplitError::InvalidPattern`] if an argument is none of the above, or if its regular
///   expression is invalid.
/// - [`CsplitError::InvalidNumber`] if a repetition count or an offset is too large to be
///   represented.
fn extract_patterns(args: &[String]) -> Result<Vec<Pattern>, CsplitError> {
    let mut patterns = Vec::with_capacity(args.len());
    let to_match_reg =
        Regex::new(r"^(/(?P<UPTO>.+)/|%(?P<SKIPTO>.+)%)(?P<OFFSET>[\+-]\d+)?$").unwrap();
    // The alternation is grouped so that the anchors and the braces apply to both forms:
    // ungrouped, `{5junk` and `junk*}` would also be taken for repetitions.
    let execute_ntimes_reg = Regex::new(r"^\{(?:(?P<TIMES>\d+)|\*)\}$").unwrap();
    let mut iter = args.iter().peekable();

    while let Some(arg) = iter.next() {
        // get the number of times a pattern is repeated, which is at least once plus whatever is
        // in the quantifier.
        let execute_ntimes = match iter.peek() {
            None => ExecutePattern::Times(1),
            Some(&next_item) => match execute_ntimes_reg.captures(next_item) {
                None => ExecutePattern::Times(1),
                Some(r) => {
                    // skip the next item
                    iter.next();
                    match r.name("TIMES") {
                        Some(times) => {
                            // the digits come from the user: they may not fit in a usize
                            let total = times
                                .as_str()
                                .parse::<usize>()
                                .ok()
                                .and_then(|n| n.checked_add(1))
                                .ok_or_else(|| {
                                    CsplitError::InvalidNumber(times.as_str().to_string())
                                })?;
                            ExecutePattern::Times(total)
                        }
                        None => ExecutePattern::Always,
                    }
                }
            },
        };

        // get the pattern definition
        if let Some(captures) = to_match_reg.captures(arg) {
            let offset = match captures.name("OFFSET") {
                None => 0,
                // the digits come from the user: they may not fit in an i32
                Some(m) => m
                    .as_str()
                    .parse::<i32>()
                    .map_err(|_| CsplitError::InvalidNumber(m.as_str().to_string()))?,
            };
            if let Some(up_to_match) = captures.name("UPTO") {
                let pattern = Regex::new(up_to_match.as_str())
                    .map_err(|_| CsplitError::InvalidPattern(arg.to_string()))?;
                patterns.push(Pattern::UpToMatch(pattern, offset, execute_ntimes));
            } else if let Some(skip_to_match) = captures.name("SKIPTO") {
                let pattern = Regex::new(skip_to_match.as_str())
                    .map_err(|_| CsplitError::InvalidPattern(arg.to_string()))?;
                patterns.push(Pattern::SkipToMatch(pattern, offset, execute_ntimes));
            }
        } else if let Ok(line_number) = arg.parse::<usize>() {
            patterns.push(Pattern::UpToLine(line_number, execute_ntimes));
        } else {
            return Err(CsplitError::InvalidPattern(arg.to_string()));
        }
    }
    Ok(patterns)
}
/// Checks that the line numbers are not zero and do not decrease from one line-number
/// pattern to the next one. The other kinds of patterns are ignored.
///
/// A line number equal to the preceding one is accepted, with a warning.
///
/// # Errors
///
/// - [`CsplitError::LineNumberIsZero`] if a line number is zero.
/// - [`CsplitError::LineNumberSmallerThanPrevious`] if a line number is smaller than the
///   preceding one.
fn validate_line_numbers(patterns: &[Pattern]) -> Result<(), CsplitError> {
    let mut previous = 0;
    for pattern in patterns {
        let current = match pattern {
            Pattern::UpToLine(line_number, _) => *line_number,
            _ => continue,
        };
        if current == 0 {
            // a line number cannot be zero
            return Err(CsplitError::LineNumberIsZero);
        }
        if previous == current {
            // two consecutive numbers should not be equal
            show_warning!(
                "line number '{}' is the same as preceding line number",
                previous
            );
        } else if previous > current {
            // a number cannot be greater than the one that follows
            return Err(CsplitError::LineNumberSmallerThanPrevious(
                current, previous,
            ));
        } else {
            previous = current;
        }
    }
    Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn bad_pattern() {
let input = vec!["bad".to_string()];
assert!(get_patterns(input.as_slice()).is_err());
}
#[test]
fn up_to_line_pattern() {
let input: Vec<String> = vec!["24", "42", "{*}", "50", "{4}"]
.into_iter()
.map(|v| v.to_string())
.collect();
let patterns = get_patterns(input.as_slice()).unwrap();
assert_eq!(patterns.len(), 3);
match patterns.get(0) {
Some(Pattern::UpToLine(24, ExecutePattern::Times(1))) => (),
_ => panic!("expected UpToLine pattern"),
};
match patterns.get(1) {
Some(Pattern::UpToLine(42, ExecutePattern::Always)) => (),
_ => panic!("expected UpToLine pattern"),
};
match patterns.get(2) {
Some(Pattern::UpToLine(50, ExecutePattern::Times(5))) => (),
_ => panic!("expected UpToLine pattern"),
};
}
#[test]
fn up_to_match_pattern() {
let input: Vec<String> = vec![
"/test1.*end$/",
"/test2.*end$/",
"{*}",
"/test3.*end$/",
"{4}",
"/test4.*end$/+3",
"/test5.*end$/-3",
].into_iter()
.map(|v| v.to_string())
.collect();
let patterns = get_patterns(input.as_slice()).unwrap();
assert_eq!(patterns.len(), 5);
match patterns.get(0) {
Some(Pattern::UpToMatch(reg, 0, ExecutePattern::Times(1))) => {
let parsed_reg = format!("{}", reg);
assert_eq!(parsed_reg, "test1.*end$");
}
_ => panic!("expected UpToMatch pattern"),
};
match patterns.get(1) {
Some(Pattern::UpToMatch(reg, 0, ExecutePattern::Always)) => {
let parsed_reg = format!("{}", reg);
assert_eq!(parsed_reg, "test2.*end$");
}
_ => panic!("expected UpToMatch pattern"),
};
match patterns.get(2) {
Some(Pattern::UpToMatch(reg, 0, ExecutePattern::Times(5))) => {
let parsed_reg = format!("{}", reg);
assert_eq!(parsed_reg, "test3.*end$");
}
_ => panic!("expected UpToMatch pattern"),
};
match patterns.get(3) {
Some(Pattern::UpToMatch(reg, 3, ExecutePattern::Times(1))) => {
let parsed_reg = format!("{}", reg);
assert_eq!(parsed_reg, "test4.*end$");
}
_ => panic!("expected UpToMatch pattern"),
};
match patterns.get(4) {
Some(Pattern::UpToMatch(reg, -3, ExecutePattern::Times(1))) => {
let parsed_reg = format!("{}", reg);
assert_eq!(parsed_reg, "test5.*end$");
}
_ => panic!("expected UpToMatch pattern"),
};
}
/// Checks that `%regex%` arguments, optionally followed by a `{*}` / `{N}` repetition or carrying
/// a `+N` / `-N` offset, are parsed into the corresponding `Pattern::SkipToMatch` values.
#[test]
fn skip_to_match_pattern() {
    let args: Vec<String> = [
        "%test1.*end$%",
        "%test2.*end$%",
        "{*}",
        "%test3.*end$%",
        "{4}",
        "%test4.*end$%+3",
        "%test5.*end$%-3",
    ]
    .iter()
    .map(|arg| arg.to_string())
    .collect();
    let patterns = get_patterns(args.as_slice()).unwrap();
    // Repetition arguments attach to the preceding pattern, so 7 arguments yield 5 patterns.
    assert_eq!(patterns.len(), 5);

    // Asserts that the pattern at `$index` is a `SkipToMatch` with the given offset and
    // repetition, and that its regex prints back as `$regex`.
    macro_rules! assert_skip_to_match {
        ($index:expr, $offset:pat, $repeat:pat, $regex:expr) => {
            match patterns.get($index) {
                Some(Pattern::SkipToMatch(regex, $offset, $repeat)) => {
                    assert_eq!(regex.to_string(), $regex)
                }
                _ => panic!("expected SkipToMatch pattern"),
            }
        };
    }

    assert_skip_to_match!(0, 0, ExecutePattern::Times(1), "test1.*end$");
    assert_skip_to_match!(1, 0, ExecutePattern::Always, "test2.*end$");
    assert_skip_to_match!(2, 0, ExecutePattern::Times(5), "test3.*end$");
    assert_skip_to_match!(3, 3, ExecutePattern::Times(1), "test4.*end$");
    assert_skip_to_match!(4, -3, ExecutePattern::Times(1), "test5.*end$");
}
/// A line-number pattern of `0` must be rejected by `validate_line_numbers`.
#[test]
fn line_number_zero() {
    let patterns = vec![Pattern::UpToLine(0, ExecutePattern::Times(1))];
    match validate_line_numbers(&patterns) {
        // `crate::` instead of a leading `::`: in the 2018 edition `::name` only resolves to
        // external crates, never to items at the crate root.
        Err(crate::CsplitError::LineNumberIsZero) => (),
        _ => panic!("expected LineNumberIsZero error"),
    }
}
/// Two consecutive line numbers in decreasing order (`10` then `5`) must be rejected, and the
/// error must carry the offending number followed by the previous one.
#[test]
fn line_number_smaller_than_previous() {
    let input: Vec<String> = vec!["10".to_string(), "5".to_string()];
    match get_patterns(input.as_slice()) {
        // `crate::` instead of a leading `::`: in the 2018 edition `::name` only resolves to
        // external crates, never to items at the crate root.
        Err(crate::CsplitError::LineNumberSmallerThanPrevious(5, 10)) => (),
        _ => panic!("expected LineNumberSmallerThanPrevious error"),
    }
}
/// Decreasing line numbers must still be rejected when a regex pattern (`/20/`) sits between
/// them: `5` is compared against the previous line number `10`.
#[test]
fn line_number_smaller_than_previous_separate() {
    let input: Vec<String> = vec!["10".to_string(), "/20/".to_string(), "5".to_string()];
    match get_patterns(input.as_slice()) {
        // `crate::` instead of a leading `::`: in the 2018 edition `::name` only resolves to
        // external crates, never to items at the crate root.
        Err(crate::CsplitError::LineNumberSmallerThanPrevious(5, 10)) => (),
        _ => panic!("expected LineNumberSmallerThanPrevious error"),
    }
}
/// A line number of `0` must be rejected even when it follows other valid patterns
/// (a line number and a regex).
#[test]
fn line_number_zero_separate() {
    let input: Vec<String> = vec!["10".to_string(), "/20/".to_string(), "0".to_string()];
    match get_patterns(input.as_slice()) {
        // `crate::` instead of a leading `::`: in the 2018 edition `::name` only resolves to
        // external crates, never to items at the crate root.
        Err(crate::CsplitError::LineNumberIsZero) => (),
        _ => panic!("expected LineNumberIsZero error"),
    }
}
}

View file

@ -0,0 +1,397 @@
use regex::Regex;
//mod csplit;
use crate::CsplitError;
/// Computes the filename of a split, taking into consideration a possible user-defined suffix
/// format.
pub struct SplitName {
fn_split_name: Box<dyn Fn(usize) -> String>,
}
impl SplitName {
/// Creates a new SplitName with the given user-defined options:
/// - `prefix_opt` specifies a prefix for all splits.
/// - `format_opt` specifies a custom format for the suffix part of the filename, using the
/// `sprintf` format notation.
/// - `n_digits_opt` defines the width of the split number.
///
/// # Caveats
///
/// If `prefix_opt` and `format_opt` are defined, and the `format_opt` has some string appearing
/// before the conversion pattern (e.g., "here-%05d"), then it is appended to the passed prefix
/// via `prefix_opt`.
///
/// If `n_digits_opt` and `format_opt` are defined, then width defined in `format_opt` is
/// taken.
pub fn new(
prefix_opt: Option<String>,
format_opt: Option<String>,
n_digits_opt: Option<String>,
) -> Result<SplitName, CsplitError> {
// get the prefix
let prefix = prefix_opt.unwrap_or("xx".to_string());
// the width for the split offset
let n_digits = match n_digits_opt {
None => 2,
Some(opt) => match opt.parse::<usize>() {
Ok(digits) => digits,
Err(_) => return Err(CsplitError::InvalidNumber(opt)),
},
};
// translate the custom format into a function
let fn_split_name: Box<dyn Fn(usize) -> String> = match format_opt {
None => Box::new(move |n: usize| -> String {
format!("{}{:0width$}", prefix, n, width = n_digits)
}),
Some(custom) => {
let spec = Regex::new(
r"(?P<ALL>%(?P<FLAG>[0#-])(?P<WIDTH>\d+)?(?P<TYPE>[diuoxX]))",
).unwrap();
let mut captures_iter = spec.captures_iter(&custom);
let custom_fn: Box<dyn Fn(usize) -> String> = match captures_iter.next() {
Some(captures) => {
let all = captures.name("ALL").unwrap();
let before = custom[0..all.start()].to_owned();
let after = custom[all.end()..].to_owned();
let n_digits = match captures.name("WIDTH") {
None => 0,
Some(m) => m.as_str().parse::<usize>().unwrap(),
};
match (captures.name("FLAG"), captures.name("TYPE")) {
(Some(ref f), Some(ref t)) => {
match (f.as_str(), t.as_str()) {
/*
* zero padding
*/
// decimal
("0", "d") | ("0", "i") | ("0", "u") => {
Box::new(move |n: usize| -> String {
format!(
"{}{}{:0width$}{}",
prefix,
before,
n,
after,
width = n_digits
)
})
}
// octal
("0", "o") => Box::new(move |n: usize| -> String {
format!(
"{}{}{:0width$o}{}",
prefix,
before,
n,
after,
width = n_digits
)
}),
// lower hexadecimal
("0", "x") => Box::new(move |n: usize| -> String {
format!(
"{}{}{:0width$x}{}",
prefix,
before,
n,
after,
width = n_digits
)
}),
// upper hexadecimal
("0", "X") => Box::new(move |n: usize| -> String {
format!(
"{}{}{:0width$X}{}",
prefix,
before,
n,
after,
width = n_digits
)
}),
/*
* Alternate form
*/
// octal
("#", "o") => Box::new(move |n: usize| -> String {
format!(
"{}{}{:>#width$o}{}",
prefix,
before,
n,
after,
width = n_digits
)
}),
// lower hexadecimal
("#", "x") => Box::new(move |n: usize| -> String {
format!(
"{}{}{:>#width$x}{}",
prefix,
before,
n,
after,
width = n_digits
)
}),
// upper hexadecimal
("#", "X") => Box::new(move |n: usize| -> String {
format!(
"{}{}{:>#width$X}{}",
prefix,
before,
n,
after,
width = n_digits
)
}),
/*
* Left adjusted
*/
// decimal
("-", "d") | ("-", "i") | ("-", "u") => {
Box::new(move |n: usize| -> String {
format!(
"{}{}{:<#width$}{}",
prefix,
before,
n,
after,
width = n_digits
)
})
}
// octal
("-", "o") => Box::new(move |n: usize| -> String {
format!(
"{}{}{:<#width$o}{}",
prefix,
before,
n,
after,
width = n_digits
)
}),
// lower hexadecimal
("-", "x") => Box::new(move |n: usize| -> String {
format!(
"{}{}{:<#width$x}{}",
prefix,
before,
n,
after,
width = n_digits
)
}),
// upper hexadecimal
("-", "X") => Box::new(move |n: usize| -> String {
format!(
"{}{}{:<#width$X}{}",
prefix,
before,
n,
after,
width = n_digits
)
}),
_ => return Err(CsplitError::SuffixFormatIncorrect),
}
}
_ => return Err(CsplitError::SuffixFormatIncorrect),
}
}
None => return Err(CsplitError::SuffixFormatIncorrect),
};
// there cannot be more than one format pattern
if captures_iter.next().is_some() {
return Err(CsplitError::SuffixFormatTooManyPercents);
}
custom_fn
}
};
Ok(SplitName { fn_split_name })
}
/// Returns the filename of the i-th split.
pub fn get(&self, n: usize) -> String {
(self.fn_split_name)(n)
}
}
/// Unit tests for `SplitName`: option validation, the default formatter, and every supported
/// flag/type combination of the custom suffix format.
#[cfg(test)]
mod tests {
    use super::*;

    /// A non-numeric digit count is reported as `InvalidNumber`.
    #[test]
    fn invalid_number() {
        let split_name = SplitName::new(None, None, Some(String::from("bad")));
        match split_name {
            Err(CsplitError::InvalidNumber(_)) => (),
            _ => panic!("should fail with InvalidNumber"),
        };
    }

    /// A suffix format without any conversion is rejected.
    #[test]
    fn invalid_suffix_format1() {
        let split_name = SplitName::new(None, Some(String::from("no conversion string")), None);
        match split_name {
            Err(CsplitError::SuffixFormatIncorrect) => (),
            _ => panic!("should fail with SuffixFormatIncorrect"),
        };
    }

    /// A conversion with an unsupported type (`a`) is rejected.
    #[test]
    fn invalid_suffix_format2() {
        let split_name = SplitName::new(None, Some(String::from("%042a")), None);
        match split_name {
            Err(CsplitError::SuffixFormatIncorrect) => (),
            _ => panic!("should fail with SuffixFormatIncorrect"),
        };
    }

    #[test]
    fn default_formatter() {
        let split_name = SplitName::new(None, None, None).unwrap();
        assert_eq!(split_name.get(2), "xx02");
    }

    #[test]
    fn default_formatter_with_prefix() {
        let split_name = SplitName::new(Some(String::from("aaa")), None, None).unwrap();
        assert_eq!(split_name.get(2), "aaa02");
    }

    #[test]
    fn default_formatter_with_width() {
        let split_name = SplitName::new(None, None, Some(String::from("5"))).unwrap();
        assert_eq!(split_name.get(2), "xx00002");
    }

    #[test]
    fn zero_padding_decimal1() {
        let split_name = SplitName::new(None, Some(String::from("cst-%03d-")), None).unwrap();
        assert_eq!(split_name.get(2), "xxcst-002-");
    }

    #[test]
    fn zero_padding_decimal2() {
        let split_name = SplitName::new(
            Some(String::from("pre-")),
            Some(String::from("cst-%03d-post")),
            None,
        )
        .unwrap();
        assert_eq!(split_name.get(2), "pre-cst-002-post");
    }

    /// The width given in the format takes precedence over the digit count option.
    #[test]
    fn zero_padding_decimal3() {
        let split_name = SplitName::new(
            None,
            Some(String::from("cst-%03d-")),
            Some(String::from("42")),
        )
        .unwrap();
        assert_eq!(split_name.get(2), "xxcst-002-");
    }

    #[test]
    fn zero_padding_decimal4() {
        let split_name = SplitName::new(None, Some(String::from("cst-%03i-")), None).unwrap();
        assert_eq!(split_name.get(2), "xxcst-002-");
    }

    #[test]
    fn zero_padding_decimal5() {
        let split_name = SplitName::new(None, Some(String::from("cst-%03u-")), None).unwrap();
        assert_eq!(split_name.get(2), "xxcst-002-");
    }

    #[test]
    fn zero_padding_octal() {
        let split_name = SplitName::new(None, Some(String::from("cst-%03o-")), None).unwrap();
        assert_eq!(split_name.get(42), "xxcst-052-");
    }

    #[test]
    fn zero_padding_lower_hexa() {
        let split_name = SplitName::new(None, Some(String::from("cst-%03x-")), None).unwrap();
        assert_eq!(split_name.get(42), "xxcst-02a-");
    }

    #[test]
    fn zero_padding_upper_hexa() {
        let split_name = SplitName::new(None, Some(String::from("cst-%03X-")), None).unwrap();
        assert_eq!(split_name.get(42), "xxcst-02A-");
    }

    // In the alternate-form tests the number is right-aligned in a 10-column field: the
    // 4-character value is preceded by 6 spaces.

    #[test]
    fn alternate_form_octal() {
        let split_name = SplitName::new(None, Some(String::from("cst-%#10o-")), None).unwrap();
        assert_eq!(split_name.get(42), "xxcst-      0o52-");
    }

    #[test]
    fn alternate_form_lower_hexa() {
        let split_name = SplitName::new(None, Some(String::from("cst-%#10x-")), None).unwrap();
        assert_eq!(split_name.get(42), "xxcst-      0x2a-");
    }

    #[test]
    fn alternate_form_upper_hexa() {
        let split_name = SplitName::new(None, Some(String::from("cst-%#10X-")), None).unwrap();
        assert_eq!(split_name.get(42), "xxcst-      0x2A-");
    }

    // In the left-adjusted tests the number is followed by spaces up to 10 columns: 8 spaces
    // after the 2-character decimal value, 6 after the 4-character octal/hexadecimal ones.

    #[test]
    fn left_adjusted_decimal1() {
        let split_name = SplitName::new(None, Some(String::from("cst-%-10d-")), None).unwrap();
        assert_eq!(split_name.get(42), "xxcst-42        -");
    }

    #[test]
    fn left_adjusted_decimal2() {
        let split_name = SplitName::new(None, Some(String::from("cst-%-10i-")), None).unwrap();
        assert_eq!(split_name.get(42), "xxcst-42        -");
    }

    #[test]
    fn left_adjusted_decimal3() {
        let split_name = SplitName::new(None, Some(String::from("cst-%-10u-")), None).unwrap();
        assert_eq!(split_name.get(42), "xxcst-42        -");
    }

    #[test]
    fn left_adjusted_octal() {
        let split_name = SplitName::new(None, Some(String::from("cst-%-10o-")), None).unwrap();
        assert_eq!(split_name.get(42), "xxcst-0o52      -");
    }

    #[test]
    fn left_adjusted_lower_hexa() {
        let split_name = SplitName::new(None, Some(String::from("cst-%-10x-")), None).unwrap();
        assert_eq!(split_name.get(42), "xxcst-0x2a      -");
    }

    #[test]
    fn left_adjusted_upper_hexa() {
        let split_name = SplitName::new(None, Some(String::from("cst-%-10X-")), None).unwrap();
        assert_eq!(split_name.get(42), "xxcst-0x2A      -");
    }

    /// Only one conversion is allowed in the suffix format.
    #[test]
    fn too_many_percent() {
        let split_name = SplitName::new(None, Some(String::from("%02d-%-3x")), None);
        match split_name {
            Err(CsplitError::SuffixFormatTooManyPercents) => (),
            _ => panic!("should fail with SuffixFormatTooManyPercents"),
        };
    }
}

1335
tests/by-util/test_csplit.rs Normal file

File diff suppressed because it is too large Load diff

50
tests/fixtures/csplit/numbers50.txt vendored Normal file
View file

@ -0,0 +1,50 @@
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50