Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 71 additions & 45 deletions src/uu/expand/src/expand.rs
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,62 @@ enum CharType {
Other,
}

#[allow(clippy::cognitive_complexity)]
/// Classify a character and determine its width and byte length.
///
/// Returns `(CharType, display_width, byte_length)`.
#[inline]
fn classify_char(buf: &[u8], byte: usize, uflag: bool) -> (CharType, usize, usize) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would use a more descriptive name than uflag. Or alternatively, split the function into two functions so you don't need uflag.

use self::CharType::{Backspace, Other, Tab};

if uflag {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe I'm misunderstanding the meaning of uflag, but shouldn't the if and else blocks be switched? From the help:

-U, --no-utf8         interpret input file as 8-bit ASCII rather than UTF-8

Currently, the code does the opposite: if uflag is true, the char is handled as UTF-8.

let nbytes = char::from(buf[byte]).len_utf8();

if byte + nbytes > buf.len() {
// don't overrun buffer because of invalid UTF-8
return (Other, 1, 1);
}

if let Ok(t) = from_utf8(&buf[byte..byte + nbytes]) {
match t.chars().next() {
Some('\t') => (Tab, 0, nbytes),
Some('\x08') => (Backspace, 0, nbytes),
Comment on lines +369 to +370
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it correct that the display_width for Tab and Backspace differs between UTF-8 and ASCII?

Comment on lines +369 to +370
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would use the concrete byte length instead of nbytes:

Suggested change
Some('\t') => (Tab, 0, nbytes),
Some('\x08') => (Backspace, 0, nbytes),
Some('\t') => (Tab, 0, 1),
Some('\x08') => (Backspace, 0, 1),

Some(c) => (Other, UnicodeWidthChar::width(c).unwrap_or(0), nbytes),
None => {
// no valid char at start of t, so take 1 byte
(Other, 1, 1)
}
}
} else {
(Other, 1, 1) // implicit assumption: non-UTF-8 char is 1 col wide
}
} else {
(
match buf.get(byte) {
// always take exactly 1 byte in strict ASCII mode
Some(0x09) => Tab,
Some(0x08) => Backspace,
_ => Other,
},
1,
1,
)
}
}

/// Write spaces for a tab expansion.
#[inline]
fn write_tab_spaces(
output: &mut BufWriter<std::io::Stdout>,
nts: usize,
tspaces: &str,
) -> std::io::Result<()> {
if nts <= tspaces.len() {
output.write_all(&tspaces.as_bytes()[..nts])
} else {
output.write_all(" ".repeat(nts).as_bytes())
}
}

fn expand_line(
buf: &mut Vec<u8>,
output: &mut BufWriter<std::io::Stdout>,
Expand All @@ -372,37 +427,7 @@ fn expand_line(
let mut init = true;

while byte < buf.len() {
let (ctype, cwidth, nbytes) = if options.uflag {
let nbytes = char::from(buf[byte]).len_utf8();

if byte + nbytes > buf.len() {
// don't overrun buffer because of invalid UTF-8
(Other, 1, 1)
} else if let Ok(t) = from_utf8(&buf[byte..byte + nbytes]) {
match t.chars().next() {
Some('\t') => (Tab, 0, nbytes),
Some('\x08') => (Backspace, 0, nbytes),
Some(c) => (Other, UnicodeWidthChar::width(c).unwrap_or(0), nbytes),
None => {
// no valid char at start of t, so take 1 byte
(Other, 1, 1)
}
}
} else {
(Other, 1, 1) // implicit assumption: non-UTF-8 char is 1 col wide
}
} else {
(
match buf.get(byte) {
// always take exactly 1 byte in strict ASCII mode
Some(0x09) => Tab,
Some(0x08) => Backspace,
_ => Other,
},
1,
1,
)
};
let (ctype, cwidth, nbytes) = classify_char(buf, byte, options.uflag);

// figure out how many columns this char takes up
match ctype {
Expand All @@ -413,23 +438,24 @@ fn expand_line(

// now dump out either spaces if we're expanding, or a literal tab if we're not
if init || !options.iflag {
if nts <= options.tspaces.len() {
output.write_all(&options.tspaces.as_bytes()[..nts])?;
} else {
output.write_all(" ".repeat(nts).as_bytes())?;
}
write_tab_spaces(output, nts, &options.tspaces)?;
} else {
output.write_all(&buf[byte..byte + nbytes])?;
}
}
_ => {
col = if ctype == Other {
col + cwidth
} else if col > 0 {
col - 1
} else {
0
};
Backspace => {
col = col.saturating_sub(1);

// if we're writing anything other than a space, then we're
// done with the line's leading spaces
if buf[byte] != 0x20 {
init = false;
}

output.write_all(&buf[byte..byte + nbytes])?;
}
Other => {
col += cwidth;

// if we're writing anything other than a space, then we're
// done with the line's leading spaces
Expand Down
Loading