From d49d1d44995a75fd288f860c10b2bc18cfc8a5b4 Mon Sep 17 00:00:00 2001 From: PFPoitras Date: Fri, 10 Dec 2021 13:38:03 -0400 Subject: [PATCH] Modifications to buffer UTF-16 internally so that there is no longer a 4-byte buffer minimum. Include suggestions from @agausmann and @Mark-Simulacrum. --- library/std/src/sys/windows/stdio.rs | 72 +++++++++++++++++++++------- 1 file changed, 54 insertions(+), 18 deletions(-) diff --git a/library/std/src/sys/windows/stdio.rs b/library/std/src/sys/windows/stdio.rs index a4fe5f67f69..8478ca043a3 100644 --- a/library/std/src/sys/windows/stdio.rs +++ b/library/std/src/sys/windows/stdio.rs @@ -15,7 +15,9 @@ // the value over time (such as if a process calls `SetStdHandle` while it's running). See #40490. pub struct Stdin { surrogate: u16, + incomplete_utf8: IncompleteUtf8, } + pub struct Stdout { incomplete_utf8: IncompleteUtf8, } @@ -29,6 +31,25 @@ struct IncompleteUtf8 { len: u8, } +impl IncompleteUtf8 { + // Implemented for use in Stdin::read. + fn read(&mut self, buf: &mut [u8]) -> usize { + // Write to buffer until the buffer is full or we run out of bytes. + let to_write = cmp::min(buf.len(), self.len as usize); + buf[..to_write].copy_from_slice(&self.bytes[..to_write]); + + // Rotate the remaining bytes if not enough remaining space in buffer. + if usize::from(self.len) > buf.len() { + self.bytes.copy_within(to_write.., 0); + self.len -= to_write as u8; + } else { + self.len = 0; + } + + to_write + } +} + // Apparently Windows doesn't handle large reads on stdin or writes to stdout/stderr well (see // #13304 for details). // @@ -205,7 +226,7 @@ fn write_u16s(handle: c::HANDLE, data: &[u16]) -> io::Result { impl Stdin { pub const fn new() -> Stdin { - Stdin { surrogate: 0 } + Stdin { surrogate: 0, incomplete_utf8: IncompleteUtf8::new() } } } @@ -221,24 +242,39 @@ fn read(&mut self, buf: &mut [u8]) -> io::Result { } } - if buf.len() == 0 { - return Ok(0); - } else if buf.len() < 4 { - return Err(io::Error::new_const( - io::ErrorKind::InvalidInput, - &"Windows stdin in console mode does not support a buffer too small to \ - guarantee holding one arbitrary UTF-8 character (4 bytes)", - )); + // If there are bytes in the incomplete utf-8, start with those. + // (No-op if there is nothing in the buffer.) + let mut bytes_copied = self.incomplete_utf8.read(buf); + + if bytes_copied == buf.len() { + return Ok(bytes_copied); + } else if buf.len() - bytes_copied < 4 { + // Not enough space to get a UTF-8 byte. We will use the incomplete UTF8. + let mut utf16_buf = [0u16; 1]; + // Read one u16 character. + let read = read_u16s_fixup_surrogates(handle, &mut utf16_buf, 1, &mut self.surrogate)?; + // Read bytes, using the (now-empty) self.incomplete_utf8 as extra space. + let read_bytes = utf16_to_utf8(&utf16_buf[..read], &mut self.incomplete_utf8.bytes)?; + + // Read in the bytes from incomplete_utf8 until the buffer is full. + self.incomplete_utf8.len = read_bytes as u8; + // No-op if no bytes. + bytes_copied += self.incomplete_utf8.read(&mut buf[bytes_copied..]); + Ok(bytes_copied) + } else { + let mut utf16_buf = [0u16; MAX_BUFFER_SIZE / 2]; + // In the worst case, a UTF-8 string can take 3 bytes for every `u16` of a UTF-16. So + // we can read at most a third of `buf.len()` chars and uphold the guarantee no data gets + // lost. + let amount = cmp::min(buf.len() / 3, utf16_buf.len()); + let read = + read_u16s_fixup_surrogates(handle, &mut utf16_buf, amount, &mut self.surrogate)?; + + match utf16_to_utf8(&utf16_buf[..read], buf) { + Ok(value) => return Ok(bytes_copied + value), + Err(e) => return Err(e), + } } - - let mut utf16_buf = [0u16; MAX_BUFFER_SIZE / 2]; - // In the worst case, a UTF-8 string can take 3 bytes for every `u16` of a UTF-16. So - // we can read at most a third of `buf.len()` chars and uphold the guarantee no data gets - // lost. - let amount = cmp::min(buf.len() / 3, utf16_buf.len()); - let read = read_u16s_fixup_surrogates(handle, &mut utf16_buf, amount, &mut self.surrogate)?; - - utf16_to_utf8(&utf16_buf[..read], buf) } } -- 2.44.0