1 //! This module contains specializations that can offload `io::copy()` operations on file descriptor
2 //! containing types (`File`, `TcpStream`, etc.) to more efficient syscalls than `read(2)` and `write(2)`.
4 //! Specialization is only applied to wholly std-owned types so that user code can't observe
5 //! that the `Read` and `Write` traits are not used.
7 //! Since a copy operation involves a reader and writer side where each can consist of different types
8 //! and also involve generic wrappers (e.g. `Take`, `BufReader`) it is not practical to specialize
9 //! a single method on all possible combinations.
11 //! Instead readers and writers are handled separately by the `CopyRead` and `CopyWrite` specialization
12 //! traits and then specialized on by the `Copier::copy` method.
14 //! `Copier` uses the specialization traits to unpack the underlying file descriptors and
15 //! additional prerequisites and constraints imposed by the wrapper types.
17 //! Once it has obtained all necessary pieces and brought any wrapper types into a state where they
18 //! can be safely bypassed it will attempt to use the `copy_file_range(2)`,
19 //! `sendfile(2)` or `splice(2)` syscalls to move data directly between file descriptors.
20 //! Since those syscalls have requirements that cannot be fully checked in advance and
21 //! gathering additional information about file descriptors would require additional syscalls
22 //! anyway it simply attempts to use them one after another (guided by inaccurate hints) to
23 //! figure out which one works and falls back to the generic read-write copy loop if none of them works.
25 //! Once a working syscall is found for a pair of file descriptors it will be called in a loop
26 //! until the copy operation is completed.
28 //! Advantages of using these syscalls:
30 //! * fewer context switches since reads and writes are coalesced into a single syscall
31 //! and more bytes are transferred per syscall. This translates to higher throughput
32 //! and fewer CPU cycles, at least for sufficiently large transfers to amortize the initial probing.
33 //! * `copy_file_range` creates reflink copies on CoW filesystems, thus moving less data and
34 //! consuming less disk space
35 //! * `sendfile` and `splice` can perform zero-copy IO under some circumstances while
36 //! a naive copy loop would move every byte through the CPU.
40 //! * copy operations smaller than the default buffer size can under some circumstances, especially
41 //! on older kernels, incur more syscalls than the naive approach would. As mentioned above
42 //! the syscall selection is guided by hints to minimize this possibility but they are not perfect.
43 //! * optimizations only apply to std types. If a user adds a custom wrapper type, e.g. to report
44 //! progress, they can hit a performance cliff.
48 use crate::convert::TryInto;
49 use crate::fs::{File, Metadata};
50 use crate::io::copy::generic_copy;
52 BufRead, BufReader, BufWriter, Error, Read, Result, StderrLock, StdinLock, StdoutLock, Take,
55 use crate::mem::ManuallyDrop;
56 use crate::net::TcpStream;
57 use crate::os::unix::fs::FileTypeExt;
58 use crate::os::unix::io::{AsRawFd, FromRawFd, RawFd};
59 use crate::os::unix::net::UnixStream;
60 use crate::process::{ChildStderr, ChildStdin, ChildStdout};
62 use crate::sync::atomic::{AtomicBool, Ordering};
68 pub(crate) fn copy_spec<R: Read + ?Sized, W: Write + ?Sized>(
72 let copier = Copier { read, write };
73 SpecCopy::copy(copier)
76 /// This type represents either the inferred `FileType` of a `RawFd` based on the source
77 /// type from which it was extracted or the actual metadata
79 /// The methods on this type only provide hints, due to `AsRawFd` and `FromRawFd` the inferred
80 /// type may be wrong.
82 /// We obtained the FD from a type that can contain any type of `FileType` and queried the metadata
83 /// because it is cheaper than probing all possible syscalls (reader side)
87 /// We don't have any metadata, e.g. because the original type was `File` which can represent
88 /// any `FileType` and we did not query the metadata either since it did not seem beneficial
94 fn maybe_fifo(&self) -> bool {
96 FdMeta::Metadata(meta) => meta.file_type().is_fifo(),
97 FdMeta::Socket => false,
99 FdMeta::NoneObtained => true,
103 fn potential_sendfile_source(&self) -> bool {
105 // procfs erroneously shows 0 length on non-empty readable files.
106 // and if a file is truly empty then a `read` syscall will determine that and skip the write syscall
107 // thus there would be no benefit from attempting sendfile
108 FdMeta::Metadata(meta)
109 if meta.file_type().is_file() && meta.len() > 0
110 || meta.file_type().is_block_device() =>
118 fn copy_file_range_candidate(&self) -> bool {
120 // copy_file_range will fail on empty procfs files. `read` can determine whether EOF has been reached
121 // without extra cost and skip the write, thus there is no benefit in attempting copy_file_range
122 FdMeta::Metadata(meta) if meta.is_file() && meta.len() > 0 => true,
123 FdMeta::NoneObtained => true,
129 struct CopyParams(FdMeta, Option<RawFd>); // what `properties()` returns: the file-type hint/metadata for a descriptor, plus the raw fd itself, if any
131 struct Copier<'a, 'b, R: Read + ?Sized, W: Write + ?Sized> {
137 fn copy(self) -> Result<u64>; // consumes the copier; the impls in this file return the total number of bytes copied on success
140 impl<R: Read + ?Sized, W: Write + ?Sized> SpecCopy for Copier<'_, '_, R, W> {
141 default fn copy(self) -> Result<u64> {
142 generic_copy(self.read, self.write)
146 impl<R: CopyRead, W: CopyWrite> SpecCopy for Copier<'_, '_, R, W> {
147 fn copy(self) -> Result<u64> {
148 let (reader, writer) = (self.read, self.write);
149 let r_cfg = reader.properties();
150 let w_cfg = writer.properties();
152 // before direct operations on file descriptors ensure that all source and sink buffers are empty
153 let mut flush = || -> crate::io::Result<u64> {
154 let bytes = reader.drain_to(writer, u64::MAX)?;
155 // BufWriter buffered bytes have already been accounted for in earlier write() calls
160 let mut written = 0u64;
162 if let (CopyParams(input_meta, Some(readfd)), CopyParams(output_meta, Some(writefd))) =
166 let max_write = reader.min_limit();
168 if input_meta.copy_file_range_candidate() && output_meta.copy_file_range_candidate() {
169 let result = copy_regular_files(readfd, writefd, max_write);
172 CopyResult::Ended(Ok(bytes_copied)) => return Ok(bytes_copied + written),
173 CopyResult::Ended(err) => return err,
174 CopyResult::Fallback(bytes) => written += bytes,
178 // on modern kernels sendfile can copy from any mmapable type (some but not all regular files and block devices)
179 // to any writable file descriptor. On older kernels the writer side can only be a socket.
180 // So we just try and fallback if needed.
181 // If current file offsets + write sizes overflow it may also fail, we do not try to fix that and instead
182 // fall back to the generic copy loop.
183 if input_meta.potential_sendfile_source() {
184 let result = sendfile_splice(SpliceMode::Sendfile, readfd, writefd, max_write);
187 CopyResult::Ended(Ok(bytes_copied)) => return Ok(bytes_copied + written),
188 CopyResult::Ended(err) => return err,
189 CopyResult::Fallback(bytes) => written += bytes,
193 if input_meta.maybe_fifo() || output_meta.maybe_fifo() {
194 let result = sendfile_splice(SpliceMode::Splice, readfd, writefd, max_write);
197 CopyResult::Ended(Ok(bytes_copied)) => return Ok(bytes_copied + written),
198 CopyResult::Ended(err) => return err,
199 CopyResult::Fallback(0) => { /* use the fallback below */ }
200 CopyResult::Fallback(_) => {
201 unreachable!("splice should not return > 0 bytes on the fallback path")
207 // fallback if none of the more specialized syscalls wants to work with these file descriptors
208 match generic_copy(reader, writer) {
209 Ok(bytes) => Ok(bytes + written),
215 #[rustc_specialization_trait]
216 trait CopyRead: Read {
217 /// Implementations that contain buffers (i.e. `BufReader`) must transfer data from their internal
218 /// buffers into `writer` until either the buffers are emptied or `limit` bytes have been
219 /// transferred, whichever occurs sooner.
220 /// If nested buffers are present the outer buffers must be drained first.
222 /// This is necessary to directly bypass the wrapper types while preserving the data order
223 /// when operating directly on the underlying file descriptors.
224 fn drain_to<W: Write>(&mut self, _writer: &mut W, _limit: u64) -> Result<u64> {
228 /// The minimum of the limit of all `Take<_>` wrappers, `u64::MAX` otherwise.
229 /// This method does not account for data in `BufReader` buffers and would underreport
230 /// the limit of a `Take<BufReader<Take<_>>>` type. Thus its result is only valid
231 /// after draining the buffers via `drain_to`.
232 fn min_limit(&self) -> u64 {
236 /// Extracts the file descriptor and hints/metadata, delegating through wrappers if necessary.
237 fn properties(&self) -> CopyParams;
240 #[rustc_specialization_trait]
241 trait CopyWrite: Write {
242 /// Extracts the file descriptor and hints/metadata, delegating through wrappers if necessary.
243 fn properties(&self) -> CopyParams;
246 impl<T> CopyRead for &mut T
250 fn drain_to<W: Write>(&mut self, writer: &mut W, limit: u64) -> Result<u64> {
251 (**self).drain_to(writer, limit)
254 fn min_limit(&self) -> u64 {
258 fn properties(&self) -> CopyParams {
259 (**self).properties()
263 impl<T> CopyWrite for &mut T
267 fn properties(&self) -> CopyParams {
268 (**self).properties()
272 impl CopyRead for File {
273 fn properties(&self) -> CopyParams {
274 CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
278 impl CopyRead for &File {
279 fn properties(&self) -> CopyParams {
280 CopyParams(fd_to_meta(*self), Some(self.as_raw_fd()))
284 impl CopyWrite for File {
285 fn properties(&self) -> CopyParams {
286 CopyParams(FdMeta::NoneObtained, Some(self.as_raw_fd()))
290 impl CopyWrite for &File {
291 fn properties(&self) -> CopyParams {
292 CopyParams(FdMeta::NoneObtained, Some(self.as_raw_fd()))
296 impl CopyRead for TcpStream {
297 fn properties(&self) -> CopyParams {
298 // avoid the stat syscall since we can be fairly sure it's a socket
299 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
303 impl CopyRead for &TcpStream {
304 fn properties(&self) -> CopyParams {
305 // avoid the stat syscall since we can be fairly sure it's a socket
306 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
310 impl CopyWrite for TcpStream {
311 fn properties(&self) -> CopyParams {
312 // avoid the stat syscall since we can be fairly sure it's a socket
313 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
317 impl CopyWrite for &TcpStream {
318 fn properties(&self) -> CopyParams {
319 // avoid the stat syscall since we can be fairly sure it's a socket
320 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
324 impl CopyRead for UnixStream {
325 fn properties(&self) -> CopyParams {
326 // avoid the stat syscall since we can be fairly sure it's a socket
327 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
331 impl CopyRead for &UnixStream {
332 fn properties(&self) -> CopyParams {
333 // avoid the stat syscall since we can be fairly sure it's a socket
334 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
338 impl CopyWrite for UnixStream {
339 fn properties(&self) -> CopyParams {
340 // avoid the stat syscall since we can be fairly sure it's a socket
341 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
345 impl CopyWrite for &UnixStream {
346 fn properties(&self) -> CopyParams {
347 // avoid the stat syscall since we can be fairly sure it's a socket
348 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
352 impl CopyWrite for ChildStdin {
353 fn properties(&self) -> CopyParams {
354 CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
358 impl CopyRead for ChildStdout {
359 fn properties(&self) -> CopyParams {
360 CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
364 impl CopyRead for ChildStderr {
365 fn properties(&self) -> CopyParams {
366 CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
370 impl CopyRead for StdinLock<'_> {
371 fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> {
372 let buf_reader = self.as_mut_buf();
373 let buf = buf_reader.buffer();
374 let buf = &buf[0..min(buf.len(), outer_limit.try_into().unwrap_or(usize::MAX))];
375 let bytes_drained = buf.len();
376 writer.write_all(buf)?;
377 buf_reader.consume(bytes_drained);
379 Ok(bytes_drained as u64)
382 fn properties(&self) -> CopyParams {
383 CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
387 impl CopyWrite for StdoutLock<'_> {
388 fn properties(&self) -> CopyParams {
389 CopyParams(FdMeta::NoneObtained, Some(self.as_raw_fd()))
393 impl CopyWrite for StderrLock<'_> {
394 fn properties(&self) -> CopyParams {
395 CopyParams(FdMeta::NoneObtained, Some(self.as_raw_fd()))
399 impl<T: CopyRead> CopyRead for Take<T> {
400 fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> {
401 let local_limit = self.limit();
402 let combined_limit = min(outer_limit, local_limit);
403 let bytes_drained = self.get_mut().drain_to(writer, combined_limit)?;
404 // update limit since read() was bypassed
405 self.set_limit(local_limit - bytes_drained);
410 fn min_limit(&self) -> u64 {
411 min(Take::limit(self), self.get_ref().min_limit())
414 fn properties(&self) -> CopyParams {
415 self.get_ref().properties()
419 impl<T: CopyRead> CopyRead for BufReader<T> {
420 fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> {
421 let buf = self.buffer();
422 let buf = &buf[0..min(buf.len(), outer_limit.try_into().unwrap_or(usize::MAX))];
423 let bytes = buf.len();
424 writer.write_all(buf)?;
427 let remaining = outer_limit - bytes as u64;
429 // in case of nested bufreaders we also need to drain the ones closer to the source
430 let inner_bytes = self.get_mut().drain_to(writer, remaining)?;
432 Ok(bytes as u64 + inner_bytes)
435 fn min_limit(&self) -> u64 {
436 self.get_ref().min_limit()
439 fn properties(&self) -> CopyParams {
440 self.get_ref().properties()
444 impl<T: CopyWrite> CopyWrite for BufWriter<T> {
445 fn properties(&self) -> CopyParams {
446 self.get_ref().properties()
450 fn fd_to_meta<T: AsRawFd>(fd: &T) -> FdMeta {
451 let fd = fd.as_raw_fd();
452 let file: ManuallyDrop<File> = ManuallyDrop::new(unsafe { File::from_raw_fd(fd) });
453 match file.metadata() {
454 Ok(meta) => FdMeta::Metadata(meta),
455 Err(_) => FdMeta::NoneObtained,
459 pub(super) enum CopyResult {
464 /// linux-specific implementation that will attempt to use copy_file_range for copy offloading
465 /// as the name says, it only works on regular files
467 /// Callers must handle fallback to a generic copy loop.
468 /// `Fallback` may indicate non-zero number of bytes already written
469 /// if one of the files' cursor +`max_len` would exceed u64::MAX (`EOVERFLOW`).
470 pub(super) fn copy_regular_files(reader: RawFd, writer: RawFd, max_len: u64) -> CopyResult {
473 // Kernels prior to 4.5 don't have copy_file_range
474 // We store the availability in a global to avoid unnecessary syscalls
475 static HAS_COPY_FILE_RANGE: AtomicBool = AtomicBool::new(true);
480 off_in: *mut libc::loff_t,
482 off_out: *mut libc::loff_t,
488 let has_copy_file_range = HAS_COPY_FILE_RANGE.load(Ordering::Relaxed);
489 let mut written = 0u64;
490 while written < max_len {
491 let copy_result = if has_copy_file_range {
492 let bytes_to_copy = cmp::min(max_len - written, usize::MAX as u64);
493 // cap to 1GB chunks in case u64::MAX is passed as max_len and the file has a non-zero seek position
494 // this allows us to copy large chunks without hitting EOVERFLOW,
495 // unless someone sets a file offset close to u64::MAX - 1GB, in which case a fallback would be required
496 let bytes_to_copy = cmp::min(bytes_to_copy as usize, 0x4000_0000usize);
497 let copy_result = unsafe {
498 // We actually don't have to adjust the offsets,
499 // because copy_file_range adjusts the file offset automatically
509 if let Err(ref copy_err) = copy_result {
510 match copy_err.raw_os_error() {
511 Some(libc::ENOSYS | libc::EPERM | libc::EOPNOTSUPP) => {
512 HAS_COPY_FILE_RANGE.store(false, Ordering::Relaxed);
519 Err(Error::from_raw_os_error(libc::ENOSYS))
522 Ok(0) if written == 0 => {
523 // fallback to work around several kernel bugs where copy_file_range will fail to
524 // copy any bytes and return 0 instead of an error if
525 // - reading virtual files from the proc filesystem which appear to have 0 size
526 // but are not empty. noted in coreutils to affect kernels at least up to 5.6.19.
527 // - copying from an overlay filesystem in docker. reported to occur on fedora 32.
528 return CopyResult::Fallback(0);
530 Ok(0) => return CopyResult::Ended(Ok(written)), // reached EOF
531 Ok(ret) => written += ret as u64,
533 return match err.raw_os_error() {
534 // when file offset + max_length > u64::MAX
535 Some(libc::EOVERFLOW) => CopyResult::Fallback(written),
537 libc::ENOSYS | libc::EXDEV | libc::EINVAL | libc::EPERM | libc::EOPNOTSUPP,
539 // Try fallback io::copy if either:
540 // - Kernel version is < 4.5 (ENOSYS)
541 // - Files are mounted on different fs (EXDEV)
542 // - copy_file_range is broken in various ways on RHEL/CentOS 7 (EOPNOTSUPP)
543 // - copy_file_range is disallowed, for example by seccomp (EPERM)
544 // - copy_file_range cannot be used with pipes or device nodes (EINVAL)
545 assert_eq!(written, 0);
546 CopyResult::Fallback(0)
548 _ => CopyResult::Ended(Err(err)),
553 CopyResult::Ended(Ok(written))
562 /// performs splice or sendfile between file descriptors
563 /// Does _not_ fall back to a generic copy loop.
564 fn sendfile_splice(mode: SpliceMode, reader: RawFd, writer: RawFd, len: u64) -> CopyResult {
565 static HAS_SENDFILE: AtomicBool = AtomicBool::new(true);
566 static HAS_SPLICE: AtomicBool = AtomicBool::new(true);
571 src_offset: *const i64,
573 dst_offset: *const i64,
580 SpliceMode::Sendfile if !HAS_SENDFILE.load(Ordering::Relaxed) => {
581 return CopyResult::Fallback(0);
583 SpliceMode::Splice if !HAS_SPLICE.load(Ordering::Relaxed) => {
584 return CopyResult::Fallback(0);
589 let mut written = 0u64;
590 while written < len {
591 // according to its manpage that's the maximum size sendfile() will copy per invocation
592 let chunk_size = crate::cmp::min(len - written, 0x7ffff000_u64) as usize;
594 let result = match mode {
595 SpliceMode::Sendfile => {
596 cvt(unsafe { libc::sendfile(writer, reader, ptr::null_mut(), chunk_size) })
598 SpliceMode::Splice => cvt(unsafe {
599 splice(reader, ptr::null_mut(), writer, ptr::null_mut(), chunk_size, 0)
604 Ok(0) => break, // EOF
605 Ok(ret) => written += ret as u64,
607 return match err.raw_os_error() {
608 Some(libc::ENOSYS | libc::EPERM) => {
609 // syscall not supported (ENOSYS)
610 // syscall is disallowed, e.g. by seccomp (EPERM)
612 SpliceMode::Sendfile => HAS_SENDFILE.store(false, Ordering::Relaxed),
613 SpliceMode::Splice => HAS_SPLICE.store(false, Ordering::Relaxed),
615 assert_eq!(written, 0);
616 CopyResult::Fallback(0)
618 Some(libc::EINVAL) => {
619 // splice/sendfile do not support this particular file descriptor (EINVAL)
620 assert_eq!(written, 0);
621 CopyResult::Fallback(0)
623 Some(os_err) if mode == SpliceMode::Sendfile && os_err == libc::EOVERFLOW => {
624 CopyResult::Fallback(written)
626 _ => CopyResult::Ended(Err(err)),
631 CopyResult::Ended(Ok(written))