quinn_udp/
unix.rs

1#[cfg(not(any(apple, target_os = "openbsd", solarish)))]
2use std::ptr;
3use std::{
4    io::{self, IoSliceMut},
5    mem::{self, MaybeUninit},
6    net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr, SocketAddrV4, SocketAddrV6},
7    os::unix::io::AsRawFd,
8    sync::{
9        atomic::{AtomicBool, AtomicUsize, Ordering},
10        Mutex,
11    },
12    time::Instant,
13};
14
15use socket2::SockRef;
16
17use super::{
18    cmsg, log_sendmsg_error, EcnCodepoint, RecvMeta, Transmit, UdpSockRef, IO_ERROR_LOG_INTERVAL,
19};
20
// Adapted from https://github.com/apple-oss-distributions/xnu/blob/8d741a5de7ff4191bf97d57b9f54c2f6d4a15585/bsd/sys/socket_private.h
//
// Message header exchanged with the `recvmsg_x` / `sendmsg_x` bindings declared in this
// file. `prepare_msg` and `prepare_recv` fill the first seven fields exactly as they do for a
// `libc::msghdr`; `msg_datalen` is the one additional field.
#[cfg(apple_fast)]
#[repr(C)]
#[allow(non_camel_case_types)]
pub(crate) struct msghdr_x {
    // Peer address buffer: the destination when sending, filled with the sender on receive.
    pub msg_name: *mut libc::c_void,
    // Size in bytes of the buffer behind `msg_name`.
    pub msg_namelen: libc::socklen_t,
    // Scatter/gather array; this file always supplies exactly one `iovec`.
    pub msg_iov: *mut libc::iovec,
    // Number of entries in `msg_iov`.
    pub msg_iovlen: libc::c_int,
    // Control-message (ancillary data) buffer.
    pub msg_control: *mut libc::c_void,
    // Size in bytes of the buffer behind `msg_control`.
    pub msg_controllen: libc::socklen_t,
    // Flags; cleared by `prepare_recv` before receiving.
    pub msg_flags: libc::c_int,
    // Payload length: set to the chunk length before `sendmsg_x`, and read back as the
    // datagram length after `recvmsg_x`.
    pub msg_datalen: usize,
}
35
// Batched counterparts of `recvmsg` / `sendmsg` operating on arrays of `msghdr_x`.
// Callers in this file treat a return value of -1 as failure (reading the error via
// `io::Error::last_os_error`); `recv` interprets any other value as a message count.
#[cfg(apple_fast)]
extern "C" {
    // Receives up to `cnt` messages on socket `s` into the headers starting at `msgp`.
    fn recvmsg_x(
        s: libc::c_int,
        msgp: *const msghdr_x,
        cnt: libc::c_uint,
        flags: libc::c_int,
    ) -> isize;

    // Sends the `cnt` messages described by the headers starting at `msgp` on socket `s`.
    fn sendmsg_x(
        s: libc::c_int,
        msgp: *const msghdr_x,
        cnt: libc::c_uint,
        flags: libc::c_int,
    ) -> isize;
}
52
// Defined in netinet6/in6.h on OpenBSD, this is not yet exported by the libc crate
// directly.  See https://github.com/rust-lang/libc/issues/3704 for when we might be able to
// rely on this from the libc crate.
//
// The same hard-coded value is also used for NetBSD builds.
// NOTE(review): confirm the value against NetBSD's netinet6/in6.h.
#[cfg(any(target_os = "openbsd", target_os = "netbsd"))]
const IPV6_DONTFRAG: libc::c_int = 62;
// Every other target takes the constant straight from the libc crate.
#[cfg(not(any(target_os = "openbsd", target_os = "netbsd")))]
const IPV6_DONTFRAG: libc::c_int = libc::IPV6_DONTFRAG;
60
// Integer type of the `IP_TOS` control-message payload pushed by `prepare_msg`:
// a single byte on FreeBSD, a C `int` on the remaining targets.
// No alias is defined for NetBSD, where the `IP_TOS` push in `prepare_msg` is compiled out.
#[cfg(target_os = "freebsd")]
type IpTosTy = libc::c_uchar;
#[cfg(not(any(target_os = "freebsd", target_os = "netbsd")))]
type IpTosTy = libc::c_int;
65
/// Tokio-compatible UDP socket with some useful specializations.
///
/// Unlike a standard tokio UDP socket, this allows ECN bits to be read and written on some
/// platforms.
#[derive(Debug)]
pub struct UdpSocketState {
    /// Timestamp handed to `log_sendmsg_error` whenever [`UdpSocketState::send`] swallows an
    /// error. Initialised to `2 * IO_ERROR_LOG_INTERVAL` in the past (when representable),
    /// presumably so the first error is never rate-limited — verify in `log_sendmsg_error`.
    last_send_error: Mutex<Instant>,
    /// Current upper bound on segments per transmit; reset to 1 when a Linux/Android send
    /// fails with `EIO` or `EINVAL`.
    max_gso_segments: AtomicUsize,
    /// Value of `gro::gro_segments()` captured at construction time.
    gro_segments: usize,
    /// True if at least one "don't fragment" style socket option was reported as unsupported
    /// while configuring the socket.
    may_fragment: bool,

    /// True if we have received EINVAL error from `sendmsg` system call at least once.
    ///
    /// If enabled, we assume that old kernel is used and switch to fallback mode.
    /// In particular, we do not use IP_TOS cmsg_type in this case,
    /// which is not supported on Linux <3.13 and results in not sending the UDP packet at all.
    sendmsg_einval: AtomicBool,
}
84
85impl UdpSocketState {
86    pub fn new(sock: UdpSockRef<'_>) -> io::Result<Self> {
87        let io = sock.0;
88        let mut cmsg_platform_space = 0;
89        if cfg!(target_os = "linux")
90            || cfg!(bsd)
91            || cfg!(apple)
92            || cfg!(target_os = "android")
93            || cfg!(solarish)
94        {
95            cmsg_platform_space +=
96                unsafe { libc::CMSG_SPACE(mem::size_of::<libc::in6_pktinfo>() as _) as usize };
97        }
98
99        assert!(
100            CMSG_LEN
101                >= unsafe { libc::CMSG_SPACE(mem::size_of::<libc::c_int>() as _) as usize }
102                    + cmsg_platform_space
103        );
104        assert!(
105            mem::align_of::<libc::cmsghdr>() <= mem::align_of::<cmsg::Aligned<[u8; 0]>>(),
106            "control message buffers will be misaligned"
107        );
108
109        io.set_nonblocking(true)?;
110
111        let addr = io.local_addr()?;
112        let is_ipv4 = addr.family() == libc::AF_INET as libc::sa_family_t;
113
114        // mac and ios do not support IP_RECVTOS on dual-stack sockets :(
115        // older macos versions also don't have the flag and will error out if we don't ignore it
116        #[cfg(not(any(target_os = "openbsd", target_os = "netbsd", solarish)))]
117        if is_ipv4 || !io.only_v6()? {
118            if let Err(_err) =
119                set_socket_option(&*io, libc::IPPROTO_IP, libc::IP_RECVTOS, OPTION_ON)
120            {
121                crate::log::debug!("Ignoring error setting IP_RECVTOS on socket: {_err:?}");
122            }
123        }
124
125        let mut may_fragment = false;
126        #[cfg(any(target_os = "linux", target_os = "android"))]
127        {
128            // opportunistically try to enable GRO. See gro::gro_segments().
129            let _ = set_socket_option(&*io, libc::SOL_UDP, gro::UDP_GRO, OPTION_ON);
130
131            // Forbid IPv4 fragmentation. Set even for IPv6 to account for IPv6 mapped IPv4 addresses.
132            // Set `may_fragment` to `true` if this option is not supported on the platform.
133            may_fragment |= !set_socket_option_supported(
134                &*io,
135                libc::IPPROTO_IP,
136                libc::IP_MTU_DISCOVER,
137                libc::IP_PMTUDISC_PROBE,
138            )?;
139
140            if is_ipv4 {
141                set_socket_option(&*io, libc::IPPROTO_IP, libc::IP_PKTINFO, OPTION_ON)?;
142            } else {
143                // Set `may_fragment` to `true` if this option is not supported on the platform.
144                may_fragment |= !set_socket_option_supported(
145                    &*io,
146                    libc::IPPROTO_IPV6,
147                    libc::IPV6_MTU_DISCOVER,
148                    libc::IPV6_PMTUDISC_PROBE,
149                )?;
150            }
151        }
152        #[cfg(any(target_os = "freebsd", apple))]
153        {
154            if is_ipv4 {
155                // Set `may_fragment` to `true` if this option is not supported on the platform.
156                may_fragment |= !set_socket_option_supported(
157                    &*io,
158                    libc::IPPROTO_IP,
159                    libc::IP_DONTFRAG,
160                    OPTION_ON,
161                )?;
162            }
163        }
164        #[cfg(any(bsd, apple, solarish))]
165        // IP_RECVDSTADDR == IP_SENDSRCADDR on FreeBSD
166        // macOS uses only IP_RECVDSTADDR, no IP_SENDSRCADDR on macOS (the same on Solaris)
167        // macOS also supports IP_PKTINFO
168        {
169            if is_ipv4 {
170                set_socket_option(&*io, libc::IPPROTO_IP, libc::IP_RECVDSTADDR, OPTION_ON)?;
171            }
172        }
173
174        // Options standardized in RFC 3542
175        if !is_ipv4 {
176            set_socket_option(&*io, libc::IPPROTO_IPV6, libc::IPV6_RECVPKTINFO, OPTION_ON)?;
177            set_socket_option(&*io, libc::IPPROTO_IPV6, libc::IPV6_RECVTCLASS, OPTION_ON)?;
178            // Linux's IP_PMTUDISC_PROBE allows us to operate under interface MTU rather than the
179            // kernel's path MTU guess, but actually disabling fragmentation requires this too. See
180            // __ip6_append_data in ip6_output.c.
181            // Set `may_fragment` to `true` if this option is not supported on the platform.
182            may_fragment |=
183                !set_socket_option_supported(&*io, libc::IPPROTO_IPV6, IPV6_DONTFRAG, OPTION_ON)?;
184        }
185
186        let now = Instant::now();
187        Ok(Self {
188            last_send_error: Mutex::new(now.checked_sub(2 * IO_ERROR_LOG_INTERVAL).unwrap_or(now)),
189            max_gso_segments: AtomicUsize::new(gso::max_gso_segments()),
190            gro_segments: gro::gro_segments(),
191            may_fragment,
192            sendmsg_einval: AtomicBool::new(false),
193        })
194    }
195
196    /// Sends a [`Transmit`] on the given socket.
197    ///
198    /// This function will only ever return errors of kind [`io::ErrorKind::WouldBlock`].
199    /// All other errors will be logged and converted to `Ok`.
200    ///
201    /// UDP transmission errors are considered non-fatal because higher-level protocols must
202    /// employ retransmits and timeouts anyway in order to deal with UDP's unreliable nature.
203    /// Thus, logging is most likely the only thing you can do with these errors.
204    ///
205    /// If you would like to handle these errors yourself, use [`UdpSocketState::try_send`]
206    /// instead.
207    pub fn send(&self, socket: UdpSockRef<'_>, transmit: &Transmit<'_>) -> io::Result<()> {
208        match send(self, socket.0, transmit) {
209            Ok(()) => Ok(()),
210            Err(e) if e.kind() == io::ErrorKind::WouldBlock => Err(e),
211            Err(e) => {
212                log_sendmsg_error(&self.last_send_error, e, transmit);
213
214                Ok(())
215            }
216        }
217    }
218
219    /// Sends a [`Transmit`] on the given socket without any additional error handling.
220    pub fn try_send(&self, socket: UdpSockRef<'_>, transmit: &Transmit<'_>) -> io::Result<()> {
221        send(self, socket.0, transmit)
222    }
223
224    pub fn recv(
225        &self,
226        socket: UdpSockRef<'_>,
227        bufs: &mut [IoSliceMut<'_>],
228        meta: &mut [RecvMeta],
229    ) -> io::Result<usize> {
230        recv(socket.0, bufs, meta)
231    }
232
233    /// The maximum amount of segments which can be transmitted if a platform
234    /// supports Generic Send Offload (GSO).
235    ///
236    /// This is 1 if the platform doesn't support GSO. Subject to change if errors are detected
237    /// while using GSO.
238    #[inline]
239    pub fn max_gso_segments(&self) -> usize {
240        self.max_gso_segments.load(Ordering::Relaxed)
241    }
242
243    /// The number of segments to read when GRO is enabled. Used as a factor to
244    /// compute the receive buffer size.
245    ///
246    /// Returns 1 if the platform doesn't support GRO.
247    #[inline]
248    pub fn gro_segments(&self) -> usize {
249        self.gro_segments
250    }
251
252    /// Whether transmitted datagrams might get fragmented by the IP layer
253    ///
254    /// Returns `false` on targets which employ e.g. the `IPV6_DONTFRAG` socket option.
255    #[inline]
256    pub fn may_fragment(&self) -> bool {
257        self.may_fragment
258    }
259
260    /// Returns true if we previously got an EINVAL error from `sendmsg` syscall.
261    fn sendmsg_einval(&self) -> bool {
262        self.sendmsg_einval.load(Ordering::Relaxed)
263    }
264
265    /// Sets the flag indicating we got EINVAL error from `sendmsg` syscall.
266    #[cfg(not(any(apple, target_os = "openbsd", target_os = "netbsd")))]
267    fn set_sendmsg_einval(&self) {
268        self.sendmsg_einval.store(true, Ordering::Relaxed)
269    }
270}
271
272#[cfg(not(any(apple, target_os = "openbsd", target_os = "netbsd")))]
273fn send(
274    #[allow(unused_variables)] // only used on Linux
275    state: &UdpSocketState,
276    io: SockRef<'_>,
277    transmit: &Transmit<'_>,
278) -> io::Result<()> {
279    #[allow(unused_mut)] // only mutable on FreeBSD
280    let mut encode_src_ip = true;
281    #[cfg(target_os = "freebsd")]
282    {
283        let addr = io.local_addr()?;
284        let is_ipv4 = addr.family() == libc::AF_INET as libc::sa_family_t;
285        if is_ipv4 {
286            if let Some(socket) = addr.as_socket_ipv4() {
287                encode_src_ip = socket.ip() == &Ipv4Addr::UNSPECIFIED;
288            }
289        }
290    }
291    let mut msg_hdr: libc::msghdr = unsafe { mem::zeroed() };
292    let mut iovec: libc::iovec = unsafe { mem::zeroed() };
293    let mut cmsgs = cmsg::Aligned([0u8; CMSG_LEN]);
294    let dst_addr = socket2::SockAddr::from(transmit.destination);
295    prepare_msg(
296        transmit,
297        &dst_addr,
298        &mut msg_hdr,
299        &mut iovec,
300        &mut cmsgs,
301        encode_src_ip,
302        state.sendmsg_einval(),
303    );
304
305    loop {
306        let n = unsafe { libc::sendmsg(io.as_raw_fd(), &msg_hdr, 0) };
307        if n == -1 {
308            let e = io::Error::last_os_error();
309            match e.kind() {
310                io::ErrorKind::Interrupted => {
311                    // Retry the transmission
312                    continue;
313                }
314                io::ErrorKind::WouldBlock => return Err(e),
315                _ => {
316                    // Some network adapters and drivers do not support GSO. Unfortunately, Linux
317                    // offers no easy way for us to detect this short of an EIO or sometimes EINVAL
318                    // when we try to actually send datagrams using it.
319                    #[cfg(any(target_os = "linux", target_os = "android"))]
320                    if let Some(libc::EIO) | Some(libc::EINVAL) = e.raw_os_error() {
321                        // Prevent new transmits from being scheduled using GSO. Existing GSO transmits
322                        // may already be in the pipeline, so we need to tolerate additional failures.
323                        if state.max_gso_segments() > 1 {
324                            crate::log::info!(
325                                "`libc::sendmsg` failed with {e}; halting segmentation offload"
326                            );
327                            state
328                                .max_gso_segments
329                                .store(1, std::sync::atomic::Ordering::Relaxed);
330                        }
331                    }
332
333                    // Some arguments to `sendmsg` are not supported. Switch to
334                    // fallback mode and retry if we haven't already.
335                    if e.raw_os_error() == Some(libc::EINVAL) && !state.sendmsg_einval() {
336                        state.set_sendmsg_einval();
337                        prepare_msg(
338                            transmit,
339                            &dst_addr,
340                            &mut msg_hdr,
341                            &mut iovec,
342                            &mut cmsgs,
343                            encode_src_ip,
344                            state.sendmsg_einval(),
345                        );
346                        continue;
347                    }
348
349                    // - EMSGSIZE is expected for MTU probes. Future work might be able to avoid
350                    //   these by automatically clamping the MTUD upper bound to the interface MTU.
351                    if e.raw_os_error() != Some(libc::EMSGSIZE) {
352                        return Err(e);
353                    }
354                }
355            }
356        }
357        return Ok(());
358    }
359}
360
/// Sends `transmit` with `sendmsg_x`, splitting the payload into one message per segment.
///
/// The payload is cut into `segment_size` chunks (at most `BATCH_SIZE` of them) and each
/// chunk gets its own header, `iovec` and control buffer. An empty payload is sent as a
/// single zero-length message, matching the non-batched `send` implementations.
///
/// `EINTR` is retried, `EMSGSIZE` is reported as success, and every other error (including
/// `WouldBlock`) is returned.
#[cfg(apple_fast)]
fn send(state: &UdpSocketState, io: SockRef<'_>, transmit: &Transmit<'_>) -> io::Result<()> {
    let mut hdrs = unsafe { mem::zeroed::<[msghdr_x; BATCH_SIZE]>() };
    let mut iovs = unsafe { mem::zeroed::<[libc::iovec; BATCH_SIZE]>() };
    let mut ctrls = [cmsg::Aligned([0u8; CMSG_LEN]); BATCH_SIZE];
    let addr = socket2::SockAddr::from(transmit.destination);

    // `slice::chunks` and `div_ceil` both panic on a zero divisor, which is what an empty
    // payload without an explicit segment size (or an explicit size of 0) would yield.
    let segment_size = transmit
        .segment_size
        .unwrap_or(transmit.contents.len())
        .max(1);
    debug_assert!(transmit.contents.len().div_ceil(segment_size) <= BATCH_SIZE);

    // `chunks` yields nothing for an empty payload; substitute one empty segment so that a
    // zero-length datagram is still handed to the kernel.
    // NOTE(review): confirm that `sendmsg_x` accepts a zero-length message.
    let mut segments = transmit.contents.chunks(segment_size).peekable();
    let empty_datagram = segments.peek().is_none().then_some(transmit.contents);

    let mut cnt = 0;
    for (i, chunk) in empty_datagram
        .into_iter()
        .chain(segments)
        .enumerate()
        .take(BATCH_SIZE)
    {
        prepare_msg(
            &Transmit {
                destination: transmit.destination,
                ecn: transmit.ecn,
                contents: chunk,
                segment_size: Some(chunk.len()),
                src_ip: transmit.src_ip,
            },
            &addr,
            &mut hdrs[i],
            &mut iovs[i],
            &mut ctrls[i],
            true,
            state.sendmsg_einval(),
        );
        hdrs[i].msg_datalen = chunk.len();
        cnt += 1;
    }
    loop {
        let n = unsafe { sendmsg_x(io.as_raw_fd(), hdrs.as_ptr(), cnt as u32, 0) };
        if n == -1 {
            let e = io::Error::last_os_error();
            match e.kind() {
                io::ErrorKind::Interrupted => {
                    // Retry the transmission
                    continue;
                }
                io::ErrorKind::WouldBlock => return Err(e),
                _ => {
                    // - EMSGSIZE is expected for MTU probes. Future work might be able to avoid
                    //   these by automatically clamping the MTUD upper bound to the interface MTU.
                    if e.raw_os_error() != Some(libc::EMSGSIZE) {
                        return Err(e);
                    }
                }
            }
        }
        return Ok(());
    }
}
416
/// Sends `transmit` with a single `sendmsg` call.
///
/// `EINTR` is retried, `EMSGSIZE` is reported as success, and every other error (including
/// `WouldBlock`) is returned.
#[cfg(any(target_os = "openbsd", target_os = "netbsd", apple_slow))]
fn send(state: &UdpSocketState, io: SockRef<'_>, transmit: &Transmit<'_>) -> io::Result<()> {
    let dst = socket2::SockAddr::from(transmit.destination);
    let mut msg: libc::msghdr = unsafe { mem::zeroed() };
    let mut data: libc::iovec = unsafe { mem::zeroed() };
    let mut control = cmsg::Aligned([0u8; CMSG_LEN]);
    let encode_src_ip = cfg!(any(apple, target_os = "openbsd", target_os = "netbsd"));
    prepare_msg(
        transmit,
        &dst,
        &mut msg,
        &mut data,
        &mut control,
        encode_src_ip,
        state.sendmsg_einval(),
    );

    loop {
        if unsafe { libc::sendmsg(io.as_raw_fd(), &msg, 0) } != -1 {
            return Ok(());
        }
        let err = io::Error::last_os_error();
        return match err.kind() {
            // Retry the transmission
            io::ErrorKind::Interrupted => continue,
            io::ErrorKind::WouldBlock => Err(err),
            // - EMSGSIZE is expected for MTU probes. Future work might be able to avoid
            //   these by automatically clamping the MTUD upper bound to the interface MTU.
            _ if err.raw_os_error() == Some(libc::EMSGSIZE) => Ok(()),
            _ => Err(err),
        };
    }
}
454
455#[cfg(not(any(apple, target_os = "openbsd", target_os = "netbsd", solarish)))]
456fn recv(io: SockRef<'_>, bufs: &mut [IoSliceMut<'_>], meta: &mut [RecvMeta]) -> io::Result<usize> {
457    let mut names = [MaybeUninit::<libc::sockaddr_storage>::uninit(); BATCH_SIZE];
458    let mut ctrls = [cmsg::Aligned(MaybeUninit::<[u8; CMSG_LEN]>::uninit()); BATCH_SIZE];
459    let mut hdrs = unsafe { mem::zeroed::<[libc::mmsghdr; BATCH_SIZE]>() };
460    let max_msg_count = bufs.len().min(BATCH_SIZE);
461    for i in 0..max_msg_count {
462        prepare_recv(
463            &mut bufs[i],
464            &mut names[i],
465            &mut ctrls[i],
466            &mut hdrs[i].msg_hdr,
467        );
468    }
469    let msg_count = loop {
470        let n = unsafe {
471            libc::recvmmsg(
472                io.as_raw_fd(),
473                hdrs.as_mut_ptr(),
474                bufs.len().min(BATCH_SIZE) as _,
475                0,
476                ptr::null_mut::<libc::timespec>(),
477            )
478        };
479        if n == -1 {
480            let e = io::Error::last_os_error();
481            if e.kind() == io::ErrorKind::Interrupted {
482                continue;
483            }
484            return Err(e);
485        }
486        break n;
487    };
488    for i in 0..(msg_count as usize) {
489        meta[i] = decode_recv(&names[i], &hdrs[i].msg_hdr, hdrs[i].msg_len as usize);
490    }
491    Ok(msg_count as usize)
492}
493
/// Receives a batch of datagrams with a single `recvmsg_x` call.
///
/// At most `min(bufs.len(), meta.len(), BATCH_SIZE)` datagrams are requested, so every
/// datagram taken from the socket has a slot in `meta` to describe it. `EINTR` is retried;
/// any other error (including `WouldBlock`) is returned.
///
/// Returns the number of datagrams received, i.e. the number of leading entries of `bufs`
/// and `meta` that were filled in.
#[cfg(apple_fast)]
fn recv(io: SockRef<'_>, bufs: &mut [IoSliceMut<'_>], meta: &mut [RecvMeta]) -> io::Result<usize> {
    let mut names = [MaybeUninit::<libc::sockaddr_storage>::uninit(); BATCH_SIZE];
    let mut ctrls = [cmsg::Aligned(MaybeUninit::<[u8; CMSG_LEN]>::uninit()); BATCH_SIZE];
    let mut hdrs = unsafe { mem::zeroed::<[msghdr_x; BATCH_SIZE]>() };
    // Never ask the kernel for more datagrams than we have metadata slots for: a datagram
    // that has been dequeued cannot be put back if `meta` turns out to be too short.
    let max_msg_count = bufs.len().min(meta.len()).min(BATCH_SIZE);
    for i in 0..max_msg_count {
        prepare_recv(&mut bufs[i], &mut names[i], &mut ctrls[i], &mut hdrs[i]);
    }
    let msg_count = loop {
        let n = unsafe { recvmsg_x(io.as_raw_fd(), hdrs.as_mut_ptr(), max_msg_count as _, 0) };
        match n {
            -1 => {
                let e = io::Error::last_os_error();
                if e.kind() == io::ErrorKind::Interrupted {
                    continue;
                }
                return Err(e);
            }
            n => break n as usize,
        }
    };
    // `msg_count <= max_msg_count <= meta.len()`, so the zip never drops a datagram.
    for (slot, (name, hdr)) in meta
        .iter_mut()
        .zip(names.iter().zip(hdrs.iter()))
        .take(msg_count)
    {
        *slot = decode_recv(name, hdr, hdr.msg_datalen as usize);
    }
    Ok(msg_count)
}
521
/// Receives a single datagram with `recvmsg` into `bufs[0]`, describing it in `meta[0]`.
///
/// `EINTR` is retried and datagrams flagged `MSG_TRUNC` are discarded and the receive is
/// repeated; any other error (including `WouldBlock`) is returned. On success exactly one
/// datagram has been received and `Ok(1)` is returned.
///
/// # Panics
///
/// Panics if `bufs` or `meta` is empty.
#[cfg(any(target_os = "openbsd", target_os = "netbsd", solarish, apple_slow))]
fn recv(io: SockRef<'_>, bufs: &mut [IoSliceMut<'_>], meta: &mut [RecvMeta]) -> io::Result<usize> {
    let mut name = MaybeUninit::<libc::sockaddr_storage>::uninit();
    let mut ctrl = cmsg::Aligned(MaybeUninit::<[u8; CMSG_LEN]>::uninit());
    let mut hdr = unsafe { mem::zeroed::<libc::msghdr>() };
    let n = loop {
        // A successful `recvmsg` overwrites `msg_namelen`, `msg_controllen` and `msg_flags`
        // with the sizes/flags of the datagram it returned. Reset the header before every
        // attempt so a retry after a truncated datagram again offers the full address and
        // control buffers instead of the shrunken lengths left behind by the previous call.
        prepare_recv(&mut bufs[0], &mut name, &mut ctrl, &mut hdr);
        let n = unsafe { libc::recvmsg(io.as_raw_fd(), &mut hdr, 0) };
        if n == -1 {
            let e = io::Error::last_os_error();
            if e.kind() == io::ErrorKind::Interrupted {
                continue;
            }
            return Err(e);
        }
        // Drop datagrams that did not fit into the buffer and wait for the next one.
        if hdr.msg_flags & libc::MSG_TRUNC != 0 {
            continue;
        }
        break n;
    };
    meta[0] = decode_recv(&name, &hdr, n as usize);
    Ok(1)
}
545
// Size in bytes of every control-message buffer used for sending and receiving.
// `UdpSocketState::new` asserts that this is large enough for a `c_int`-sized message plus,
// on platforms that use one, an `in6_pktinfo`-sized message.
const CMSG_LEN: usize = 88;
547
548fn prepare_msg(
549    transmit: &Transmit<'_>,
550    dst_addr: &socket2::SockAddr,
551    #[cfg(not(apple_fast))] hdr: &mut libc::msghdr,
552    #[cfg(apple_fast)] hdr: &mut msghdr_x,
553    iov: &mut libc::iovec,
554    ctrl: &mut cmsg::Aligned<[u8; CMSG_LEN]>,
555    #[allow(unused_variables)] // only used on FreeBSD & macOS
556    encode_src_ip: bool,
557    sendmsg_einval: bool,
558) {
559    iov.iov_base = transmit.contents.as_ptr() as *const _ as *mut _;
560    iov.iov_len = transmit.contents.len();
561
562    // SAFETY: Casting the pointer to a mutable one is legal,
563    // as sendmsg is guaranteed to not alter the mutable pointer
564    // as per the POSIX spec. See the section on the sys/socket.h
565    // header for details. The type is only mutable in the first
566    // place because it is reused by recvmsg as well.
567    let name = dst_addr.as_ptr() as *mut libc::c_void;
568    let namelen = dst_addr.len();
569    hdr.msg_name = name as *mut _;
570    hdr.msg_namelen = namelen;
571    hdr.msg_iov = iov;
572    hdr.msg_iovlen = 1;
573
574    hdr.msg_control = ctrl.0.as_mut_ptr() as _;
575    hdr.msg_controllen = CMSG_LEN as _;
576    let mut encoder = unsafe { cmsg::Encoder::new(hdr) };
577    let ecn = transmit.ecn.map_or(0, |x| x as libc::c_int);
578    // True for IPv4 or IPv4-Mapped IPv6
579    let is_ipv4 = transmit.destination.is_ipv4()
580        || matches!(transmit.destination.ip(), IpAddr::V6(addr) if addr.to_ipv4_mapped().is_some());
581    if is_ipv4 {
582        if !sendmsg_einval {
583            #[cfg(not(target_os = "netbsd"))]
584            {
585                encoder.push(libc::IPPROTO_IP, libc::IP_TOS, ecn as IpTosTy);
586            }
587        }
588    } else {
589        encoder.push(libc::IPPROTO_IPV6, libc::IPV6_TCLASS, ecn);
590    }
591
592    // Only set the segment size if it is different from the size of the contents.
593    // Some network drivers don't like being told to do GSO even if there is effectively only a single segment.
594    if let Some(segment_size) = transmit
595        .segment_size
596        .filter(|segment_size| *segment_size != transmit.contents.len())
597    {
598        gso::set_segment_size(&mut encoder, segment_size as u16);
599    }
600
601    if let Some(ip) = &transmit.src_ip {
602        match ip {
603            IpAddr::V4(v4) => {
604                #[cfg(any(target_os = "linux", target_os = "android"))]
605                {
606                    let pktinfo = libc::in_pktinfo {
607                        ipi_ifindex: 0,
608                        ipi_spec_dst: libc::in_addr {
609                            s_addr: u32::from_ne_bytes(v4.octets()),
610                        },
611                        ipi_addr: libc::in_addr { s_addr: 0 },
612                    };
613                    encoder.push(libc::IPPROTO_IP, libc::IP_PKTINFO, pktinfo);
614                }
615                #[cfg(any(bsd, apple, solarish))]
616                {
617                    if encode_src_ip {
618                        let addr = libc::in_addr {
619                            s_addr: u32::from_ne_bytes(v4.octets()),
620                        };
621                        encoder.push(libc::IPPROTO_IP, libc::IP_RECVDSTADDR, addr);
622                    }
623                }
624            }
625            IpAddr::V6(v6) => {
626                let pktinfo = libc::in6_pktinfo {
627                    ipi6_ifindex: 0,
628                    ipi6_addr: libc::in6_addr {
629                        s6_addr: v6.octets(),
630                    },
631                };
632                encoder.push(libc::IPPROTO_IPV6, libc::IPV6_PKTINFO, pktinfo);
633            }
634        }
635    }
636
637    encoder.finish();
638}
639
640#[cfg(not(apple_fast))]
641fn prepare_recv(
642    buf: &mut IoSliceMut,
643    name: &mut MaybeUninit<libc::sockaddr_storage>,
644    ctrl: &mut cmsg::Aligned<MaybeUninit<[u8; CMSG_LEN]>>,
645    hdr: &mut libc::msghdr,
646) {
647    hdr.msg_name = name.as_mut_ptr() as _;
648    hdr.msg_namelen = mem::size_of::<libc::sockaddr_storage>() as _;
649    hdr.msg_iov = buf as *mut IoSliceMut as *mut libc::iovec;
650    hdr.msg_iovlen = 1;
651    hdr.msg_control = ctrl.0.as_mut_ptr() as _;
652    hdr.msg_controllen = CMSG_LEN as _;
653    hdr.msg_flags = 0;
654}
655
/// Points `hdr` at `buf`, `name` and `ctrl` so that it is ready for `recvmsg_x`.
///
/// The address, control and data lengths are set to the full capacity of their buffers and
/// the flags are cleared. `hdr` holds raw pointers into the other three arguments afterwards.
#[cfg(apple_fast)]
fn prepare_recv(
    buf: &mut IoSliceMut,
    name: &mut MaybeUninit<libc::sockaddr_storage>,
    ctrl: &mut cmsg::Aligned<MaybeUninit<[u8; CMSG_LEN]>>,
    hdr: &mut msghdr_x,
) {
    // Payload: a single-element scatter/gather list backed by `buf`.
    hdr.msg_datalen = buf.len();
    hdr.msg_iov = (buf as *mut IoSliceMut).cast::<libc::iovec>();
    hdr.msg_iovlen = 1;
    // Sender address.
    hdr.msg_name = name.as_mut_ptr().cast();
    hdr.msg_namelen = mem::size_of::<libc::sockaddr_storage>() as _;
    // Ancillary data.
    hdr.msg_control = ctrl.0.as_mut_ptr().cast();
    hdr.msg_controllen = CMSG_LEN as _;
    hdr.msg_flags = 0;
}
672
673fn decode_recv(
674    name: &MaybeUninit<libc::sockaddr_storage>,
675    #[cfg(not(apple_fast))] hdr: &libc::msghdr,
676    #[cfg(apple_fast)] hdr: &msghdr_x,
677    len: usize,
678) -> RecvMeta {
679    let name = unsafe { name.assume_init() };
680    let mut ecn_bits = 0;
681    let mut dst_ip = None;
682    #[allow(unused_mut)] // only mutable on Linux
683    let mut stride = len;
684
685    let cmsg_iter = unsafe { cmsg::Iter::new(hdr) };
686    for cmsg in cmsg_iter {
687        match (cmsg.cmsg_level, cmsg.cmsg_type) {
688            (libc::IPPROTO_IP, libc::IP_TOS) => unsafe {
689                ecn_bits = cmsg::decode::<u8, libc::cmsghdr>(cmsg);
690            },
691            // FreeBSD uses IP_RECVTOS here, and we can be liberal because cmsgs are opt-in.
692            #[cfg(not(any(target_os = "openbsd", target_os = "netbsd", solarish)))]
693            (libc::IPPROTO_IP, libc::IP_RECVTOS) => unsafe {
694                ecn_bits = cmsg::decode::<u8, libc::cmsghdr>(cmsg);
695            },
696            (libc::IPPROTO_IPV6, libc::IPV6_TCLASS) => unsafe {
697                // Temporary hack around broken macos ABI. Remove once upstream fixes it.
698                // https://bugreport.apple.com/web/?problemID=48761855
699                #[allow(clippy::unnecessary_cast)] // cmsg.cmsg_len defined as size_t
700                if cfg!(apple)
701                    && cmsg.cmsg_len as usize == libc::CMSG_LEN(mem::size_of::<u8>() as _) as usize
702                {
703                    ecn_bits = cmsg::decode::<u8, libc::cmsghdr>(cmsg);
704                } else {
705                    ecn_bits = cmsg::decode::<libc::c_int, libc::cmsghdr>(cmsg) as u8;
706                }
707            },
708            #[cfg(any(target_os = "linux", target_os = "android"))]
709            (libc::IPPROTO_IP, libc::IP_PKTINFO) => {
710                let pktinfo = unsafe { cmsg::decode::<libc::in_pktinfo, libc::cmsghdr>(cmsg) };
711                dst_ip = Some(IpAddr::V4(Ipv4Addr::from(
712                    pktinfo.ipi_addr.s_addr.to_ne_bytes(),
713                )));
714            }
715            #[cfg(any(bsd, apple))]
716            (libc::IPPROTO_IP, libc::IP_RECVDSTADDR) => {
717                let in_addr = unsafe { cmsg::decode::<libc::in_addr, libc::cmsghdr>(cmsg) };
718                dst_ip = Some(IpAddr::V4(Ipv4Addr::from(in_addr.s_addr.to_ne_bytes())));
719            }
720            (libc::IPPROTO_IPV6, libc::IPV6_PKTINFO) => {
721                let pktinfo = unsafe { cmsg::decode::<libc::in6_pktinfo, libc::cmsghdr>(cmsg) };
722                dst_ip = Some(IpAddr::V6(Ipv6Addr::from(pktinfo.ipi6_addr.s6_addr)));
723            }
724            #[cfg(any(target_os = "linux", target_os = "android"))]
725            (libc::SOL_UDP, gro::UDP_GRO) => unsafe {
726                stride = cmsg::decode::<libc::c_int, libc::cmsghdr>(cmsg) as usize;
727            },
728            _ => {}
729        }
730    }
731
732    let addr = match libc::c_int::from(name.ss_family) {
733        libc::AF_INET => {
734            // Safety: if the ss_family field is AF_INET then storage must be a sockaddr_in.
735            let addr: &libc::sockaddr_in =
736                unsafe { &*(&name as *const _ as *const libc::sockaddr_in) };
737            SocketAddr::V4(SocketAddrV4::new(
738                Ipv4Addr::from(addr.sin_addr.s_addr.to_ne_bytes()),
739                u16::from_be(addr.sin_port),
740            ))
741        }
742        libc::AF_INET6 => {
743            // Safety: if the ss_family field is AF_INET6 then storage must be a sockaddr_in6.
744            let addr: &libc::sockaddr_in6 =
745                unsafe { &*(&name as *const _ as *const libc::sockaddr_in6) };
746            SocketAddr::V6(SocketAddrV6::new(
747                Ipv6Addr::from(addr.sin6_addr.s6_addr),
748                u16::from_be(addr.sin6_port),
749                addr.sin6_flowinfo,
750                addr.sin6_scope_id,
751            ))
752        }
753        _ => unreachable!(),
754    };
755
756    RecvMeta {
757        len,
758        stride,
759        addr,
760        ecn: EcnCodepoint::from_bits(ecn_bits),
761        dst_ip,
762    }
763}
764
// Number of message headers, address buffers and control buffers allocated per batched
// send/receive call, and therefore the upper bound on messages handled by one such call.
#[cfg(not(apple_slow))]
// Chosen somewhat arbitrarily; might benefit from additional tuning.
pub(crate) const BATCH_SIZE: usize = 32;

// No batching under the `apple_slow` configuration.
#[cfg(apple_slow)]
pub(crate) const BATCH_SIZE: usize = 1;
771
772#[cfg(any(target_os = "linux", target_os = "android"))]
773mod gso {
774    use super::*;
775
776    #[cfg(not(target_os = "android"))]
777    const UDP_SEGMENT: libc::c_int = libc::UDP_SEGMENT;
778    #[cfg(target_os = "android")]
779    // TODO: Add this to libc
780    const UDP_SEGMENT: libc::c_int = 103;
781
782    /// Checks whether GSO support is available by setting the UDP_SEGMENT
783    /// option on a socket
784    pub(crate) fn max_gso_segments() -> usize {
785        const GSO_SIZE: libc::c_int = 1500;
786
787        let socket = match std::net::UdpSocket::bind("[::]:0")
788            .or_else(|_| std::net::UdpSocket::bind((Ipv4Addr::LOCALHOST, 0)))
789        {
790            Ok(socket) => socket,
791            Err(_) => return 1,
792        };
793
794        // As defined in linux/udp.h
795        // #define UDP_MAX_SEGMENTS        (1 << 6UL)
796        match set_socket_option(&socket, libc::SOL_UDP, UDP_SEGMENT, GSO_SIZE) {
797            Ok(()) => 64,
798            Err(_e) => {
799                crate::log::debug!(
800                    "failed to set `UDP_SEGMENT` socket option ({_e}); setting `max_gso_segments = 1`"
801                );
802
803                1
804            }
805        }
806    }
807
808    pub(crate) fn set_segment_size(encoder: &mut cmsg::Encoder<libc::msghdr>, segment_size: u16) {
809        encoder.push(libc::SOL_UDP, UDP_SEGMENT, segment_size);
810    }
811}
812
// On Apple platforms using the `sendmsg_x` call, UDP datagram segmentation is not
// offloaded to the NIC or even the kernel, but instead done here in user space (in
// [`send`]) and then passed to the OS as individual `iovec`s (up to `BATCH_SIZE`).
#[cfg(not(any(target_os = "linux", target_os = "android")))]
mod gso {
    use super::*;

    // Message header type of the encoder accepted by `set_segment_size`: `msghdr_x` on
    // `apple_fast` builds, the regular `libc::msghdr` everywhere else.
    #[cfg(apple_fast)]
    type MsgHdr = msghdr_x;
    #[cfg(not(apple_fast))]
    type MsgHdr = libc::msghdr;

    /// Returns `BATCH_SIZE` on `apple_fast` builds and 1 on every other platform
    /// covered by this module.
    pub(super) fn max_gso_segments() -> usize {
        if cfg!(apple_fast) {
            BATCH_SIZE
        } else {
            1
        }
    }

    /// No-op: this variant of the module never writes a segment size to the encoder.
    pub(super) fn set_segment_size(_encoder: &mut cmsg::Encoder<MsgHdr>, _segment_size: u16) {}
}
838
839#[cfg(any(target_os = "linux", target_os = "android"))]
840mod gro {
841    use super::*;
842
843    #[cfg(not(target_os = "android"))]
844    pub(crate) const UDP_GRO: libc::c_int = libc::UDP_GRO;
845    #[cfg(target_os = "android")]
846    // TODO: Add this to libc
847    pub(crate) const UDP_GRO: libc::c_int = 104;
848
849    pub(crate) fn gro_segments() -> usize {
850        let socket = match std::net::UdpSocket::bind("[::]:0")
851            .or_else(|_| std::net::UdpSocket::bind((Ipv4Addr::LOCALHOST, 0)))
852        {
853            Ok(socket) => socket,
854            Err(_) => return 1,
855        };
856
857        // As defined in net/ipv4/udp_offload.c
858        // #define UDP_GRO_CNT_MAX 64
859        //
860        // NOTE: this MUST be set to UDP_GRO_CNT_MAX to ensure that the receive buffer size
861        // (get_max_udp_payload_size() * gro_segments()) is large enough to hold the largest GRO
862        // list the kernel might potentially produce. See
863        // https://github.com/quinn-rs/quinn/pull/1354.
864        match set_socket_option(&socket, libc::SOL_UDP, UDP_GRO, OPTION_ON) {
865            Ok(()) => 64,
866            Err(_) => 1,
867        }
868    }
869}
870
871/// Returns whether the given socket option is supported on the current platform
872///
873/// Yields `Ok(true)` if the option was set successfully, `Ok(false)` if setting
874/// the option raised an `ENOPROTOOPT` error, and `Err` for any other error.
875fn set_socket_option_supported(
876    socket: &impl AsRawFd,
877    level: libc::c_int,
878    name: libc::c_int,
879    value: libc::c_int,
880) -> io::Result<bool> {
881    match set_socket_option(socket, level, name, value) {
882        Ok(()) => Ok(true),
883        Err(err) if err.raw_os_error() == Some(libc::ENOPROTOOPT) => Ok(false),
884        Err(err) => Err(err),
885    }
886}
887
888fn set_socket_option(
889    socket: &impl AsRawFd,
890    level: libc::c_int,
891    name: libc::c_int,
892    value: libc::c_int,
893) -> io::Result<()> {
894    let rc = unsafe {
895        libc::setsockopt(
896            socket.as_raw_fd(),
897            level,
898            name,
899            &value as *const _ as _,
900            mem::size_of_val(&value) as _,
901        )
902    };
903
904    match rc == 0 {
905        true => Ok(()),
906        false => Err(io::Error::last_os_error()),
907    }
908}
909
910const OPTION_ON: libc::c_int = 1;
911
#[cfg(not(any(target_os = "linux", target_os = "android")))]
mod gro {
    /// Always reports a single segment on platforms covered by this module.
    pub(super) fn gro_segments() -> usize {
        const SINGLE_SEGMENT: usize = 1;
        SINGLE_SEGMENT
    }
}