quinn_udp/
unix.rs

1#[cfg(not(any(apple, target_os = "openbsd", solarish)))]
2use std::ptr;
3use std::{
4    io::{self, IoSliceMut},
5    mem::{self, MaybeUninit},
6    net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr, SocketAddrV4, SocketAddrV6},
7    os::unix::io::AsRawFd,
8    sync::{
9        Mutex,
10        atomic::{AtomicBool, AtomicUsize, Ordering},
11    },
12    time::Instant,
13};
14
15use socket2::SockRef;
16
17use super::{
18    EcnCodepoint, IO_ERROR_LOG_INTERVAL, RecvMeta, Transmit, UdpSockRef, cmsg, log_sendmsg_error,
19};
20
21// Adapted from https://github.com/apple-oss-distributions/xnu/blob/8d741a5de7ff4191bf97d57b9f54c2f6d4a15585/bsd/sys/socket_private.h
/// Per-message header passed to [`recvmsg_x`] and [`sendmsg_x`].
///
/// `#[repr(C)]` keeps the fields in exactly the declared order, so nothing here may be
/// reordered.  Every field except `msg_datalen` is filled in the same way as the
/// identically named field of `libc::msghdr` by `prepare_msg` / `prepare_recv`.
#[cfg(apple_fast)]
#[repr(C)]
#[allow(non_camel_case_types)]
pub(crate) struct msghdr_x {
    /// Pointer to the peer address (destination on send, storage for the source on receive).
    pub msg_name: *mut libc::c_void,
    /// Size in bytes of the buffer behind `msg_name`.
    pub msg_namelen: libc::socklen_t,
    /// Pointer to the scatter/gather array; this file always passes a single `iovec`.
    pub msg_iov: *mut libc::iovec,
    /// Number of elements behind `msg_iov`.
    pub msg_iovlen: libc::c_int,
    /// Pointer to the control-message (ancillary data) buffer.
    pub msg_control: *mut libc::c_void,
    /// Size in bytes of the buffer behind `msg_control`.
    pub msg_controllen: libc::socklen_t,
    /// Message flags; cleared by `prepare_recv` before receiving.
    pub msg_flags: libc::c_int,
    /// Payload byte count: set to the chunk/buffer length before the call and read back as
    /// the received datagram length after `recvmsg_x`.
    pub msg_datalen: usize,
}
35
// Batched datagram I/O entry points, declared by hand for the `apple_fast` configuration.
#[cfg(apple_fast)]
extern "C" {
    /// Receives up to `cnt` datagrams on socket `s` into the headers starting at `msgp`.
    ///
    /// The caller in this file treats `-1` as failure (reading the OS error afterwards) and
    /// any other return value as the number of headers that were filled in.
    fn recvmsg_x(
        s: libc::c_int,
        msgp: *const msghdr_x,
        cnt: libc::c_uint,
        flags: libc::c_int,
    ) -> isize;

    /// Sends the `cnt` datagrams described by the headers starting at `msgp` on socket `s`.
    ///
    /// The caller in this file only distinguishes `-1` (failure, OS error set) from every
    /// other return value.
    fn sendmsg_x(
        s: libc::c_int,
        msgp: *const msghdr_x,
        cnt: libc::c_uint,
        flags: libc::c_int,
    ) -> isize;
}
52
// Defined in netinet6/in6.h on OpenBSD, this is not yet exported by the libc crate
// directly.  See https://github.com/rust-lang/libc/issues/3704 for when we might be able to
// rely on this from the libc crate.
// The same hard-coded value is used when targeting NetBSD.
#[cfg(any(target_os = "openbsd", target_os = "netbsd"))]
const IPV6_DONTFRAG: libc::c_int = 62;
// Every other target takes the constant straight from the libc crate.
#[cfg(not(any(target_os = "openbsd", target_os = "netbsd")))]
const IPV6_DONTFRAG: libc::c_int = libc::IPV6_DONTFRAG;
60
// Integer type of the `IP_TOS` control-message payload pushed by `prepare_msg`:
// a single byte on FreeBSD, a C `int` elsewhere.
// No alias exists for NetBSD; the only use in `prepare_msg` is compiled out there.
#[cfg(target_os = "freebsd")]
type IpTosTy = libc::c_uchar;
#[cfg(not(any(target_os = "freebsd", target_os = "netbsd")))]
type IpTosTy = libc::c_int;
65
/// Tokio-compatible UDP socket with some useful specializations.
///
/// Unlike a standard tokio UDP socket, this allows ECN bits to be read and written on some
/// platforms.
#[derive(Debug)]
pub struct UdpSocketState {
    /// Timestamp handed to `log_sendmsg_error` whenever [`UdpSocketState::send`] logs a
    /// failure.  Initialised in `new` to two `IO_ERROR_LOG_INTERVAL`s in the past when that
    /// instant is representable.
    last_send_error: Mutex<Instant>,
    /// Current GSO segment limit; see [`UdpSocketState::max_gso_segments`].
    max_gso_segments: AtomicUsize,
    /// GRO segment count determined once in `new`; see [`UdpSocketState::gro_segments`].
    gro_segments: usize,
    /// See [`UdpSocketState::may_fragment`].
    may_fragment: bool,

    /// True if we have received EINVAL error from `sendmsg` system call at least once.
    ///
    /// If enabled, we assume that old kernel is used and switch to fallback mode.
    /// In particular, we do not use IP_TOS cmsg_type in this case,
    /// which is not supported on Linux <3.13 and results in not sending the UDP packet at all.
    sendmsg_einval: AtomicBool,
}
84
85impl UdpSocketState {
    /// Configures `sock` for use with this type and records the capabilities that were found.
    ///
    /// The socket is switched to non-blocking mode.  Depending on the target platform and on
    /// whether the socket's local address is IPv4, socket options for ECN/TOS reporting,
    /// destination-address reporting, GRO and fragmentation avoidance are then enabled.
    ///
    /// # Errors
    ///
    /// Returns the underlying I/O error if the socket cannot be made non-blocking, its local
    /// address cannot be queried, or any option applied with `?` below fails.
    ///
    /// # Panics
    ///
    /// Panics if `CMSG_LEN` is too small for the control messages accounted for below, or if
    /// `cmsg::Aligned` is less strictly aligned than `libc::cmsghdr`.
    pub fn new(sock: UdpSockRef<'_>) -> io::Result<Self> {
        let io = sock.0;
        // Extra control-buffer space needed for an `in6_pktinfo` message on the listed targets.
        let mut cmsg_platform_space = 0;
        if cfg!(target_os = "linux")
            || cfg!(bsd)
            || cfg!(apple)
            || cfg!(target_os = "android")
            || cfg!(solarish)
        {
            cmsg_platform_space +=
                unsafe { libc::CMSG_SPACE(mem::size_of::<libc::in6_pktinfo>() as _) as usize };
        }

        // The fixed-size control buffers must hold one `c_int` message plus the space above.
        assert!(
            CMSG_LEN
                >= unsafe { libc::CMSG_SPACE(mem::size_of::<libc::c_int>() as _) as usize }
                    + cmsg_platform_space
        );
        assert!(
            mem::align_of::<libc::cmsghdr>() <= mem::align_of::<cmsg::Aligned<[u8; 0]>>(),
            "control message buffers will be misaligned"
        );

        io.set_nonblocking(true)?;

        let addr = io.local_addr()?;
        let is_ipv4 = addr.family() == libc::AF_INET as libc::sa_family_t;

        // mac and ios do not support IP_RECVTOS on dual-stack sockets :(
        // older macos versions also don't have the flag and will error out if we don't ignore it
        #[cfg(not(any(target_os = "openbsd", target_os = "netbsd", solarish)))]
        if is_ipv4 || !io.only_v6()? {
            if let Err(_err) =
                set_socket_option(&*io, libc::IPPROTO_IP, libc::IP_RECVTOS, OPTION_ON)
            {
                crate::log::debug!("Ignoring error setting IP_RECVTOS on socket: {_err:?}");
            }
        }

        // Accumulates "option unsupported" results from the fragmentation-related options.
        let mut may_fragment = false;
        #[cfg(any(target_os = "linux", target_os = "android"))]
        {
            // opportunistically try to enable GRO. See gro::gro_segments().
            let _ = set_socket_option(&*io, libc::SOL_UDP, gro::UDP_GRO, OPTION_ON);

            // Forbid IPv4 fragmentation. Set even for IPv6 to account for IPv6 mapped IPv4 addresses.
            // Set `may_fragment` to `true` if this option is not supported on the platform.
            may_fragment |= !set_socket_option_supported(
                &*io,
                libc::IPPROTO_IP,
                libc::IP_MTU_DISCOVER,
                libc::IP_PMTUDISC_PROBE,
            )?;

            if is_ipv4 {
                set_socket_option(&*io, libc::IPPROTO_IP, libc::IP_PKTINFO, OPTION_ON)?;
            } else {
                // Set `may_fragment` to `true` if this option is not supported on the platform.
                may_fragment |= !set_socket_option_supported(
                    &*io,
                    libc::IPPROTO_IPV6,
                    libc::IPV6_MTU_DISCOVER,
                    libc::IPV6_PMTUDISC_PROBE,
                )?;
            }
        }
        #[cfg(any(target_os = "freebsd", apple))]
        {
            if is_ipv4 {
                // Set `may_fragment` to `true` if this option is not supported on the platform.
                may_fragment |= !set_socket_option_supported(
                    &*io,
                    libc::IPPROTO_IP,
                    libc::IP_DONTFRAG,
                    OPTION_ON,
                )?;
            }
        }
        #[cfg(any(bsd, apple, solarish))]
        // IP_RECVDSTADDR == IP_SENDSRCADDR on FreeBSD
        // macOS uses only IP_RECVDSTADDR, no IP_SENDSRCADDR on macOS (the same on Solaris)
        // macOS also supports IP_PKTINFO
        {
            if is_ipv4 {
                set_socket_option(&*io, libc::IPPROTO_IP, libc::IP_RECVDSTADDR, OPTION_ON)?;
            }
        }

        // Options standardized in RFC 3542
        if !is_ipv4 {
            set_socket_option(&*io, libc::IPPROTO_IPV6, libc::IPV6_RECVPKTINFO, OPTION_ON)?;
            set_socket_option(&*io, libc::IPPROTO_IPV6, libc::IPV6_RECVTCLASS, OPTION_ON)?;
            // Linux's IP_PMTUDISC_PROBE allows us to operate under interface MTU rather than the
            // kernel's path MTU guess, but actually disabling fragmentation requires this too. See
            // __ip6_append_data in ip6_output.c.
            // Set `may_fragment` to `true` if this option is not supported on the platform.
            may_fragment |=
                !set_socket_option_supported(&*io, libc::IPPROTO_IPV6, IPV6_DONTFRAG, OPTION_ON)?;
        }

        let now = Instant::now();
        Ok(Self {
            // Start two log intervals in the past (falling back to `now` if that instant is
            // not representable).
            last_send_error: Mutex::new(now.checked_sub(2 * IO_ERROR_LOG_INTERVAL).unwrap_or(now)),
            max_gso_segments: AtomicUsize::new(gso::max_gso_segments()),
            gro_segments: gro::gro_segments(),
            may_fragment,
            sendmsg_einval: AtomicBool::new(false),
        })
    }
195
196    /// Sends a [`Transmit`] on the given socket.
197    ///
198    /// This function will only ever return errors of kind [`io::ErrorKind::WouldBlock`].
199    /// All other errors will be logged and converted to `Ok`.
200    ///
201    /// UDP transmission errors are considered non-fatal because higher-level protocols must
202    /// employ retransmits and timeouts anyway in order to deal with UDP's unreliable nature.
203    /// Thus, logging is most likely the only thing you can do with these errors.
204    ///
205    /// If you would like to handle these errors yourself, use [`UdpSocketState::try_send`]
206    /// instead.
207    pub fn send(&self, socket: UdpSockRef<'_>, transmit: &Transmit<'_>) -> io::Result<()> {
208        match send(self, socket.0, transmit) {
209            Ok(()) => Ok(()),
210            Err(e) if e.kind() == io::ErrorKind::WouldBlock => Err(e),
211            Err(e) => {
212                log_sendmsg_error(&self.last_send_error, e, transmit);
213
214                Ok(())
215            }
216        }
217    }
218
    /// Sends a [`Transmit`] on the given socket without any additional error handling.
    ///
    /// # Errors
    ///
    /// Returns whatever error the platform-specific `send` function in this file reports,
    /// including [`io::ErrorKind::WouldBlock`]; nothing is logged or suppressed here.
    pub fn try_send(&self, socket: UdpSockRef<'_>, transmit: &Transmit<'_>) -> io::Result<()> {
        send(self, socket.0, transmit)
    }
223
    /// Receives datagrams from `socket` into `bufs`, describing each one in `meta`.
    ///
    /// Delegates to the platform-specific `recv` function in this file.  On success the
    /// return value is the number of leading `meta` entries that were written.
    ///
    /// # Errors
    ///
    /// Returns the error reported by the platform-specific `recv` function.
    pub fn recv(
        &self,
        socket: UdpSockRef<'_>,
        bufs: &mut [IoSliceMut<'_>],
        meta: &mut [RecvMeta],
    ) -> io::Result<usize> {
        recv(socket.0, bufs, meta)
    }
232
    /// The maximum amount of segments which can be transmitted if a platform
    /// supports Generic Send Offload (GSO).
    ///
    /// This is 1 if the platform doesn't support GSO. Subject to change if errors are detected
    /// while using GSO.
    ///
    /// The value is initialised from `gso::max_gso_segments()` in `new`; the Linux/Android
    /// `send` path stores 1 after `sendmsg` fails with `EIO` or `EINVAL`.
    #[inline]
    pub fn max_gso_segments(&self) -> usize {
        self.max_gso_segments.load(Ordering::Relaxed)
    }
242
    /// The number of segments to read when GRO is enabled. Used as a factor to
    /// compute the receive buffer size.
    ///
    /// Returns 1 if the platform doesn't support GRO.
    ///
    /// The value is taken from `gro::gro_segments()` once, in `new`, and never changes.
    #[inline]
    pub fn gro_segments(&self) -> usize {
        self.gro_segments
    }
251
    /// Resize the send buffer of `socket` to `bytes`
    ///
    /// Forwards to the socket's own `set_send_buffer_size`; no state of `self` is involved.
    ///
    /// # Errors
    ///
    /// Returns the error reported by the underlying call.
    #[inline]
    pub fn set_send_buffer_size(&self, socket: UdpSockRef<'_>, bytes: usize) -> io::Result<()> {
        socket.0.set_send_buffer_size(bytes)
    }
257
    /// Resize the receive buffer of `socket` to `bytes`
    ///
    /// Forwards to the socket's own `set_recv_buffer_size`; no state of `self` is involved.
    ///
    /// # Errors
    ///
    /// Returns the error reported by the underlying call.
    #[inline]
    pub fn set_recv_buffer_size(&self, socket: UdpSockRef<'_>, bytes: usize) -> io::Result<()> {
        socket.0.set_recv_buffer_size(bytes)
    }
263
    /// Get the size of the `socket` send buffer
    ///
    /// Forwards to the socket's own `send_buffer_size`; no state of `self` is involved.
    ///
    /// # Errors
    ///
    /// Returns the error reported by the underlying call.
    #[inline]
    pub fn send_buffer_size(&self, socket: UdpSockRef<'_>) -> io::Result<usize> {
        socket.0.send_buffer_size()
    }
269
    /// Get the size of the `socket` receive buffer
    ///
    /// Forwards to the socket's own `recv_buffer_size`; no state of `self` is involved.
    ///
    /// # Errors
    ///
    /// Returns the error reported by the underlying call.
    #[inline]
    pub fn recv_buffer_size(&self, socket: UdpSockRef<'_>) -> io::Result<usize> {
        socket.0.recv_buffer_size()
    }
275
    /// Whether transmitted datagrams might get fragmented by the IP layer
    ///
    /// Returns `false` on targets which employ e.g. the `IPV6_DONTFRAG` socket option.
    ///
    /// The flag is computed once in `new`: it is `true` if any fragmentation-related socket
    /// option attempted there was reported as unsupported.
    #[inline]
    pub fn may_fragment(&self) -> bool {
        self.may_fragment
    }
283
    /// Returns true if we previously got an EINVAL error from `sendmsg` syscall.
    ///
    /// The result is passed to `prepare_msg`, which omits the `IP_TOS` control message when
    /// it is `true`.
    fn sendmsg_einval(&self) -> bool {
        self.sendmsg_einval.load(Ordering::Relaxed)
    }
288
    /// Sets the flag indicating we got EINVAL error from `sendmsg` syscall.
    ///
    /// Nothing in this impl clears the flag again.  The method only exists on targets whose
    /// `send` implementation retries after `EINVAL`.
    #[cfg(not(any(apple, target_os = "openbsd", target_os = "netbsd")))]
    fn set_sendmsg_einval(&self) {
        self.sendmsg_einval.store(true, Ordering::Relaxed)
    }
294}
295
296#[cfg(not(any(apple, target_os = "openbsd", target_os = "netbsd")))]
297fn send(
298    #[allow(unused_variables)] // only used on Linux
299    state: &UdpSocketState,
300    io: SockRef<'_>,
301    transmit: &Transmit<'_>,
302) -> io::Result<()> {
303    #[allow(unused_mut)] // only mutable on FreeBSD
304    let mut encode_src_ip = true;
305    #[cfg(target_os = "freebsd")]
306    {
307        let addr = io.local_addr()?;
308        let is_ipv4 = addr.family() == libc::AF_INET as libc::sa_family_t;
309        if is_ipv4 {
310            if let Some(socket) = addr.as_socket_ipv4() {
311                encode_src_ip = socket.ip() == &Ipv4Addr::UNSPECIFIED;
312            }
313        }
314    }
315    let mut msg_hdr: libc::msghdr = unsafe { mem::zeroed() };
316    let mut iovec: libc::iovec = unsafe { mem::zeroed() };
317    let mut cmsgs = cmsg::Aligned([0u8; CMSG_LEN]);
318    let dst_addr = socket2::SockAddr::from(transmit.destination);
319    prepare_msg(
320        transmit,
321        &dst_addr,
322        &mut msg_hdr,
323        &mut iovec,
324        &mut cmsgs,
325        encode_src_ip,
326        state.sendmsg_einval(),
327    );
328
329    loop {
330        let n = unsafe { libc::sendmsg(io.as_raw_fd(), &msg_hdr, 0) };
331        if n == -1 {
332            let e = io::Error::last_os_error();
333            match e.kind() {
334                io::ErrorKind::Interrupted => {
335                    // Retry the transmission
336                    continue;
337                }
338                io::ErrorKind::WouldBlock => return Err(e),
339                _ => {
340                    // Some network adapters and drivers do not support GSO. Unfortunately, Linux
341                    // offers no easy way for us to detect this short of an EIO or sometimes EINVAL
342                    // when we try to actually send datagrams using it.
343                    #[cfg(any(target_os = "linux", target_os = "android"))]
344                    if let Some(libc::EIO) | Some(libc::EINVAL) = e.raw_os_error() {
345                        // Prevent new transmits from being scheduled using GSO. Existing GSO transmits
346                        // may already be in the pipeline, so we need to tolerate additional failures.
347                        if state.max_gso_segments() > 1 {
348                            crate::log::info!(
349                                "`libc::sendmsg` failed with {e}; halting segmentation offload"
350                            );
351                            state
352                                .max_gso_segments
353                                .store(1, std::sync::atomic::Ordering::Relaxed);
354                        }
355                    }
356
357                    // Some arguments to `sendmsg` are not supported. Switch to
358                    // fallback mode and retry if we haven't already.
359                    if e.raw_os_error() == Some(libc::EINVAL) && !state.sendmsg_einval() {
360                        state.set_sendmsg_einval();
361                        prepare_msg(
362                            transmit,
363                            &dst_addr,
364                            &mut msg_hdr,
365                            &mut iovec,
366                            &mut cmsgs,
367                            encode_src_ip,
368                            state.sendmsg_einval(),
369                        );
370                        continue;
371                    }
372
373                    // - EMSGSIZE is expected for MTU probes. Future work might be able to avoid
374                    //   these by automatically clamping the MTUD upper bound to the interface MTU.
375                    if e.raw_os_error() != Some(libc::EMSGSIZE) {
376                        return Err(e);
377                    }
378                }
379            }
380        }
381        return Ok(());
382    }
383}
384
/// Sends `transmit` with one `sendmsg_x` call, splitting it into segments in user space.
///
/// The payload is cut into chunks of `transmit.segment_size` bytes (the whole payload when
/// no segment size is given); each chunk becomes its own message header, up to `BATCH_SIZE`.
/// An empty payload is sent as a single zero-length datagram instead of panicking in
/// `chunks` or sending nothing at all.
///
/// `EINTR` is retried and `EMSGSIZE` is reported as success; every other error, including
/// `WouldBlock`, is returned.
///
/// # Panics
///
/// Panics if the payload is non-empty and `transmit.segment_size` is `Some(0)`.
#[cfg(apple_fast)]
fn send(state: &UdpSocketState, io: SockRef<'_>, transmit: &Transmit<'_>) -> io::Result<()> {
    let mut hdrs = unsafe { mem::zeroed::<[msghdr_x; BATCH_SIZE]>() };
    let mut iovs = unsafe { mem::zeroed::<[libc::iovec; BATCH_SIZE]>() };
    let mut ctrls = [cmsg::Aligned([0u8; CMSG_LEN]); BATCH_SIZE];
    let addr = socket2::SockAddr::from(transmit.destination);
    // `chunks` and `div_ceil` both panic on a zero divisor, which is what an empty payload
    // without an explicit segment size would otherwise produce.
    let segment_size = if transmit.contents.is_empty() {
        1
    } else {
        transmit.segment_size.unwrap_or(transmit.contents.len())
    };
    let mut cnt = 0;
    debug_assert!(transmit.contents.len().div_ceil(segment_size) <= BATCH_SIZE);
    // `chunks` yields nothing for an empty payload; fall back to the (empty) payload itself
    // so that exactly one zero-length datagram is prepared in that case.
    let mut segments = transmit.contents.chunks(segment_size);
    let first = segments.next().unwrap_or(transmit.contents);
    for (i, chunk) in std::iter::once(first)
        .chain(segments)
        .enumerate()
        .take(BATCH_SIZE)
    {
        prepare_msg(
            &Transmit {
                destination: transmit.destination,
                ecn: transmit.ecn,
                contents: chunk,
                segment_size: Some(chunk.len()),
                src_ip: transmit.src_ip,
            },
            &addr,
            &mut hdrs[i],
            &mut iovs[i],
            &mut ctrls[i],
            true,
            state.sendmsg_einval(),
        );
        hdrs[i].msg_datalen = chunk.len();
        cnt += 1;
    }
    loop {
        let n = unsafe { sendmsg_x(io.as_raw_fd(), hdrs.as_ptr(), cnt as u32, 0) };
        if n == -1 {
            let e = io::Error::last_os_error();
            match e.kind() {
                io::ErrorKind::Interrupted => {
                    // Retry the transmission
                    continue;
                }
                io::ErrorKind::WouldBlock => return Err(e),
                _ => {
                    // - EMSGSIZE is expected for MTU probes. Future work might be able to avoid
                    //   these by automatically clamping the MTUD upper bound to the interface MTU.
                    if e.raw_os_error() != Some(libc::EMSGSIZE) {
                        return Err(e);
                    }
                }
            }
        }
        return Ok(());
    }
}
440
/// Sends `transmit` with a single plain `sendmsg` call.
///
/// `EINTR` is retried and `EMSGSIZE` is reported as success; every other error, including
/// `WouldBlock`, is returned to the caller.
#[cfg(any(target_os = "openbsd", target_os = "netbsd", apple_slow))]
fn send(state: &UdpSocketState, io: SockRef<'_>, transmit: &Transmit<'_>) -> io::Result<()> {
    let mut msg: libc::msghdr = unsafe { mem::zeroed() };
    let mut payload: libc::iovec = unsafe { mem::zeroed() };
    let mut control = cmsg::Aligned([0u8; CMSG_LEN]);
    let destination = socket2::SockAddr::from(transmit.destination);
    let encode_src_ip = cfg!(apple) || cfg!(target_os = "openbsd") || cfg!(target_os = "netbsd");
    prepare_msg(
        transmit,
        &destination,
        &mut msg,
        &mut payload,
        &mut control,
        encode_src_ip,
        state.sendmsg_einval(),
    );

    loop {
        if unsafe { libc::sendmsg(io.as_raw_fd(), &msg, 0) } != -1 {
            return Ok(());
        }

        let err = io::Error::last_os_error();
        return match err.kind() {
            // Retry the transmission
            io::ErrorKind::Interrupted => continue,
            io::ErrorKind::WouldBlock => Err(err),
            // - EMSGSIZE is expected for MTU probes. Future work might be able to avoid
            //   these by automatically clamping the MTUD upper bound to the interface MTU.
            _ if err.raw_os_error() == Some(libc::EMSGSIZE) => Ok(()),
            _ => Err(err),
        };
    }
}
478
479#[cfg(not(any(apple, target_os = "openbsd", target_os = "netbsd", solarish)))]
480fn recv(io: SockRef<'_>, bufs: &mut [IoSliceMut<'_>], meta: &mut [RecvMeta]) -> io::Result<usize> {
481    let mut names = [MaybeUninit::<libc::sockaddr_storage>::uninit(); BATCH_SIZE];
482    let mut ctrls = [cmsg::Aligned(MaybeUninit::<[u8; CMSG_LEN]>::uninit()); BATCH_SIZE];
483    let mut hdrs = unsafe { mem::zeroed::<[libc::mmsghdr; BATCH_SIZE]>() };
484    let max_msg_count = bufs.len().min(BATCH_SIZE);
485    for i in 0..max_msg_count {
486        prepare_recv(
487            &mut bufs[i],
488            &mut names[i],
489            &mut ctrls[i],
490            &mut hdrs[i].msg_hdr,
491        );
492    }
493    let msg_count = loop {
494        let n = unsafe {
495            libc::recvmmsg(
496                io.as_raw_fd(),
497                hdrs.as_mut_ptr(),
498                bufs.len().min(BATCH_SIZE) as _,
499                0,
500                ptr::null_mut::<libc::timespec>(),
501            )
502        };
503        if n == -1 {
504            let e = io::Error::last_os_error();
505            if e.kind() == io::ErrorKind::Interrupted {
506                continue;
507            }
508            return Err(e);
509        }
510        break n;
511    };
512    for i in 0..(msg_count as usize) {
513        meta[i] = decode_recv(&names[i], &hdrs[i].msg_hdr, hdrs[i].msg_len as usize);
514    }
515    Ok(msg_count as usize)
516}
517
/// Receives a batch of datagrams with `recvmsg_x`.
///
/// At most `min(bufs.len(), meta.len(), BATCH_SIZE)` datagrams are requested, so no datagram
/// is taken from the socket without a `meta` slot to describe it.  Buffer `i` receives the
/// payload described by `meta[i]`.
///
/// Returns the number of datagrams received, which is also the number of leading `meta`
/// entries that were written.  Returns `Ok(0)` without touching the socket when `bufs` or
/// `meta` is empty.
///
/// # Errors
///
/// Returns the OS error reported by `recvmsg_x`, except for `EINTR`, which is retried.
#[cfg(apple_fast)]
fn recv(io: SockRef<'_>, bufs: &mut [IoSliceMut<'_>], meta: &mut [RecvMeta]) -> io::Result<usize> {
    let mut names = [MaybeUninit::<libc::sockaddr_storage>::uninit(); BATCH_SIZE];
    let mut ctrls = [cmsg::Aligned(MaybeUninit::<[u8; CMSG_LEN]>::uninit()); BATCH_SIZE];
    let mut hdrs = unsafe { mem::zeroed::<[msghdr_x; BATCH_SIZE]>() };
    // Bound the batch by every per-message resource, including the caller's `meta` slots.
    let max_msg_count = bufs.len().min(meta.len()).min(BATCH_SIZE);
    if max_msg_count == 0 {
        return Ok(0);
    }
    for i in 0..max_msg_count {
        prepare_recv(&mut bufs[i], &mut names[i], &mut ctrls[i], &mut hdrs[i]);
    }
    let msg_count = loop {
        let n = unsafe { recvmsg_x(io.as_raw_fd(), hdrs.as_mut_ptr(), max_msg_count as _, 0) };
        match n {
            -1 => {
                let e = io::Error::last_os_error();
                if e.kind() == io::ErrorKind::Interrupted {
                    continue;
                }
                return Err(e);
            }
            n => break n as usize,
        }
    };
    for (slot, (name, hdr)) in meta
        .iter_mut()
        .zip(names.iter().zip(hdrs.iter()))
        .take(msg_count)
    {
        *slot = decode_recv(name, hdr, hdr.msg_datalen);
    }
    Ok(msg_count)
}
545
/// Receives exactly one datagram with `recvmsg` into `bufs[0]` and describes it in `meta[0]`.
///
/// Datagrams flagged `MSG_TRUNC` are discarded and the call is repeated.  `recvmsg`
/// overwrites `msg_namelen`, `msg_controllen` and `msg_flags` on return, so the header is
/// re-armed before every attempt; otherwise a retry would run with whatever sizes the
/// discarded datagram happened to leave behind.
///
/// Always returns `Ok(1)` on success.
///
/// # Errors
///
/// Returns the OS error reported by `recvmsg`, except for `EINTR`, which is retried.
///
/// # Panics
///
/// Panics if `bufs` or `meta` is empty.
#[cfg(any(target_os = "openbsd", target_os = "netbsd", solarish, apple_slow))]
fn recv(io: SockRef<'_>, bufs: &mut [IoSliceMut<'_>], meta: &mut [RecvMeta]) -> io::Result<usize> {
    let mut name = MaybeUninit::<libc::sockaddr_storage>::uninit();
    let mut ctrl = cmsg::Aligned(MaybeUninit::<[u8; CMSG_LEN]>::uninit());
    let mut hdr = unsafe { mem::zeroed::<libc::msghdr>() };
    let n = loop {
        // Restore the full name/control capacities that a previous attempt may have shrunk.
        prepare_recv(&mut bufs[0], &mut name, &mut ctrl, &mut hdr);
        let n = unsafe { libc::recvmsg(io.as_raw_fd(), &mut hdr, 0) };
        if n == -1 {
            let e = io::Error::last_os_error();
            if e.kind() == io::ErrorKind::Interrupted {
                continue;
            }
            return Err(e);
        }
        if hdr.msg_flags & libc::MSG_TRUNC != 0 {
            continue;
        }
        break n;
    };
    meta[0] = decode_recv(&name, &hdr, n as usize);
    Ok(1)
}
569
/// Size in bytes of every control-message buffer used for sending and receiving.
///
/// `UdpSocketState::new` asserts that this is large enough for the control messages it
/// accounts for on the current platform.
const CMSG_LEN: usize = 88;
571
/// Fills `hdr`, `iov` and `ctrl` so that `hdr` describes `transmit` for a send call.
///
/// * `iov` is pointed at `transmit.contents` and becomes the only entry of `hdr.msg_iov`.
/// * `hdr.msg_name` is pointed at `dst_addr`.
/// * `ctrl` receives the control messages: the ECN value (as `IP_TOS` for IPv4 and
///   IPv4-mapped destinations unless `sendmsg_einval` is set or the target is NetBSD, as
///   `IPV6_TCLASS` otherwise), the GSO segment size when it differs from the payload length,
///   and the source address from `transmit.src_ip` if present.
///
/// `encode_src_ip` gates the IPv4 source-address message on the BSD/Apple/solarish branch.
///
/// `hdr` ends up holding raw pointers into `transmit.contents`, `dst_addr`, `iov` and
/// `ctrl`; all of them must stay alive and unmoved for as long as `hdr` is used.
fn prepare_msg(
    transmit: &Transmit<'_>,
    dst_addr: &socket2::SockAddr,
    #[cfg(not(apple_fast))] hdr: &mut libc::msghdr,
    #[cfg(apple_fast)] hdr: &mut msghdr_x,
    iov: &mut libc::iovec,
    ctrl: &mut cmsg::Aligned<[u8; CMSG_LEN]>,
    #[allow(unused_variables)] // only used on FreeBSD & macOS
    encode_src_ip: bool,
    sendmsg_einval: bool,
) {
    iov.iov_base = transmit.contents.as_ptr() as *const _ as *mut _;
    iov.iov_len = transmit.contents.len();

    // SAFETY: Casting the pointer to a mutable one is legal,
    // as sendmsg is guaranteed to not alter the mutable pointer
    // as per the POSIX spec. See the section on the sys/socket.h
    // header for details. The type is only mutable in the first
    // place because it is reused by recvmsg as well.
    let name = dst_addr.as_ptr() as *mut libc::c_void;
    let namelen = dst_addr.len();
    hdr.msg_name = name as *mut _;
    hdr.msg_namelen = namelen;
    hdr.msg_iov = iov;
    hdr.msg_iovlen = 1;

    // Offer the whole buffer to the encoder.
    hdr.msg_control = ctrl.0.as_mut_ptr() as _;
    hdr.msg_controllen = CMSG_LEN as _;
    let mut encoder = unsafe { cmsg::Encoder::new(hdr) };
    // A missing ECN codepoint is encoded as 0.
    let ecn = transmit.ecn.map_or(0, |x| x as libc::c_int);
    // True for IPv4 or IPv4-Mapped IPv6
    let is_ipv4 = transmit.destination.is_ipv4()
        || matches!(transmit.destination.ip(), IpAddr::V6(addr) if addr.to_ipv4_mapped().is_some());
    if is_ipv4 {
        // Skipped in fallback mode, i.e. after `sendmsg` has failed with EINVAL once.
        if !sendmsg_einval {
            #[cfg(not(target_os = "netbsd"))]
            {
                encoder.push(libc::IPPROTO_IP, libc::IP_TOS, ecn as IpTosTy);
            }
        }
    } else {
        encoder.push(libc::IPPROTO_IPV6, libc::IPV6_TCLASS, ecn);
    }

    // Only set the segment size if it is different from the size of the contents.
    // Some network drivers don't like being told to do GSO even if there is effectively only a single segment.
    if let Some(segment_size) = transmit
        .segment_size
        .filter(|segment_size| *segment_size != transmit.contents.len())
    {
        gso::set_segment_size(&mut encoder, segment_size as u16);
    }

    if let Some(ip) = &transmit.src_ip {
        match ip {
            IpAddr::V4(v4) => {
                #[cfg(any(target_os = "linux", target_os = "android"))]
                {
                    // The requested source address goes into `ipi_spec_dst`; the interface
                    // index and `ipi_addr` are left zeroed.
                    let pktinfo = libc::in_pktinfo {
                        ipi_ifindex: 0,
                        ipi_spec_dst: libc::in_addr {
                            s_addr: u32::from_ne_bytes(v4.octets()),
                        },
                        ipi_addr: libc::in_addr { s_addr: 0 },
                    };
                    encoder.push(libc::IPPROTO_IP, libc::IP_PKTINFO, pktinfo);
                }
                #[cfg(any(bsd, apple, solarish))]
                {
                    if encode_src_ip {
                        let addr = libc::in_addr {
                            s_addr: u32::from_ne_bytes(v4.octets()),
                        };
                        encoder.push(libc::IPPROTO_IP, libc::IP_RECVDSTADDR, addr);
                    }
                }
            }
            IpAddr::V6(v6) => {
                let pktinfo = libc::in6_pktinfo {
                    ipi6_ifindex: 0,
                    ipi6_addr: libc::in6_addr {
                        s6_addr: v6.octets(),
                    },
                };
                encoder.push(libc::IPPROTO_IPV6, libc::IPV6_PKTINFO, pktinfo);
            }
        }
    }

    encoder.finish();
}
663
/// Points `hdr` at the receive storage for one datagram.
///
/// `name` receives the sender address, `buf` the payload (as the single `iovec`) and `ctrl`
/// the control messages; each length field is set to the full capacity of its buffer and
/// `msg_flags` is cleared.
///
/// `hdr` ends up holding raw pointers to `buf`, `name` and `ctrl`; all of them must stay
/// alive and unmoved for as long as `hdr` is used.
#[cfg(not(apple_fast))]
fn prepare_recv(
    buf: &mut IoSliceMut,
    name: &mut MaybeUninit<libc::sockaddr_storage>,
    ctrl: &mut cmsg::Aligned<MaybeUninit<[u8; CMSG_LEN]>>,
    hdr: &mut libc::msghdr,
) {
    hdr.msg_name = name.as_mut_ptr() as _;
    hdr.msg_namelen = mem::size_of::<libc::sockaddr_storage>() as _;
    // The `IoSliceMut` itself is reinterpreted as the `iovec` describing the payload buffer.
    hdr.msg_iov = buf as *mut IoSliceMut as *mut libc::iovec;
    hdr.msg_iovlen = 1;
    hdr.msg_control = ctrl.0.as_mut_ptr() as _;
    hdr.msg_controllen = CMSG_LEN as _;
    hdr.msg_flags = 0;
}
679
/// Points `hdr` at the receive storage for one datagram (`msghdr_x` variant).
///
/// `name` receives the sender address, `buf` the payload (as the single `iovec`) and `ctrl`
/// the control messages; each length field is set to the full capacity of its buffer,
/// `msg_flags` is cleared and `msg_datalen` is set to the length of `buf`.
///
/// `hdr` ends up holding raw pointers to `buf`, `name` and `ctrl`; all of them must stay
/// alive and unmoved for as long as `hdr` is used.
#[cfg(apple_fast)]
fn prepare_recv(
    buf: &mut IoSliceMut,
    name: &mut MaybeUninit<libc::sockaddr_storage>,
    ctrl: &mut cmsg::Aligned<MaybeUninit<[u8; CMSG_LEN]>>,
    hdr: &mut msghdr_x,
) {
    hdr.msg_name = name.as_mut_ptr() as _;
    hdr.msg_namelen = mem::size_of::<libc::sockaddr_storage>() as _;
    // The `IoSliceMut` itself is reinterpreted as the `iovec` describing the payload buffer.
    hdr.msg_iov = buf as *mut IoSliceMut as *mut libc::iovec;
    hdr.msg_iovlen = 1;
    hdr.msg_control = ctrl.0.as_mut_ptr() as _;
    hdr.msg_controllen = CMSG_LEN as _;
    hdr.msg_flags = 0;
    hdr.msg_datalen = buf.len();
}
696
/// Builds the [`RecvMeta`] for one received datagram.
///
/// * `name` is the address storage the receive call filled in; it is read as a
///   `sockaddr_in` or `sockaddr_in6` according to its `ss_family`.
/// * `hdr` is the message header whose control messages are scanned for the ECN bits, the
///   destination address and (Linux/Android) the GRO segment size.
/// * `len` is the number of payload bytes received; it is reported as `len` and, unless a
///   `UDP_GRO` control message overrides it, also as `stride`.
///
/// # Panics
///
/// Panics if the address family in `name` is neither `AF_INET` nor `AF_INET6`.
fn decode_recv(
    name: &MaybeUninit<libc::sockaddr_storage>,
    #[cfg(not(apple_fast))] hdr: &libc::msghdr,
    #[cfg(apple_fast)] hdr: &msghdr_x,
    len: usize,
) -> RecvMeta {
    // Relies on the receive call having written the sender address into `name`.
    let name = unsafe { name.assume_init() };
    let mut ecn_bits = 0;
    let mut dst_ip = None;
    #[allow(unused_mut)] // only mutable on Linux
    let mut stride = len;

    let cmsg_iter = unsafe { cmsg::Iter::new(hdr) };
    for cmsg in cmsg_iter {
        match (cmsg.cmsg_level, cmsg.cmsg_type) {
            (libc::IPPROTO_IP, libc::IP_TOS) => unsafe {
                ecn_bits = cmsg::decode::<u8, libc::cmsghdr>(cmsg);
            },
            // FreeBSD uses IP_RECVTOS here, and we can be liberal because cmsgs are opt-in.
            #[cfg(not(any(target_os = "openbsd", target_os = "netbsd", solarish)))]
            (libc::IPPROTO_IP, libc::IP_RECVTOS) => unsafe {
                ecn_bits = cmsg::decode::<u8, libc::cmsghdr>(cmsg);
            },
            (libc::IPPROTO_IPV6, libc::IPV6_TCLASS) => unsafe {
                // Temporary hack around broken macos ABI. Remove once upstream fixes it.
                // https://bugreport.apple.com/web/?problemID=48761855
                #[allow(clippy::unnecessary_cast)] // cmsg.cmsg_len defined as size_t
                if cfg!(apple)
                    && cmsg.cmsg_len as usize == libc::CMSG_LEN(mem::size_of::<u8>() as _) as usize
                {
                    ecn_bits = cmsg::decode::<u8, libc::cmsghdr>(cmsg);
                } else {
                    ecn_bits = cmsg::decode::<libc::c_int, libc::cmsghdr>(cmsg) as u8;
                }
            },
            #[cfg(any(target_os = "linux", target_os = "android"))]
            (libc::IPPROTO_IP, libc::IP_PKTINFO) => {
                let pktinfo = unsafe { cmsg::decode::<libc::in_pktinfo, libc::cmsghdr>(cmsg) };
                dst_ip = Some(IpAddr::V4(Ipv4Addr::from(
                    pktinfo.ipi_addr.s_addr.to_ne_bytes(),
                )));
            }
            // NOTE(review): `UdpSocketState::new` enables IP_RECVDSTADDR on `solarish` as
            // well, but this arm is only compiled for `bsd`/`apple` — confirm this is intended.
            #[cfg(any(bsd, apple))]
            (libc::IPPROTO_IP, libc::IP_RECVDSTADDR) => {
                let in_addr = unsafe { cmsg::decode::<libc::in_addr, libc::cmsghdr>(cmsg) };
                dst_ip = Some(IpAddr::V4(Ipv4Addr::from(in_addr.s_addr.to_ne_bytes())));
            }
            (libc::IPPROTO_IPV6, libc::IPV6_PKTINFO) => {
                let pktinfo = unsafe { cmsg::decode::<libc::in6_pktinfo, libc::cmsghdr>(cmsg) };
                dst_ip = Some(IpAddr::V6(Ipv6Addr::from(pktinfo.ipi6_addr.s6_addr)));
            }
            #[cfg(any(target_os = "linux", target_os = "android"))]
            (libc::SOL_UDP, gro::UDP_GRO) => unsafe {
                stride = cmsg::decode::<libc::c_int, libc::cmsghdr>(cmsg) as usize;
            },
            // Control messages of any other kind are ignored.
            _ => {}
        }
    }

    let addr = match libc::c_int::from(name.ss_family) {
        libc::AF_INET => {
            // Safety: if the ss_family field is AF_INET then storage must be a sockaddr_in.
            let addr: &libc::sockaddr_in =
                unsafe { &*(&name as *const _ as *const libc::sockaddr_in) };
            SocketAddr::V4(SocketAddrV4::new(
                Ipv4Addr::from(addr.sin_addr.s_addr.to_ne_bytes()),
                u16::from_be(addr.sin_port),
            ))
        }
        libc::AF_INET6 => {
            // Safety: if the ss_family field is AF_INET6 then storage must be a sockaddr_in6.
            let addr: &libc::sockaddr_in6 =
                unsafe { &*(&name as *const _ as *const libc::sockaddr_in6) };
            SocketAddr::V6(SocketAddrV6::new(
                Ipv6Addr::from(addr.sin6_addr.s6_addr),
                u16::from_be(addr.sin6_port),
                addr.sin6_flowinfo,
                addr.sin6_scope_id,
            ))
        }
        _ => unreachable!(),
    };

    RecvMeta {
        len,
        stride,
        addr,
        ecn: EcnCodepoint::from_bits(ecn_bits),
        dst_ip,
    }
}
788
/// Number of messages handled per batch.
///
/// Builds with the `apple_slow` cfg use a batch of one. Every other build uses 32, a value
/// chosen somewhat arbitrarily that might benefit from additional tuning.
pub(crate) const BATCH_SIZE: usize = if cfg!(apple_slow) { 1 } else { 32 };
795
796#[cfg(any(target_os = "linux", target_os = "android"))]
797mod gso {
798    use super::*;
799
800    #[cfg(not(target_os = "android"))]
801    const UDP_SEGMENT: libc::c_int = libc::UDP_SEGMENT;
802    #[cfg(target_os = "android")]
803    // TODO: Add this to libc
804    const UDP_SEGMENT: libc::c_int = 103;
805
806    /// Checks whether GSO support is available by setting the UDP_SEGMENT
807    /// option on a socket
808    pub(crate) fn max_gso_segments() -> usize {
809        const GSO_SIZE: libc::c_int = 1500;
810
811        let socket = match std::net::UdpSocket::bind("[::]:0")
812            .or_else(|_| std::net::UdpSocket::bind((Ipv4Addr::LOCALHOST, 0)))
813        {
814            Ok(socket) => socket,
815            Err(_) => return 1,
816        };
817
818        // As defined in linux/udp.h
819        // #define UDP_MAX_SEGMENTS        (1 << 6UL)
820        match set_socket_option(&socket, libc::SOL_UDP, UDP_SEGMENT, GSO_SIZE) {
821            Ok(()) => 64,
822            Err(_e) => {
823                crate::log::debug!(
824                    "failed to set `UDP_SEGMENT` socket option ({_e}); setting `max_gso_segments = 1`"
825                );
826
827                1
828            }
829        }
830    }
831
832    pub(crate) fn set_segment_size(encoder: &mut cmsg::Encoder<libc::msghdr>, segment_size: u16) {
833        encoder.push(libc::SOL_UDP, UDP_SEGMENT, segment_size);
834    }
835}
836
// On Apple platforms using the `sendmsg_x` call, UDP datagram segmentation is not
// offloaded to the NIC or even the kernel, but instead done here in user space (in
// [`send`]) and then passed to the OS as individual `iovec`s (up to `BATCH_SIZE`).
#[cfg(not(any(target_os = "linux", target_os = "android")))]
mod gso {
    use super::*;

    /// Maximum GSO segment count for targets other than Linux and Android.
    ///
    /// Yields `BATCH_SIZE` when built with the `apple_fast` cfg and `1` otherwise.
    pub(super) fn max_gso_segments() -> usize {
        if cfg!(apple_fast) {
            BATCH_SIZE
        } else {
            1
        }
    }

    /// Does nothing on these targets: neither the encoder nor the segment size is used.
    ///
    /// The encoder parameter type depends on whether the `apple_fast` cfg is set.
    pub(super) fn set_segment_size(
        #[cfg(apple_fast)] _encoder: &mut cmsg::Encoder<msghdr_x>,
        #[cfg(not(apple_fast))] _encoder: &mut cmsg::Encoder<libc::msghdr>,
        _segment_size: u16,
    ) {
    }
}
862
863#[cfg(any(target_os = "linux", target_os = "android"))]
864mod gro {
865    use super::*;
866
867    #[cfg(not(target_os = "android"))]
868    pub(crate) const UDP_GRO: libc::c_int = libc::UDP_GRO;
869    #[cfg(target_os = "android")]
870    // TODO: Add this to libc
871    pub(crate) const UDP_GRO: libc::c_int = 104;
872
873    pub(crate) fn gro_segments() -> usize {
874        let socket = match std::net::UdpSocket::bind("[::]:0")
875            .or_else(|_| std::net::UdpSocket::bind((Ipv4Addr::LOCALHOST, 0)))
876        {
877            Ok(socket) => socket,
878            Err(_) => return 1,
879        };
880
881        // As defined in net/ipv4/udp_offload.c
882        // #define UDP_GRO_CNT_MAX 64
883        //
884        // NOTE: this MUST be set to UDP_GRO_CNT_MAX to ensure that the receive buffer size
885        // (get_max_udp_payload_size() * gro_segments()) is large enough to hold the largest GRO
886        // list the kernel might potentially produce. See
887        // https://github.com/quinn-rs/quinn/pull/1354.
888        match set_socket_option(&socket, libc::SOL_UDP, UDP_GRO, OPTION_ON) {
889            Ok(()) => 64,
890            Err(_) => 1,
891        }
892    }
893}
894
895/// Returns whether the given socket option is supported on the current platform
896///
897/// Yields `Ok(true)` if the option was set successfully, `Ok(false)` if setting
898/// the option raised an `ENOPROTOOPT` error, and `Err` for any other error.
899fn set_socket_option_supported(
900    socket: &impl AsRawFd,
901    level: libc::c_int,
902    name: libc::c_int,
903    value: libc::c_int,
904) -> io::Result<bool> {
905    match set_socket_option(socket, level, name, value) {
906        Ok(()) => Ok(true),
907        Err(err) if err.raw_os_error() == Some(libc::ENOPROTOOPT) => Ok(false),
908        Err(err) => Err(err),
909    }
910}
911
912fn set_socket_option(
913    socket: &impl AsRawFd,
914    level: libc::c_int,
915    name: libc::c_int,
916    value: libc::c_int,
917) -> io::Result<()> {
918    let rc = unsafe {
919        libc::setsockopt(
920            socket.as_raw_fd(),
921            level,
922            name,
923            &value as *const _ as _,
924            mem::size_of_val(&value) as _,
925        )
926    };
927
928    match rc == 0 {
929        true => Ok(()),
930        false => Err(io::Error::last_os_error()),
931    }
932}
933
/// Value handed to `set_socket_option` to switch a flag-style option on (e.g. `UDP_GRO`).
const OPTION_ON: libc::c_int = 1;
935
#[cfg(not(any(target_os = "linux", target_os = "android")))]
mod gro {
    /// Segment count reported on targets other than Linux and Android.
    const SINGLE_SEGMENT: usize = 1;

    /// Always returns 1 on these targets.
    pub(super) fn gro_segments() -> usize {
        SINGLE_SEGMENT
    }
}