The Linux IPv4 UDP kernel code

Paul Dwerryhouse, paul@dwerryhouse.com.au

3-Nov-2001

Introduction

This document describes the IPv4 UDP network code, as found in the file net/ipv4/udp.c in the Linux 2.4.13 source code tree.

This document is not yet complete.

Major functions

The major function calls in this file are:

udp_connect
udp_disconnect
udp_recvmsg
udp_v4_get_port

udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)

This function sets up the initial connection information for the UDP socket. Since UDP is a connectionless protocol, no information is exchanged with the remote computer - instead, routing, address and port information is stored in the socket structure.

Code description

This code supports UDP only in the INET (AF_INET) address family. Return an error if any other address family is given:


	if (usin->sin_family != AF_INET) 
	  	return -EAFNOSUPPORT;

Determine from the network routing table which route should be used to send packets for this particular destination host. This information will exist for the lifetime of this particular socket:

	err = ip_route_connect(&rt, usin->sin_addr.s_addr, sk->saddr,
			       RT_CONN_FLAGS(sk), sk->bound_dev_if);
	if (err)
		return err;

If the user didn't specify a source address to be used with this socket, set the socket's source address from the route cache entry. Set the destination address and destination port:

  	if(!sk->saddr)
	  	sk->saddr = rt->rt_src;		/* Update source address */
	if(!sk->rcv_saddr)
		sk->rcv_saddr = rt->rt_src;
	sk->daddr = rt->rt_dst;
	sk->dport = usin->sin_port;

Set the socket state to TCP_ESTABLISHED (obviously we're not using TCP, but since this particular value can have no other valid meaning in this section, it is used as the "connected" state indicator). Then set the packet ID to the current clock count (jiffies):

	sk->state = TCP_ESTABLISHED;
	sk->protinfo.af_inet.id = jiffies;

Finally, set the socket destination to that from the route cache entry:

	sk_dst_set(sk, &rt->u.dst);

udp_disconnect(struct sock *sk, int flags)

udp_disconnect dissociates a socket from a particular UDP "connection". It's a fairly simple function, since there's not a lot to do. First, set the state of the socket to TCP_CLOSE, set the destination address and port to zero, and then make sure there is no device bound to the socket:

	sk->state = TCP_CLOSE;
	sk->daddr = 0;
	sk->dport = 0;
	sk->bound_dev_if = 0;

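Unless the corresponding lock flag is set in the socket's userlocks field, clear the local address and port attributes: if SOCK_BINDADDR_LOCK is not set, the source addresses are zeroed; if SOCK_BINDPORT_LOCK is not set, the socket is removed from the protocol's hash table and its source port is zeroed: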

	if (!(sk->userlocks&SOCK_BINDADDR_LOCK)) {
		sk->rcv_saddr = 0;
		sk->saddr = 0;
	}
	if (!(sk->userlocks&SOCK_BINDPORT_LOCK)) {
		sk->prot->unhash(sk);
		sk->sport = 0;
	}

Finally clear the destination cache of the socket:

	sk_dst_reset(sk);

udp_recvmsg(struct sock *sk, struct msghdr *msg, int len, int noblock, int flags, int *addr_len)

First, get the message from the socket:

	skb = skb_recv_datagram(sk, flags, noblock, &err);
	if (!skb)
		goto out;

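Work out how many bytes of payload to copy (the packet length minus the UDP header). If the payload is larger than the buffer length supplied by the caller, copy only as much as fits and flag the message as truncated: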

  	copied = skb->len - sizeof(struct udphdr);
	if (copied > len) {
		copied = len;
		msg->msg_flags |= MSG_TRUNC;
	}

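Make sure the checksum is okay while copying the data to the caller. There are three cases. If the packet is marked CHECKSUM_UNNECESSARY, the data is simply copied. If the message is being truncated, the checksum is first verified over the whole packet with __udp_checksum_complete (since only part of the data will be copied), and then the data is copied. Otherwise the data is copied and checksummed in a single pass by skb_copy_and_csum_datagram_iovec, which returns -EINVAL if the checksum is bad.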

	if (skb->ip_summed==CHECKSUM_UNNECESSARY) {
		err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
					      copied);
	} else if (msg->msg_flags&MSG_TRUNC) {
		if (__udp_checksum_complete(skb))
			goto csum_copy_err;
		err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
					      copied);
	} else {
		err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov);

		if (err == -EINVAL)
			goto csum_copy_err;
	}

	if (err)
		goto out_free;

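Get the packet timestamp and store it in the message. If an address structure (sin) is present, fill it in with the source address and port taken from the packet's headers. If any control-message flags are set on the socket, pass the message to ip_cmsg_recv. On success, the return value is the number of bytes copied: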

	sock_recv_timestamp(msg, sk, skb);

	/* Copy the address. */
	if (sin)
	{
		sin->sin_family = AF_INET;
		sin->sin_port = skb->h.uh->source;
		sin->sin_addr.s_addr = skb->nh.iph->saddr;
		memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
  	}
	if (sk->protinfo.af_inet.cmsg_flags)
		ip_cmsg_recv(msg, skb);
	err = copied;
  
out_free:
  	skb_free_datagram(sk, skb);
out:
  	return err;

If there was a checksum error in the packet, update the error stats and remove the datagram from the queue.

csum_copy_err:
	UDP_INC_STATS_BH(UdpInErrors);

	/* Clear queue. */
	if (flags&MSG_PEEK) {
		int clear = 0;
		spin_lock_irq(&sk->receive_queue.lock);
		if (skb == skb_peek(&sk->receive_queue)) {
			__skb_unlink(skb, &sk->receive_queue);
			clear = 1;
		}
		spin_unlock_irq(&sk->receive_queue.lock);
		if (clear)
			kfree_skb(skb);
	}

	skb_free_datagram(sk, skb);

	return -EAGAIN;

udp_v4_get_port(struct sock *sk, unsigned short snum)

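This function gets a source port for the socket. If a port number of zero is given as an argument, it will pick an available port from the local port range held in sysctl_local_port_range (1024-4999 by default); otherwise it will try to get the port asked for by the user space program. It returns 0 on success and 1 if no suitable port could be obtained.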

	write_lock_bh(&udp_hash_lock);
	if (snum == 0) {
		int best_size_so_far, best, result, i;

		if (udp_port_rover > sysctl_local_port_range[1] ||
		    udp_port_rover < sysctl_local_port_range[0])
			udp_port_rover = sysctl_local_port_range[0];
		best_size_so_far = 32767;
		best = result = udp_port_rover;
		for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) {
			struct sock *sk;
			int size;

			sk = udp_hash[result & (UDP_HTABLE_SIZE - 1)];
			if (!sk) {
				if (result > sysctl_local_port_range[1])
					result = sysctl_local_port_range[0] +
						((result - sysctl_local_port_range[0]) &
						 (UDP_HTABLE_SIZE - 1));
				goto gotit;
			}
			size = 0;
			do {
				if (++size >= best_size_so_far)
					goto next;
			} while ((sk = sk->next) != NULL);
			best_size_so_far = size;
			best = result;
		next:;
		}
		result = best;
		for(i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++, result += UDP_HTABLE_SIZE) {
			if (result > sysctl_local_port_range[1])
				result = sysctl_local_port_range[0]
					+ ((result - sysctl_local_port_range[0]) &
					   (UDP_HTABLE_SIZE - 1));
			if (!udp_lport_inuse(result))
				break;
		}
		if (i >= (1 << 16) / UDP_HTABLE_SIZE)
			goto fail;
gotit:
		udp_port_rover = snum = result;
	} else {
		struct sock *sk2;

		for (sk2 = udp_hash[snum & (UDP_HTABLE_SIZE - 1)];
		     sk2 != NULL;
		     sk2 = sk2->next) {
			if (sk2->num == snum &&
			    sk2 != sk &&
			    sk2->bound_dev_if == sk->bound_dev_if &&
			    (!sk2->rcv_saddr ||
			     !sk->rcv_saddr ||
			     sk2->rcv_saddr == sk->rcv_saddr) &&
			    (!sk2->reuse || !sk->reuse))
				goto fail;
		}
	}
	sk->num = snum;
	if (sk->pprev == NULL) {
		struct sock **skp = &udp_hash[snum & (UDP_HTABLE_SIZE - 1)];
		if ((sk->next = *skp) != NULL)
			(*skp)->pprev = &sk->next;
		*skp = sk;
		sk->pprev = skp;
		sock_prot_inc_use(sk->prot);
		sock_hold(sk);
	}
	write_unlock_bh(&udp_hash_lock);
	return 0;

fail:
	write_unlock_bh(&udp_hash_lock);
	return 1;