1/*
2 * Copyright 2006-2010, Haiku, Inc. All Rights Reserved.
3 * Distributed under the terms of the MIT License.
4 *
5 * Authors:
6 *		Andrew Galante, haiku.galante@gmail.com
7 *		Axel Dörfler, axeld@pinc-software.de
8 *		Hugo Santos, hugosantos@gmail.com
9 */
10
11
12#include "TCPEndpoint.h"
13
14#include <netinet/in.h>
15#include <netinet/ip.h>
16#include <netinet/tcp.h>
17#include <new>
18#include <signal.h>
19#include <stdlib.h>
20#include <string.h>
21
22#include <KernelExport.h>
23#include <Select.h>
24
25#include <net_buffer.h>
26#include <net_datalink.h>
27#include <net_stat.h>
28#include <NetBufferUtilities.h>
29#include <NetUtilities.h>
30
31#include <lock.h>
32#include <tracing.h>
33#include <util/AutoLock.h>
34#include <util/khash.h>
35#include <util/list.h>
36
37#include "EndpointManager.h"
38
39
40// References:
41//  - RFC 793 - Transmission Control Protocol
42//  - RFC 813 - Window and Acknowledgement Strategy in TCP
43//	- RFC 1337 - TIME_WAIT Assassination Hazards in TCP
44//
45// Things this implementation currently doesn't implement:
46//	- TCP Slow Start, Congestion Avoidance, Fast Retransmit, and Fast Recovery,
47//	  RFC 2001, RFC 2581, RFC 3042
48//	- NewReno Modification to TCP's Fast Recovery, RFC 2582
49//	- Explicit Congestion Notification (ECN), RFC 3168
50//	- SYN-Cache
51//	- TCP Extensions for High Performance, RFC 1323
52//	- SACK, Selective Acknowledgment - RFC 2018, RFC 2883, RFC 3517
53//	- Forward RTO-Recovery, RFC 4138
54//	- Time-Wait hash instead of keeping sockets alive
55
// Builds an AddressString for \a address using Domain() and yields its
// Data(); used for the address arguments of the TRACE()/PROBE() output.
// Since it expands to a call to Domain(), it must be used where Domain()
// is in scope.
#define PrintAddress(address) \
	AddressString(Domain(), address, true).Data()
58
// Uncomment to enable the dprintf() based TRACE() and PROBE() output
// respectively.
//#define TRACE_TCP
//#define PROBE_TCP

// With TRACE_TCP defined, TRACE() prints the calling thread, system_time(),
// the \c this pointer and the name of \c fState in front of the formatted
// message; it therefore has to be used where \c this and \c fState exist.
// Without TRACE_TCP it expands to an empty statement.
#ifdef TRACE_TCP
// the space before ', ##args' is important in order for this to work with cpp 2.95
#	define TRACE(format, args...)	dprintf("%ld: TCP [%llu] %p (%12s) " \
		format "\n", find_thread(NULL), system_time(), this, \
		name_for_state(fState) , ##args)
#else
#	define TRACE(args...)			do { } while (0)
#endif
70
// With PROBE_TCP defined, PROBE() prints one line per call containing the
// buffer's source/destination addresses and size together with the
// endpoint's send state (send next/unacknowledged, congestion window, slow
// start threshold, windows, send queue usage and retransmit timeout).
// Without PROBE_TCP it expands to an empty statement.
#ifdef PROBE_TCP
#	define PROBE(buffer, window) \
	dprintf("TCP PROBE %llu %s %s %ld snxt %lu suna %lu cw %lu sst %lu win %lu swin %lu smax-suna %lu savail %lu sqused %lu rto %llu\n", \
		system_time(), PrintAddress(buffer->source), \
		PrintAddress(buffer->destination), buffer->size, fSendNext.Number(), \
		fSendUnacknowledged.Number(), fCongestionWindow, fSlowStartThreshold, \
		window, fSendWindow, (fSendMax - fSendUnacknowledged).Number(), \
		fSendQueue.Available(fSendNext), fSendQueue.Used(), fRetransmitTimeout)
#else
#	define PROBE(buffer, window)	do { } while (0)
#endif
82
83#if TCP_TRACING
84namespace TCPTracing {
85
86class Receive : public AbstractTraceEntry {
87public:
88	Receive(TCPEndpoint* endpoint, tcp_segment_header& segment, uint32 window,
89			net_buffer* buffer)
90		:
91		fEndpoint(endpoint),
92		fBuffer(buffer),
93		fBufferSize(buffer->size),
94		fSequence(segment.sequence),
95		fAcknowledge(segment.acknowledge),
96		fWindow(window),
97		fState(endpoint->State()),
98		fFlags(segment.flags)
99	{
100		Initialized();
101	}
102
103	virtual void AddDump(TraceOutput& out)
104	{
105		out.Print("tcp:%p (%12s) receive buffer %p (%lu bytes), flags %x, "
106			"seq %lu, ack %lu, wnd %lu", fEndpoint, name_for_state(fState),
107			fBuffer, fBufferSize, fFlags, fSequence, fAcknowledge, fWindow);
108	}
109
110protected:
111	TCPEndpoint*	fEndpoint;
112	net_buffer*		fBuffer;
113	uint32			fBufferSize;
114	uint32			fSequence;
115	uint32			fAcknowledge;
116	uint32			fWindow;
117	tcp_state		fState;
118	uint8			fFlags;
119};
120
121class Send : public AbstractTraceEntry {
122public:
123	Send(TCPEndpoint* endpoint, tcp_segment_header& segment, net_buffer* buffer,
124			tcp_sequence firstSequence, tcp_sequence lastSequence)
125		:
126		fEndpoint(endpoint),
127		fBuffer(buffer),
128		fBufferSize(buffer->size),
129		fSequence(segment.sequence),
130		fAcknowledge(segment.acknowledge),
131		fFirstSequence(firstSequence.Number()),
132		fLastSequence(lastSequence.Number()),
133		fState(endpoint->State()),
134		fFlags(segment.flags)
135	{
136		Initialized();
137	}
138
139	virtual void AddDump(TraceOutput& out)
140	{
141		out.Print("tcp:%p (%12s) send buffer %p (%lu bytes), flags %x, "
142			"seq %lu, ack %lu, first %lu, last %lu",
143			fEndpoint, name_for_state(fState), fBuffer, fBufferSize, fFlags,
144			fSequence, fAcknowledge, fFirstSequence, fLastSequence);
145	}
146
147protected:
148	TCPEndpoint*	fEndpoint;
149	net_buffer*		fBuffer;
150	uint32			fBufferSize;
151	uint32			fSequence;
152	uint32			fAcknowledge;
153	uint32			fFirstSequence;
154	uint32			fLastSequence;
155	tcp_state		fState;
156	uint8			fFlags;
157};
158
159class State : public AbstractTraceEntry {
160public:
161	State(TCPEndpoint* endpoint)
162		:
163		fEndpoint(endpoint),
164		fState(endpoint->State())
165	{
166		Initialized();
167	}
168
169	virtual void AddDump(TraceOutput& out)
170	{
171		out.Print("tcp:%p (%12s) state change", fEndpoint,
172			name_for_state(fState));
173	}
174
175protected:
176	TCPEndpoint*	fEndpoint;
177	tcp_state		fState;
178};
179
180class Spawn : public AbstractTraceEntry {
181public:
182	Spawn(TCPEndpoint* listeningEndpoint, TCPEndpoint* spawnedEndpoint)
183		:
184		fListeningEndpoint(listeningEndpoint),
185		fSpawnedEndpoint(spawnedEndpoint)
186	{
187		Initialized();
188	}
189
190	virtual void AddDump(TraceOutput& out)
191	{
192		out.Print("tcp:%p spawns %p", fListeningEndpoint, fSpawnedEndpoint);
193	}
194
195protected:
196	TCPEndpoint*	fListeningEndpoint;
197	TCPEndpoint*	fSpawnedEndpoint;
198};
199
200class Error : public AbstractTraceEntry {
201public:
202	Error(TCPEndpoint* endpoint, const char* error, int32 line)
203		:
204		fEndpoint(endpoint),
205		fLine(line),
206		fError(error),
207		fState(endpoint->State())
208	{
209		Initialized();
210	}
211
212	virtual void AddDump(TraceOutput& out)
213	{
214		out.Print("tcp:%p (%12s) error at line %ld: %s", fEndpoint,
215			name_for_state(fState), fLine, fError);
216	}
217
218protected:
219	TCPEndpoint*	fEndpoint;
220	int32			fLine;
221	const char*		fError;
222	tcp_state		fState;
223};
224
225class Timer : public AbstractTraceEntry {
226public:
227	Timer(TCPEndpoint* endpoint, const char* which)
228		:
229		fEndpoint(endpoint),
230		fWhich(which),
231		fState(endpoint->State())
232	{
233		Initialized();
234	}
235
236	virtual void AddDump(TraceOutput& out)
237	{
238		out.Print("tcp:%p (%12s) %s timer", fEndpoint,
239			name_for_state(fState), fWhich);
240	}
241
242protected:
243	TCPEndpoint*	fEndpoint;
244	const char*		fWhich;
245	tcp_state		fState;
246};
247
248}	// namespace TCPTracing
249
// T(x) creates a TCPTracing::x trace entry via nothrow new when TCP_TRACING
// is enabled, and expands to nothing otherwise.
#	define T(x)	new(std::nothrow) TCPTracing::x
#else
#	define T(x)
#endif	// TCP_TRACING
254
// Initial estimate for packet round trip time (RTT)
// It is used unscaled for fRetransmitTimeout and divided by kTimestampFactor
// for fRoundTripTime/fRoundTripDeviation in the constructor.
// NOTE(review): presumably in microseconds — confirm.
#define TCP_INITIAL_RTT		2000000

// constants for the fFlags field
enum {
	// Both option flags are set by default in the constructor.
	FLAG_OPTION_WINDOW_SCALE	= 0x01,
	FLAG_OPTION_TIMESTAMP		= 0x02,
	// TODO: Should FLAG_NO_RECEIVE apply as well to received connections?
	//       That is, what is expected from accept() after a shutdown()
	//       is performed on a listen()ing socket.
	// Set by Shutdown() for SHUT_RD and SHUT_RDWR.
	FLAG_NO_RECEIVE				= 0x04,
	// Set by Free().
	FLAG_CLOSED					= 0x08,
	// Set by _Close(), and by _EnterTimeWait() for local connections.
	FLAG_DELETE_ON_CLOSE		= 0x10,
	// Reported by IsLocal().
	FLAG_LOCAL					= 0x20
};


// Divisor applied to system_time() by tcp_now() to obtain TCP timestamp
// values.
static const int kTimestampFactor = 1024;
273
274
275static inline bigtime_t
276absolute_timeout(bigtime_t timeout)
277{
278	if (timeout == 0 || timeout == B_INFINITE_TIMEOUT)
279		return timeout;
280
281	return timeout + system_time();
282}
283
284
285static inline status_t
286posix_error(status_t error)
287{
288	if (error == B_TIMED_OUT)
289		return B_WOULD_BLOCK;
290
291	return error;
292}
293
294
295static inline bool
296in_window(const tcp_sequence& sequence, const tcp_sequence& receiveNext,
297	uint32 receiveWindow)
298{
299	return sequence >= receiveNext && sequence < (receiveNext + receiveWindow);
300}
301
302
303static inline bool
304segment_in_sequence(const tcp_segment_header& segment, int size,
305	const tcp_sequence& receiveNext, uint32 receiveWindow)
306{
307	tcp_sequence sequence(segment.sequence);
308	if (size == 0) {
309		if (receiveWindow == 0)
310			return sequence == receiveNext;
311		return in_window(sequence, receiveNext, receiveWindow);
312	} else {
313		if (receiveWindow == 0)
314			return false;
315		return in_window(sequence, receiveNext, receiveWindow)
316			|| in_window(sequence + size - 1, receiveNext, receiveWindow);
317	}
318}
319
320
321static inline bool
322is_writable(tcp_state state)
323{
324	return state == SYNCHRONIZE_SENT || state == SYNCHRONIZE_RECEIVED
325		|| state == ESTABLISHED || state == FINISH_RECEIVED;
326}
327
328
329static inline uint32 tcp_now()
330{
331	return system_time() / kTimestampFactor;
332}
333
334
335static inline uint32 tcp_diff_timestamp(uint32 base)
336{
337	uint32 now = tcp_now();
338
339	if (now > base)
340		return now - base;
341
342	return now + UINT_MAX - base;
343}
344
345
346static inline bool
347state_needs_finish(int32 state)
348{
349	return state == WAIT_FOR_FINISH_ACKNOWLEDGE
350		|| state == FINISH_SENT || state == CLOSING;
351}
352
353
354//	#pragma mark -
355
356
357WaitList::WaitList(const char* name)
358{
359	fCondition = 0;
360	fSem = create_sem(0, name);
361}
362
363
/*!	Deletes the semaphore created by the constructor. */
WaitList::~WaitList()
{
	delete_sem(fSem);
}
368
369
/*!	Returns the value create_sem() returned in the constructor.
	Callers in this file treat values < B_OK as failure; note that the
	success value is the semaphore ID itself, not necessarily B_OK.
*/
status_t
WaitList::InitCheck() const
{
	return fSem;
}
375
376
/*!	Releases \a locker, waits until the condition flag has been raised by
	Signal() or acquiring the semaphore fails, and re-acquires \a locker
	before returning.

	\param locker The lock held by the caller; it is unlocked while waiting.
	\param timeout Absolute timeout (passed with B_ABSOLUTE_TIMEOUT).
	\return B_OK if the condition was consumed, otherwise the error returned
		by acquire_sem_etc() (the wait is interruptible).
*/
status_t
WaitList::Wait(MutexLocker& locker, bigtime_t timeout)
{
	locker.Unlock();

	status_t status = B_OK;

	// NOTE(review): presumably atomic_test_and_set() resets fCondition to 0
	// if it was 1 and returns the previous value, so that the loop ends as
	// soon as a pending signal has been consumed — confirm.
	while (!atomic_test_and_set(&fCondition, 0, 1)) {
		status = acquire_sem_etc(fSem, 1, B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT,
			timeout);
		if (status != B_OK)
			break;
	}

	locker.Lock();
	return status;
}
394
395
/*!	Raises the condition flag and wakes up the threads blocked in Wait(),
	without rescheduling.
*/
void
WaitList::Signal()
{
	atomic_or(&fCondition, 1);
#ifdef __HAIKU__
	release_sem_etc(fSem, 1, B_DO_NOT_RESCHEDULE | B_RELEASE_ALL);
#else
	// Without B_RELEASE_ALL: release once per waiter, as indicated by a
	// negative semaphore count.
	int32 count;
	if (get_sem_count(fSem, &count) == B_OK && count < 0)
		release_sem_etc(fSem, -count, B_DO_NOT_RESCHEDULE);
#endif
}
408
409
410//	#pragma mark -
411
412
413TCPEndpoint::TCPEndpoint(net_socket* socket)
414	:
415	ProtocolSocket(socket),
416	fManager(NULL),
417	fReceiveList("tcp receive"),
418	fSendList("tcp send"),
419	fOptions(0),
420	fSendWindowShift(0),
421	fReceiveWindowShift(0),
422	fSendUnacknowledged(0),
423	fSendNext(0),
424	fSendMax(0),
425	fSendUrgentOffset(0),
426	fSendWindow(0),
427	fSendMaxWindow(0),
428	fSendMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
429	fSendQueue(socket->send.buffer_size),
430	fInitialSendSequence(0),
431	fDuplicateAcknowledgeCount(0),
432	fRoute(NULL),
433	fReceiveNext(0),
434	fReceiveMaxAdvertised(0),
435	fReceiveWindow(socket->receive.buffer_size),
436	fReceiveMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
437	fReceiveQueue(socket->receive.buffer_size),
438	fRoundTripTime(TCP_INITIAL_RTT / kTimestampFactor),
439	fRoundTripDeviation(TCP_INITIAL_RTT / kTimestampFactor),
440	fRetransmitTimeout(TCP_INITIAL_RTT),
441	fReceivedTimestamp(0),
442	fCongestionWindow(0),
443	fSlowStartThreshold(0),
444	fState(CLOSED),
445	fFlags(FLAG_OPTION_WINDOW_SCALE | FLAG_OPTION_TIMESTAMP)
446{
447	// TODO: to be replaced with a real read/write locking strategy!
448	mutex_init(&fLock, "tcp lock");
449
450	gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer, this);
451	gStackModule->init_timer(&fRetransmitTimer, TCPEndpoint::_RetransmitTimer,
452		this);
453	gStackModule->init_timer(&fDelayedAcknowledgeTimer,
454		TCPEndpoint::_DelayedAcknowledgeTimer, this);
455	gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer,
456		this);
457}
458
459
/*!	Cancels all timers, unbinds the endpoint from its manager (if any) and
	releases the manager reference, destroys the lock, waits for the timer
	hooks to return, and finally puts the route.
*/
TCPEndpoint::~TCPEndpoint()
{
	// the lock is acquired here and never released, but destroyed below
	mutex_lock(&fLock);

	_CancelConnectionTimers();
	gStackModule->cancel_timer(&fTimeWaitTimer);

	if (fManager != NULL) {
		fManager->Unbind(this);
		put_endpoint_manager(fManager);
	}

	mutex_destroy(&fLock);

	// we need to wait for all timers to return
	gStackModule->wait_for_timer(&fRetransmitTimer);
	gStackModule->wait_for_timer(&fPersistTimer);
	gStackModule->wait_for_timer(&fDelayedAcknowledgeTimer);
	gStackModule->wait_for_timer(&fTimeWaitTimer);

	gDatalinkModule->put_route(Domain(), fRoute);
}
482
483
484status_t
485TCPEndpoint::InitCheck() const
486{
487	if (fReceiveList.InitCheck() < B_OK)
488		return fReceiveList.InitCheck();
489
490	if (fSendList.InitCheck() < B_OK)
491		return fSendList.InitCheck();
492
493	return B_OK;
494}
495
496
497//	#pragma mark - protocol API
498
499
500status_t
501TCPEndpoint::Open()
502{
503	TRACE("Open()");
504
505	status_t status = ProtocolSocket::Open();
506	if (status < B_OK)
507		return status;
508
509	fManager = get_endpoint_manager(Domain());
510	if (fManager == NULL)
511		return EAFNOSUPPORT;
512
513	return B_OK;
514}
515
516
/*!	Closes the endpoint.

	A listening endpoint gets its accept semaphore deleted; endpoints in
	LISTEN or SYNCHRONIZE_SENT state go directly to CLOSED. In all other
	states _Disconnect() is called, and if SO_LINGER is set, the call waits
	up to the socket's linger time (in seconds) for the send queue to drain.

	\return B_OK on success (also when the linger wait timed out), or the
		error returned by _Disconnect() or by the wait.
*/
status_t
TCPEndpoint::Close()
{
	TRACE("Close()");

	MutexLocker locker(fLock);

	if (fState == LISTEN)
		delete_sem(fAcceptSemaphore);

	if (fState == SYNCHRONIZE_SENT || fState == LISTEN) {
		// TODO: what about linger in case of SYNCHRONIZE_SENT?
		fState = CLOSED;
		T(State(this));
		return B_OK;
	}

	status_t status = _Disconnect(true);
	if (status != B_OK)
		return status;

	if (socket->options & SO_LINGER) {
		TRACE("Close(): Lingering for %i secs", socket->linger);

		// the linger time is converted from seconds to an absolute timeout
		bigtime_t maximum = absolute_timeout(socket->linger * 1000000LL);

		while (fSendQueue.Used() > 0) {
			status = fSendList.Wait(locker, maximum);
			// running out of time is not an error for the caller
			if (status == B_TIMED_OUT || status == B_WOULD_BLOCK)
				break;
			else if (status < B_OK)
				return status;
		}

		TRACE("Close(): after waiting, the SendQ was left with %lu bytes.",
			fSendQueue.Used());
	}
	return B_OK;
}
556
557
558void
559TCPEndpoint::Free()
560{
561	TRACE("Free()");
562
563	MutexLocker _(fLock);
564
565	if (fState <= SYNCHRONIZE_SENT)
566		return;
567
568	// we are only interested in the timer, not in changing state
569	_EnterTimeWait();
570
571	fFlags |= FLAG_CLOSED;
572	if ((fFlags & FLAG_DELETE_ON_CLOSE) == 0) {
573		// we'll be freed later when the 2MSL timer expires
574		gSocketModule->acquire_socket(socket);
575	}
576}
577
578
/*!	Creates and sends a synchronize packet to \a address, and then waits
	until the connection has been established or refused.

	An empty destination address is replaced by the loopback address
	(keeping the port). If the socket's send timeout is 0 (non-blocking),
	the function returns EINPROGRESS right after sending the SYN. A
	restarted syscall only resumes waiting with the previously stored
	timeout.

	\return B_OK if the connection is established, EAFNOSUPPORT if
		\a address is of a different family, EISCONN if already established,
		EINPROGRESS if the endpoint is in any other state than CLOSED or
		LISTEN (or for non-blocking sockets, see above), or the error of
		preparing the send path, sending, or waiting (with B_TIMED_OUT
		mapped by posix_error()).
*/
status_t
TCPEndpoint::Connect(const sockaddr* address)
{
	TRACE("Connect() on address %s", PrintAddress(address));

	if (!AddressModule()->is_same_family(address))
		return EAFNOSUPPORT;

	MutexLocker locker(fLock);

	if (gStackModule->is_restarted_syscall()) {
		// the SYN has already been sent by the interrupted call, just
		// continue waiting with the stored (absolute) timeout
		bigtime_t timeout = gStackModule->restore_syscall_restart_timeout();
		status_t status = _WaitForEstablished(locker, timeout);
		TRACE("  Connect(): Connection complete: %s (timeout was %llu)",
			strerror(status), timeout);
		return posix_error(status);
	}

	// Can only call connect() from CLOSED or LISTEN states
	// otherwise endpoint is considered already connected
	if (fState == LISTEN) {
		// this socket is about to connect; remove pending connections in the backlog
		gSocketModule->set_max_backlog(socket, 0);
	} else if (fState == ESTABLISHED) {
		return EISCONN;
	} else if (fState != CLOSED)
		return EINPROGRESS;

	// consider destination address INADDR_ANY as INADDR_LOOPBACK
	sockaddr_storage _address;
	if (AddressModule()->is_empty_address(address, false)) {
		AddressModule()->get_loopback_address((sockaddr *)&_address);
		// for IPv4 and IPv6 the port is at the same offset
		((sockaddr_in &)_address).sin_port = ((sockaddr_in *)address)->sin_port;
		address = (sockaddr *)&_address;
	}

	status_t status = _PrepareSendPath(address);
	if (status < B_OK)
		return status;

	TRACE("  Connect(): starting 3-way handshake...");

	fState = SYNCHRONIZE_SENT;
	T(State(this));

	// send SYN
	status = _SendQueued();
	if (status != B_OK) {
		_Close();
		return status;
	}

	// If we are running over Loopback, after _SendQueued() returns we
	// may be in ESTABLISHED already.
	if (fState == ESTABLISHED) {
		TRACE("  Connect() completed after _SendQueued()");
		return B_OK;
	}

	// wait until 3-way handshake is complete (if needed)
	bigtime_t timeout = min_c(socket->send.timeout, TCP_CONNECTION_TIMEOUT);
	if (timeout == 0) {
		// we're a non-blocking socket
		TRACE("  Connect() delayed, return EINPROGRESS");
		return EINPROGRESS;
	}

	// remember the absolute timeout in case the syscall gets restarted
	bigtime_t absoluteTimeout = absolute_timeout(timeout);
	gStackModule->store_syscall_restart_timeout(absoluteTimeout);

	status = _WaitForEstablished(locker, absoluteTimeout);
	TRACE("  Connect(): Connection complete: %s (timeout was %llu)",
		strerror(status), timeout);
	return posix_error(status);
}
658
659
/*!	Waits on the accept semaphore until a connected socket can be dequeued,
	and returns it in \a _acceptedSocket.

	The wait uses the socket's receive timeout (converted to an absolute
	timeout, which is also stored for/restored from syscall restarts) and is
	interruptible.

	\return B_OK on success, B_WOULD_BLOCK if the wait timed out and the
		receive timeout is 0, or the error returned by acquire_sem_etc().
*/
status_t
TCPEndpoint::Accept(struct net_socket** _acceptedSocket)
{
	TRACE("Accept()");

	MutexLocker locker(fLock);

	status_t status;
	bigtime_t timeout = absolute_timeout(socket->receive.timeout);
	if (gStackModule->is_restarted_syscall())
		timeout = gStackModule->restore_syscall_restart_timeout();
	else
		gStackModule->store_syscall_restart_timeout(timeout);

	do {
		// the lock is not held while blocking on the semaphore
		locker.Unlock();

		status = acquire_sem_etc(fAcceptSemaphore, 1, B_ABSOLUTE_TIMEOUT
			| B_CAN_INTERRUPT, timeout);
		if (status != B_OK) {
			if (status == B_TIMED_OUT && socket->receive.timeout == 0)
				return B_WOULD_BLOCK;

			return status;
		}

		locker.Lock();
		status = gSocketModule->dequeue_connected(socket, _acceptedSocket);
#ifdef TRACE_TCP
		if (status == B_OK)
			TRACE("  Accept() returning %p", (*_acceptedSocket)->first_protocol);
#endif
		// if nothing could be dequeued, wait for the next release
	} while (status != B_OK);

	return status;
}
696
697
698status_t
699TCPEndpoint::Bind(const sockaddr *address)
700{
701	if (address == NULL)
702		return B_BAD_VALUE;
703
704	MutexLocker lock(fLock);
705
706	TRACE("Bind() on address %s", PrintAddress(address));
707
708	if (fState != CLOSED)
709		return EISCONN;
710
711	return fManager->Bind(this, address);
712}
713
714
/*!	Unbinds the endpoint via the endpoint manager while holding the lock.
	The \a address argument is not used.
*/
status_t
TCPEndpoint::Unbind(struct sockaddr *address)
{
	TRACE("Unbind()");

	MutexLocker _(fLock);
	return fManager->Unbind(this);
}
723
724
725status_t
726TCPEndpoint::Listen(int count)
727{
728	TRACE("Listen()");
729
730	MutexLocker _(fLock);
731
732	if (fState != CLOSED)
733		return B_BAD_VALUE;
734
735	fAcceptSemaphore = create_sem(0, "tcp accept");
736	if (fAcceptSemaphore < B_OK)
737		return ENOBUFS;
738
739	status_t status = fManager->SetPassive(this);
740	if (status != B_OK) {
741		delete_sem(fAcceptSemaphore);
742		fAcceptSemaphore = -1;
743		return status;
744	}
745
746	gSocketModule->set_max_backlog(socket, count);
747
748	fState = LISTEN;
749	T(State(this));
750	return B_OK;
751}
752
753
754status_t
755TCPEndpoint::Shutdown(int direction)
756{
757	TRACE("Shutdown(%i)", direction);
758
759	MutexLocker lock(fLock);
760
761	if (direction == SHUT_RD || direction == SHUT_RDWR)
762		fFlags |= FLAG_NO_RECEIVE;
763
764	if (direction == SHUT_WR || direction == SHUT_RDWR) {
765		// TODO: That's not correct. After read/write shutting down the socket
766		// one should still be able to read previously arrived data.
767		_Disconnect(false);
768	}
769
770	return B_OK;
771}
772
773
/*!	Puts data contained in \a buffer into send buffer

	Blocks (up to the socket's send timeout) until the send queue has at
	least the low water mark of free space, and adds the data to the queue,
	splitting \a buffer if it does not fit as a whole. Afterwards, MSG_OOB
	sets the urgent offset and forces sending, MSG_EOF disconnects, and
	queued data is sent if the state is ESTABLISHED or FINISH_RECEIVED.

	\return B_OK on success, ENOTCONN if CLOSED, EDESTADDRREQ if listening,
		EPIPE if the state is not writable (SIGPIPE is sent to the calling
		thread when invoked from a syscall), ENOBUFS if the buffer could not
		be cloned, or the error of waiting/trimming.
*/
status_t
TCPEndpoint::SendData(net_buffer *buffer)
{
	MutexLocker lock(fLock);

	TRACE("SendData(buffer %p, size %lu, flags %lx) [total %lu bytes, has %lu]",
		  buffer, buffer->size, buffer->flags, fSendQueue.Size(),
		  fSendQueue.Free());

	if (fState == CLOSED)
		return ENOTCONN;
	if (fState == LISTEN)
		return EDESTADDRREQ;
	if (!is_writable(fState)) {
		// we only send signals when called from userland
		if (gStackModule->is_syscall())
			send_signal(find_thread(NULL), SIGPIPE);
		return EPIPE;
	}

	// remember the flags, as the buffer is handed over to the send queue
	uint32 flags = buffer->flags;
	size_t left = buffer->size;

	bigtime_t timeout = absolute_timeout(socket->send.timeout);
	if (gStackModule->is_restarted_syscall())
		timeout = gStackModule->restore_syscall_restart_timeout();
	else
		gStackModule->store_syscall_restart_timeout(timeout);

	while (left > 0) {
		while (fSendQueue.Free() < socket->send.low_water_mark) {
			// wait until enough space is available
			status_t status = fSendList.Wait(lock, timeout);
			if (status < B_OK) {
				TRACE("  SendData() returning %s (%d)",
					strerror(posix_error(status)), (int)posix_error(status));
				return posix_error(status);
			}

			// the state may have changed while the lock was released
			if (!is_writable(fState)) {
				// we only send signals when called from userland
				if (gStackModule->is_syscall())
					send_signal(find_thread(NULL), SIGPIPE);
				return EPIPE;
			}
		}

		size_t size = fSendQueue.Free();
		if (size < left) {
			// we need to split the original buffer
			net_buffer* clone = gBufferModule->clone(buffer, false);
				// TODO: add offset/size parameter to net_buffer::clone() or
				// even a move_data() function, as this is a bit inefficient
			if (clone == NULL)
				return ENOBUFS;

			status_t status = gBufferModule->trim(clone, size);
			if (status != B_OK) {
				gBufferModule->free(clone);
				return status;
			}

			// queue the first \c size bytes via the clone, and drop them
			// from the original buffer
			gBufferModule->remove_header(buffer, size);
			left -= size;
			fSendQueue.Add(clone);
		} else {
			// the remainder fits completely
			left -= buffer->size;
			fSendQueue.Add(buffer);
		}
	}

	TRACE("  SendData(): %lu bytes used.", fSendQueue.Used());

	bool force = false;
	if ((flags & MSG_OOB) != 0) {
		fSendUrgentOffset = fSendQueue.LastSequence();
			// RFC 961 specifies that the urgent offset points to the last
			// byte of urgent data. However, this is commonly implemented as
			// here, ie. it points to the first byte after the urgent data.
		force = true;
	}
	if ((flags & MSG_EOF) != 0)
		_Disconnect(false);

	if (fState == ESTABLISHED || fState == FINISH_RECEIVED)
		_SendQueued(force);

	return B_OK;
}
864
865
866ssize_t
867TCPEndpoint::SendAvailable()
868{
869	MutexLocker locker(fLock);
870
871	ssize_t available;
872
873	if (is_writable(fState))
874		available = fSendQueue.Free();
875	else
876		available = EPIPE;
877
878	TRACE("SendAvailable(): %li", available);
879	return available;
880}
881
882
883status_t
884TCPEndpoint::FillStat(net_stat *stat)
885{
886	MutexLocker _(fLock);
887
888	strlcpy(stat->state, name_for_state(fState), sizeof(stat->state));
889	stat->receive_queue_size = fReceiveQueue.Available();
890	stat->send_queue_size = fSendQueue.Used();
891
892	return B_OK;
893}
894
895
/*!	Retrieves up to \a numBytes from the receive queue into \a _buffer.

	If the connection is still being set up, the call first waits for it to
	be established (unless MSG_DONTWAIT is set). It then waits until the
	receive low water mark (or \a numBytes with MSG_WAITALL) is available,
	or all available data has been pushed. With MSG_PEEK the data is left
	in the queue.

	\return The value returned by the receive queue's Get() on success,
		ENOTCONN if CLOSED, B_WOULD_BLOCK if the call would have to wait but
		may not, B_OK without data if the connection is closing or receiving
		has been shut down, or the (posix_error() mapped) error of waiting.
		\a _buffer is set to NULL if no data is returned.
*/
status_t
TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer)
{
	TRACE("ReadData(%lu bytes, flags 0x%x)", numBytes, (unsigned int)flags);

	MutexLocker locker(fLock);

	*_buffer = NULL;

	if (fState == CLOSED)
		return ENOTCONN;

	bigtime_t timeout = absolute_timeout(socket->receive.timeout);
	if (gStackModule->is_restarted_syscall())
		timeout = gStackModule->restore_syscall_restart_timeout();
	else
		gStackModule->store_syscall_restart_timeout(timeout);

	if (fState == SYNCHRONIZE_SENT || fState == SYNCHRONIZE_RECEIVED) {
		if (flags & MSG_DONTWAIT)
			return B_WOULD_BLOCK;

		status_t status = _WaitForEstablished(locker, timeout);
		if (status < B_OK)
			return posix_error(status);
	}

	size_t dataNeeded = socket->receive.low_water_mark;

	// When MSG_WAITALL is set then the function should block
	// until the full amount of data can be returned.
	if (flags & MSG_WAITALL)
		dataNeeded = numBytes;

	// TODO: add support for urgent data (MSG_OOB)

	while (true) {
		if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE
			|| fState == TIME_WAIT) {
			// ``Connection closing''.
			return B_OK;
		}

		if (fReceiveQueue.Available() > 0) {
			// enough data, or everything that is available has been pushed
			if (fReceiveQueue.Available() >= dataNeeded
				|| (fReceiveQueue.PushedData() > 0
					&& fReceiveQueue.PushedData() >= fReceiveQueue.Available()))
				break;
		} else if (fState == FINISH_RECEIVED) {
			// ``If no text is awaiting delivery, the RECEIVE will
			//   get a Connection closing''.
			return B_OK;
		}

		if ((flags & MSG_DONTWAIT) != 0 || socket->receive.timeout == 0)
			return B_WOULD_BLOCK;

		if ((fFlags & FLAG_NO_RECEIVE) != 0)
			return B_OK;

		status_t status = fReceiveList.Wait(locker, timeout);
		if (status < B_OK) {
			// The Open Group base specification mentions that EINTR should be
			// returned if the recv() is interrupted before _any data_ is
			// available. So we actually check if there is data, and if so,
			// push it to the user.
			if ((status == B_TIMED_OUT || status == B_INTERRUPTED)
				&& fReceiveQueue.Available() > 0)
				break;

			return posix_error(status);
		}
	}

	TRACE("  ReadData(): %lu are available.", fReceiveQueue.Available());

	// more data is available than requested: signal the receive list again
	if (numBytes < fReceiveQueue.Available())
		fReceiveList.Signal();

	bool clone = (flags & MSG_PEEK) != 0;

	ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer);

	TRACE("  ReadData(): %lu bytes kept.", fReceiveQueue.Available());

	// if we are opening the window, check if we should send an ACK
	if (!clone)
		SendAcknowledge(false);

	return receivedBytes;
}
987
988
/*!	Returns the result of _AvailableData(), evaluated with the lock held. */
ssize_t
TCPEndpoint::ReadAvailable()
{
	MutexLocker locker(fLock);

	TRACE("ReadAvailable(): %li", _AvailableData());

	return _AvailableData();
}
998
999
/*!	Sets the maximum number of bytes of the send queue to \a length.
	Always returns B_OK.
*/
status_t
TCPEndpoint::SetSendBufferSize(size_t length)
{
	MutexLocker _(fLock);
	fSendQueue.SetMaxBytes(length);
	return B_OK;
}
1007
1008
/*!	Sets the maximum number of bytes of the receive queue to \a length.
	Always returns B_OK.
*/
status_t
TCPEndpoint::SetReceiveBufferSize(size_t length)
{
	MutexLocker _(fLock);
	fReceiveQueue.SetMaxBytes(length);
	return B_OK;
}
1016
1017
1018status_t
1019TCPEndpoint::GetOption(int option, void* _value, int* _length)
1020{
1021	if (*_length != sizeof(int))
1022		return B_BAD_VALUE;
1023
1024	int* value = (int*)_value;
1025
1026	switch (option) {
1027		case TCP_NODELAY:
1028			if ((fOptions & TCP_NODELAY) != 0)
1029				*value = 1;
1030			else
1031				*value = 0;
1032			return B_OK;
1033
1034		case TCP_MAXSEG:
1035			*value = fReceiveMaxSegmentSize;
1036			return B_OK;
1037
1038		default:
1039			return B_BAD_VALUE;
1040	}
1041}
1042
1043
1044status_t
1045TCPEndpoint::SetOption(int option, const void* _value, int length)
1046{
1047	if (option != TCP_NODELAY)
1048		return B_BAD_VALUE;
1049
1050	if (length != sizeof(int))
1051		return B_BAD_VALUE;
1052
1053	const int* value = (const int*)_value;
1054
1055	MutexLocker _(fLock);
1056	if (*value)
1057		fOptions |= TCP_NODELAY;
1058	else
1059		fOptions &= ~TCP_NODELAY;
1060
1061	return B_OK;
1062}
1063
1064
1065//	#pragma mark - misc
1066
1067
/*!	Returns whether the local address is non-empty, as determined by
	LocalAddress().IsEmpty(true).
*/
bool
TCPEndpoint::IsBound() const
{
	return !LocalAddress().IsEmpty(true);
}
1073
1074
/*!	Returns whether FLAG_LOCAL is set for this endpoint. */
bool
TCPEndpoint::IsLocal() const
{
	return (fFlags & FLAG_LOCAL) != 0;
}
1080
1081
1082status_t
1083TCPEndpoint::DelayedAcknowledge()
1084{
1085	if (gStackModule->cancel_timer(&fDelayedAcknowledgeTimer)) {
1086		// timer was active, send an ACK now (with the exception above,
1087		// we send every other ACK)
1088		return SendAcknowledge(true);
1089	}
1090
1091	gStackModule->set_timer(&fDelayedAcknowledgeTimer,
1092		TCP_DELAYED_ACKNOWLEDGE_TIMEOUT);
1093	return B_OK;
1094}
1095
1096
/*!	Calls _SendQueued() with \a force and 0 as second argument, and returns
	its result.
*/
status_t
TCPEndpoint::SendAcknowledge(bool force)
{
	return _SendQueued(force, 0);
}
1102
1103
/*!	Arms the persist timer with a fixed delay of 1000000.
	NOTE(review): presumably microseconds, ie. one second — confirm.
*/
void
TCPEndpoint::_StartPersistTimer()
{
	gStackModule->set_timer(&fPersistTimer, 1000000LL);
}
1109
1110
1111void
1112TCPEndpoint::_EnterTimeWait()
1113{
1114	TRACE("_EnterTimeWait()\n");
1115
1116	_CancelConnectionTimers();
1117
1118	if (fState == TIME_WAIT && IsLocal()) {
1119		// we do not use TIME_WAIT state for local connections
1120		fFlags |= FLAG_DELETE_ON_CLOSE;
1121		return;
1122	}
1123
1124	_UpdateTimeWait();
1125}
1126
1127
/*!	(Re)arms the time-wait timer with twice TCP_MAX_SEGMENT_LIFETIME. */
void
TCPEndpoint::_UpdateTimeWait()
{
	gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1);
}
1133
1134
/*!	Cancels the retransmit, persist, and delayed acknowledge timers.
	The time-wait timer is not touched.
*/
void
TCPEndpoint::_CancelConnectionTimers()
{
	gStackModule->cancel_timer(&fRetransmitTimer);
	gStackModule->cancel_timer(&fPersistTimer);
	gStackModule->cancel_timer(&fDelayedAcknowledgeTimer);
}
1142
1143
1144/*!	Sends the FIN flag to the peer when the connection is still open.
1145	Moves the endpoint to the next state depending on where it was.
1146*/
1147status_t
1148TCPEndpoint::_Disconnect(bool closing)
1149{
1150	tcp_state previousState = fState;
1151
1152	if (fState == SYNCHRONIZE_RECEIVED || fState == ESTABLISHED)
1153		fState = FINISH_SENT;
1154	else if (fState == FINISH_RECEIVED)
1155		fState = WAIT_FOR_FINISH_ACKNOWLEDGE;
1156	else
1157		return B_OK;
1158
1159	T(State(this));
1160
1161	status_t status = _SendQueued();
1162	if (status != B_OK) {
1163		fState = previousState;
1164		T(State(this));
1165		return status;
1166	}
1167
1168	return B_OK;
1169}
1170
1171
/*!	Switches the endpoint to the ESTABLISHED state. If the socket still has
	a parent, it is marked connected and the accept semaphore is released
	once. Finally, threads waiting on the send list are signalled.
*/
void
TCPEndpoint::_MarkEstablished()
{
	fState = ESTABLISHED;
	T(State(this));

	if (gSocketModule->has_parent(socket)) {
		gSocketModule->set_connected(socket);
		release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE);
	}

	// wakes up _WaitForEstablished(), which waits on the send list
	fSendList.Signal();
}
1185
1186
/*!	Waits on the send list until the endpoint's state is at least
	ESTABLISHED.
	\param locker The endpoint lock held by the caller; released while
		waiting.
	\param timeout Timeout passed on to WaitList::Wait().
	\return B_OK once established, the socket's error if one is set, or the
		error returned by the wait.
*/
status_t
TCPEndpoint::_WaitForEstablished(MutexLocker &locker, bigtime_t timeout)
{
	// TODO: Checking for CLOSED seems correct, but breaks several neon tests.
	// When investigating this, also have a look at _Close() and _HandleReset().
	while (fState < ESTABLISHED/* && fState != CLOSED*/) {
		if (socket->error != B_OK)
			return socket->error;

		status_t status = fSendList.Wait(locker, timeout);
		if (status < B_OK)
			return status;
	}

	return B_OK;
}
1203
1204
1205//	#pragma mark - receive
1206
1207
/*!	Moves the endpoint to the CLOSED state: cancels the connection timers,
	sets FLAG_DELETE_ON_CLOSE, and wakes up waiting senders and readers.
	If the socket still has a parent, it is additionally marked as aborted.
*/
void
TCPEndpoint::_Close()
{
	_CancelConnectionTimers();
	fState = CLOSED;
	T(State(this));

	fFlags |= FLAG_DELETE_ON_CLOSE;

	fSendList.Signal();
	_NotifyReader();

	if (gSocketModule->has_parent(socket)) {
		// We still have a parent - obviously, we haven't been accepted yet,
		// so no one could ever close us.
		_CancelConnectionTimers();
		gSocketModule->set_aborted(socket);
	}
}
1227
1228
/*!	Stores \a error as the socket's error, closes the endpoint via _Close(),
	and notifies the socket's B_SELECT_WRITE and B_SELECT_ERROR events with
	\a error.
*/
void
TCPEndpoint::_HandleReset(status_t error)
{
	socket->error = error;
	_Close();

	gSocketModule->notify(socket, B_SELECT_WRITE, error);
	gSocketModule->notify(socket, B_SELECT_ERROR, error);
}
1238
1239
1240void
1241TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment)
1242{
1243	if (++fDuplicateAcknowledgeCount < 3)
1244		return;
1245
1246	if (fDuplicateAcknowledgeCount == 3) {
1247		_ResetSlowStart();
1248		fCongestionWindow = fSlowStartThreshold + 3
1249			* fSendMaxSegmentSize;
1250		fSendNext = segment.acknowledge;
1251	} else if (fDuplicateAcknowledgeCount > 3)
1252		fCongestionWindow += fSendMaxSegmentSize;
1253
1254	_SendQueued();
1255}
1256
1257
1258void
1259TCPEndpoint::_UpdateTimestamps(tcp_segment_header& segment,
1260	size_t segmentLength)
1261{
1262	if (fFlags & FLAG_OPTION_TIMESTAMP) {
1263		tcp_sequence sequence(segment.sequence);
1264
1265		if (fLastAcknowledgeSent >= sequence
1266			&& fLastAcknowledgeSent < (sequence + segmentLength))
1267			fReceivedTimestamp = segment.timestamp_value;
1268	}
1269}
1270
1271
1272ssize_t
1273TCPEndpoint::_AvailableData() const
1274{
1275	// TODO: Refer to the FLAG_NO_RECEIVE comment above regarding
1276	//       the application of FLAG_NO_RECEIVE in listen()ing
1277	//       sockets.
1278	if (fState == LISTEN)
1279		return gSocketModule->count_connected(socket);
1280	if (fState == SYNCHRONIZE_SENT)
1281		return 0;
1282
1283	ssize_t availableData = fReceiveQueue.Available();
1284
1285	if (availableData == 0 && !_ShouldReceive())
1286		return ENOTCONN;
1287
1288	return availableData;
1289}
1290
1291
1292void
1293TCPEndpoint::_NotifyReader()
1294{
1295	fReceiveList.Signal();
1296	gSocketModule->notify(socket, B_SELECT_READ, _AvailableData());
1297}
1298
1299
1300bool
1301TCPEndpoint::_AddData(tcp_segment_header& segment, net_buffer* buffer)
1302{
1303	if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1304		// Remember the position of the finish received flag
1305		fFinishReceived = true;
1306		fFinishReceivedAt = segment.sequence + buffer->size;
1307	}
1308
1309	fReceiveQueue.Add(buffer, segment.sequence);
1310	fReceiveNext = fReceiveQueue.NextSequence();
1311
1312	if (fFinishReceived) {
1313		// Set or reset the finish flag on the current segment
1314		if (fReceiveNext < fFinishReceivedAt)
1315			segment.flags &= ~TCP_FLAG_FINISH;
1316		else
1317			segment.flags |= TCP_FLAG_FINISH;
1318	}
1319
1320	TRACE("  _AddData(): adding data, receive next = %lu. Now have %lu bytes.",
1321		fReceiveNext.Number(), fReceiveQueue.Available());
1322
1323	if ((segment.flags & TCP_FLAG_PUSH) != 0)
1324		fReceiveQueue.SetPushPointer();
1325
1326	return fReceiveQueue.Available() > 0;
1327}
1328
1329
1330void
1331TCPEndpoint::_PrepareReceivePath(tcp_segment_header& segment)
1332{
1333	fInitialReceiveSequence = segment.sequence;
1334	fFinishReceived = false;
1335
1336	// count the received SYN
1337	segment.sequence++;
1338
1339	fReceiveNext = segment.sequence;
1340	fReceiveQueue.SetInitialSequence(segment.sequence);
1341
1342	if ((fOptions & TCP_NOOPT) == 0) {
1343		if (segment.max_segment_size > 0)
1344			fSendMaxSegmentSize = segment.max_segment_size;
1345
1346		if (segment.options & TCP_HAS_WINDOW_SCALE) {
1347			fFlags |= FLAG_OPTION_WINDOW_SCALE;
1348			fSendWindowShift = segment.window_shift;
1349		} else {
1350			fFlags &= ~FLAG_OPTION_WINDOW_SCALE;
1351			fReceiveWindowShift = 0;
1352		}
1353
1354		if (segment.options & TCP_HAS_TIMESTAMPS) {
1355			fFlags |= FLAG_OPTION_TIMESTAMP;
1356			fReceivedTimestamp = segment.timestamp_value;
1357		} else
1358			fFlags &= ~FLAG_OPTION_TIMESTAMP;
1359	}
1360
1361	fCongestionWindow = 2 * fSendMaxSegmentSize;
1362	fSlowStartThreshold = (uint32)segment.advertised_window << fSendWindowShift;
1363}
1364
1365
1366bool
1367TCPEndpoint::_ShouldReceive() const
1368{
1369	if ((fFlags & FLAG_NO_RECEIVE) != 0)
1370		return false;
1371
1372	return fState == ESTABLISHED || fState == FINISH_SENT
1373		|| fState == FINISH_ACKNOWLEDGED;
1374}
1375
1376
1377int32
1378TCPEndpoint::_Spawn(TCPEndpoint* parent, tcp_segment_header& segment,
1379	net_buffer* buffer)
1380{
1381	MutexLocker _(fLock);
1382
1383	// TODO error checking
1384	ProtocolSocket::Open();
1385
1386	fState = SYNCHRONIZE_RECEIVED;
1387	T(Spawn(parent, this));
1388
1389	fManager = parent->fManager;
1390
1391	LocalAddress().SetTo(buffer->destination);
1392	PeerAddress().SetTo(buffer->source);
1393
1394	TRACE("Spawn()");
1395
1396	// TODO: proper error handling!
1397	if (fManager->BindChild(this) != B_OK) {
1398		T(Error(this, "binding failed", __LINE__));
1399		return DROP;
1400	}
1401	if (_PrepareSendPath(*PeerAddress()) != B_OK) {
1402		T(Error(this, "prepare send faild", __LINE__));
1403		return DROP;
1404	}
1405
1406	fOptions = parent->fOptions;
1407	fAcceptSemaphore = parent->fAcceptSemaphore;
1408
1409	_PrepareReceivePath(segment);
1410
1411	// send SYN+ACK
1412	if (_SendQueued() != B_OK) {
1413		T(Error(this, "sending failed", __LINE__));
1414		return DROP;
1415	}
1416
1417	segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
1418		// we handled this flag now, it must not be set for further processing
1419
1420	return _Receive(segment, buffer);
1421}
1422
1423
1424int32
1425TCPEndpoint::_ListenReceive(tcp_segment_header& segment, net_buffer* buffer)
1426{
1427	TRACE("ListenReceive()");
1428
1429	// Essentially, we accept only TCP_FLAG_SYNCHRONIZE in this state,
1430	// but the error behaviour differs
1431	if (segment.flags & TCP_FLAG_RESET)
1432		return DROP;
1433	if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
1434		return DROP | RESET;
1435	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
1436		return DROP;
1437
1438	// TODO: drop broadcast/multicast
1439
1440	// spawn new endpoint for accept()
1441	net_socket* newSocket;
1442	if (gSocketModule->spawn_pending_socket(socket, &newSocket) < B_OK) {
1443		T(Error(this, "spawning failed", __LINE__));
1444		return DROP;
1445	}
1446
1447	return ((TCPEndpoint *)newSocket->first_protocol)->_Spawn(this,
1448		segment, buffer);
1449}
1450
1451
1452int32
1453TCPEndpoint::_SynchronizeSentReceive(tcp_segment_header &segment,
1454	net_buffer *buffer)
1455{
1456	TRACE("_SynchronizeSentReceive()");
1457
1458	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
1459		&& (fInitialSendSequence >= segment.acknowledge
1460			|| fSendMax < segment.acknowledge))
1461		return DROP | RESET;
1462
1463	if (segment.flags & TCP_FLAG_RESET) {
1464		_HandleReset(ECONNREFUSED);
1465		return DROP;
1466	}
1467
1468	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
1469		return DROP;
1470
1471	fSendUnacknowledged = segment.acknowledge;
1472	_PrepareReceivePath(segment);
1473
1474	if (segment.flags & TCP_FLAG_ACKNOWLEDGE) {
1475		_MarkEstablished();
1476	} else {
1477		// simultaneous open
1478		fState = SYNCHRONIZE_RECEIVED;
1479		T(State(this));
1480	}
1481
1482	segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
1483		// we handled this flag now, it must not be set for further processing
1484
1485	return _Receive(segment, buffer) | IMMEDIATE_ACKNOWLEDGE;
1486}
1487
1488
1489int32
1490TCPEndpoint::_Receive(tcp_segment_header& segment, net_buffer* buffer)
1491{
1492	uint32 advertisedWindow = (uint32)segment.advertised_window
1493		<< fSendWindowShift;
1494	size_t segmentLength = buffer->size;
1495
1496	// First, handle the most common case for uni-directional data transfer
1497	// (known as header prediction - the segment must not change the window,
1498	// and must be the expected sequence, and contain no control flags)
1499
1500	if (fState == ESTABLISHED
1501		&& segment.AcknowledgeOnly()
1502		&& fReceiveNext == segment.sequence
1503		&& advertisedWindow > 0 && advertisedWindow == fSendWindow
1504		&& fSendNext == fSendMax) {
1505		_UpdateTimestamps(segment, segmentLength);
1506
1507		if (segmentLength == 0) {
1508			// this is a pure acknowledge segment - we're on the sending end
1509			if (fSendUnacknowledged < segment.acknowledge
1510				&& fSendMax >= segment.acknowledge) {
1511				_Acknowledged(segment);
1512				return DROP;
1513			}
1514		} else if (segment.acknowledge == fSendUnacknowledged
1515			&& fReceiveQueue.IsContiguous()
1516			&& fReceiveQueue.Free() >= segmentLength
1517			&& (fFlags & FLAG_NO_RECEIVE) == 0) {
1518			if (_AddData(segment, buffer))
1519				_NotifyReader();
1520
1521			return KEEP | ((segment.flags & TCP_FLAG_PUSH) != 0
1522				? IMMEDIATE_ACKNOWLEDGE : ACKNOWLEDGE);
1523		}
1524	}
1525
1526	// The fast path was not applicable, so we continue with the standard
1527	// processing of the incoming segment
1528
1529	ASSERT(fState != SYNCHRONIZE_SENT && fState != LISTEN);
1530
1531	if (fState != CLOSED && fState != TIME_WAIT) {
1532		// Check sequence number
1533		if (!segment_in_sequence(segment, segmentLength, fReceiveNext,
1534				fReceiveWindow)) {
1535			TRACE("  Receive(): segment out of window, next: %lu wnd: %lu",
1536				fReceiveNext.Number(), fReceiveWindow);
1537			if ((segment.flags & TCP_FLAG_RESET) != 0) {
1538				// TODO: this doesn't look right - review!
1539				return DROP;
1540			}
1541			return DROP | IMMEDIATE_ACKNOWLEDGE;
1542		}
1543	}
1544
1545	if ((segment.flags & TCP_FLAG_RESET) != 0) {
1546		// Is this a valid reset?
1547		// We generally ignore resets in time wait state (see RFC 1337)
1548		if (fLastAcknowledgeSent <= segment.sequence
1549			&& tcp_sequence(segment.sequence) < (fLastAcknowledgeSent
1550				+ fReceiveWindow)
1551			&& fState != TIME_WAIT) {
1552			status_t error;
1553			if (fState == SYNCHRONIZE_RECEIVED)
1554				error = ECONNREFUSED;
1555			else if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE)
1556				error = ENOTCONN;
1557			else
1558				error = ECONNRESET;
1559
1560			_HandleReset(error);
1561		}
1562
1563		return DROP;
1564	}
1565
1566	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
1567		|| (fState == SYNCHRONIZE_RECEIVED
1568			&& (fInitialReceiveSequence > segment.sequence
1569				|| ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
1570					&& (fSendUnacknowledged > segment.acknowledge
1571						|| fSendMax < segment.acknowledge))))) {
1572		// reset the connection - either the initial SYN was faulty, or we
1573		// received a SYN within the data stream
1574		return DROP | RESET;
1575	}
1576
1577	// TODO: Check this! Why do we advertize a window outside of what we should
1578	// buffer?
1579	fReceiveWindow = max_c(fReceiveQueue.Free(), fReceiveWindow);
1580		// the window must not shrink
1581
1582	// trim buffer to be within the receive window
1583	int32 drop = (int32)(fReceiveNext - segment.sequence).Number();
1584	if (drop > 0) {
1585		if ((uint32)drop > buffer->size
1586			|| ((uint32)drop == buffer->size
1587				&& (segment.flags & TCP_FLAG_FINISH) == 0)) {
1588			// don't accidently remove a FIN we shouldn't remove
1589			segment.flags &= ~TCP_FLAG_FINISH;
1590			drop = buffer->size;
1591		}
1592
1593		// remove duplicate data at the start
1594		TRACE("* remove %ld bytes from the start", drop);
1595		gBufferModule->remove_header(buffer, drop);
1596		segment.sequence += drop;
1597	}
1598
1599	int32 action = KEEP;
1600
1601	drop = (int32)(segment.sequence + buffer->size
1602		- (fReceiveNext + fReceiveWindow)).Number();
1603	if (drop > 0) {
1604		// remove data exceeding our window
1605		if ((uint32)drop >= buffer->size) {
1606			// if we can accept data, or the segment is not what we'd expect,
1607			// drop the segment (an immediate acknowledge is always triggered)
1608			if (fReceiveWindow != 0 || segment.sequence != fReceiveNext)
1609				return DROP | IMMEDIATE_ACKNOWLEDGE;
1610
1611			action |= IMMEDIATE_ACKNOWLEDGE;
1612		}
1613
1614		if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1615			// we need to remove the finish, too, as part of the data
1616			drop--;
1617		}
1618
1619		segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH);
1620		TRACE("* remove %ld bytes from the end", drop);
1621		gBufferModule->remove_trailer(buffer, drop);
1622	}
1623
1624#ifdef TRACE_TCP
1625	if (advertisedWindow > fSendWindow) {
1626		TRACE("  Receive(): Window update %lu -> %lu", fSendWindow,
1627			advertisedWindow);
1628	}
1629#endif
1630
1631	fSendWindow = advertisedWindow;
1632	if (advertisedWindow > fSendMaxWindow)
1633		fSendMaxWindow = advertisedWindow;
1634
1635	// Then look at the acknowledgement for any updates
1636
1637	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0) {
1638		// process acknowledged data
1639		if (fState == SYNCHRONIZE_RECEIVED)
1640			_MarkEstablished();
1641
1642		if (fSendMax < segment.acknowledge)
1643			return DROP | IMMEDIATE_ACKNOWLEDGE;
1644
1645		if (segment.acknowledge < fSendUnacknowledged) {
1646			if (buffer->size == 0 && advertisedWindow == fSendWindow
1647				&& (segment.flags & TCP_FLAG_FINISH) == 0) {
1648				TRACE("Receive(): duplicate ack!");
1649
1650				_DuplicateAcknowledge(segment);
1651			}
1652
1653			return DROP;
1654		} else {
1655			// this segment acknowledges in flight data
1656
1657			if (fDuplicateAcknowledgeCount >= 3) {
1658				// deflate the window.
1659				fCongestionWindow = fSlowStartThreshold;
1660			}
1661
1662			fDuplicateAcknowledgeCount = 0;
1663
1664			if (fSendMax == segment.acknowledge)
1665				TRACE("Receive(): all inflight data ack'd!");
1666
1667			if (segment.acknowledge > fSendQueue.LastSequence()
1668					&& fState > ESTABLISHED) {
1669				TRACE("Receive(): FIN has been acknowledged!");
1670
1671				switch (fState) {
1672					case FINISH_SENT:
1673						fState = FINISH_ACKNOWLEDGED;
1674						T(State(this));
1675						break;
1676					case CLOSING:
1677						fState = TIME_WAIT;
1678						T(State(this));
1679						_EnterTimeWait();
1680						return DROP;
1681					case WAIT_FOR_FINISH_ACKNOWLEDGE:
1682						_Close();
1683						break;
1684
1685					default:
1686						break;
1687				}
1688			}
1689
1690			if (fState != CLOSED)
1691				_Acknowledged(segment);
1692		}
1693	}
1694
1695	if (segment.flags & TCP_FLAG_URGENT) {
1696		if (fState == ESTABLISHED || fState == FINISH_SENT
1697			|| fState == FINISH_ACKNOWLEDGED) {
1698			// TODO: Handle urgent data:
1699			//  - RCV.UP <- max(RCV.UP, SEG.UP)
1700			//  - signal the user that urgent data is available (SIGURG)
1701		}
1702	}
1703
1704	bool notify = false;
1705
1706	// The buffer may be freed if its data is added to the queue, so cache
1707	// the size as we still need it later.
1708	uint32 bufferSize = buffer->size;
1709
1710	if ((bufferSize > 0 || (segment.flags & TCP_FLAG_FINISH) != 0)
1711		&& _ShouldReceive())
1712		notify = _AddData(segment, buffer);
1713	else {
1714		if ((fFlags & FLAG_NO_RECEIVE) != 0)
1715			fReceiveNext += buffer->size;
1716
1717		action = (action & ~KEEP) | DROP;
1718	}
1719
1720	if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1721		segmentLength++;
1722		if (fState != CLOSED && fState != LISTEN && fState != SYNCHRONIZE_SENT) {
1723			TRACE("Receive(): peer is finishing connection!");
1724			fReceiveNext++;
1725			notify = true;
1726
1727			// FIN implies PUSH
1728			fReceiveQueue.SetPushPointer();
1729
1730			// we'll reply immediately to the FIN if we are not
1731			// transitioning to TIME WAIT so we immediatly ACK it.
1732			action |= IMMEDIATE_ACKNOWLEDGE;
1733
1734			// other side is closing connection; change states
1735			switch (fState) {
1736				case ESTABLISHED:
1737				case SYNCHRONIZE_RECEIVED:
1738					fState = FINISH_RECEIVED;
1739					T(State(this));
1740					break;
1741				case FINISH_SENT:
1742					// simultaneous close
1743					fState = CLOSING;
1744					T(State(this));
1745					break;
1746				case FINISH_ACKNOWLEDGED:
1747					fState = TIME_WAIT;
1748					T(State(this));
1749					_EnterTimeWait();
1750					break;
1751				case TIME_WAIT:
1752					_UpdateTimeWait();
1753					break;
1754
1755				default:
1756					break;
1757			}
1758		}
1759	}
1760
1761	if (notify)
1762		_NotifyReader();
1763
1764	if (bufferSize > 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0)
1765		action |= ACKNOWLEDGE;
1766
1767	_UpdateTimestamps(segment, segmentLength);
1768
1769	TRACE("Receive() Action %ld", action);
1770
1771	return action;
1772}
1773
1774
1775int32
1776TCPEndpoint::SegmentReceived(tcp_segment_header& segment, net_buffer* buffer)
1777{
1778	MutexLocker locker(fLock);
1779
1780	TRACE("SegmentReceived(): buffer %p (%lu bytes) address %s to %s\n"
1781		"\tflags 0x%x, seq %lu, ack %lu, wnd %lu",
1782		buffer, buffer->size, PrintAddress(buffer->source),
1783		PrintAddress(buffer->destination), segment.flags, segment.sequence,
1784		segment.acknowledge,
1785		(uint32)segment.advertised_window << fSendWindowShift);
1786	T(Receive(this, segment,
1787		(uint32)segment.advertised_window << fSendWindowShift, buffer));
1788	int32 segmentAction = DROP;
1789
1790	switch (fState) {
1791		case LISTEN:
1792			segmentAction = _ListenReceive(segment, buffer);
1793			break;
1794
1795		case SYNCHRONIZE_SENT:
1796			segmentAction = _SynchronizeSentReceive(segment, buffer);
1797			break;
1798
1799		case SYNCHRONIZE_RECEIVED:
1800		case ESTABLISHED:
1801		case FINISH_RECEIVED:
1802		case WAIT_FOR_FINISH_ACKNOWLEDGE:
1803		case FINISH_SENT:
1804		case FINISH_ACKNOWLEDGED:
1805		case CLOSING:
1806		case TIME_WAIT:
1807		case CLOSED:
1808			segmentAction = _Receive(segment, buffer);
1809			break;
1810	}
1811
1812	// process acknowledge action as asked for by the *Receive() method
1813	if (segmentAction & IMMEDIATE_ACKNOWLEDGE)
1814		SendAcknowledge(true);
1815	else if (segmentAction & ACKNOWLEDGE)
1816		DelayedAcknowledge();
1817
1818	if ((fFlags & (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE))
1819			== (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE)) {
1820		locker.Unlock();
1821		gSocketModule->release_socket(socket);
1822	}
1823
1824	return segmentAction;
1825}
1826
1827
1828//	#pragma mark - send
1829
1830
1831inline uint8
1832TCPEndpoint::_CurrentFlags()
1833{
1834	// we don't set FLAG_FINISH here, instead we do it
1835	// conditionally below depending if we are sending
1836	// the last bytes of the send queue.
1837
1838	switch (fState) {
1839		case CLOSED:
1840			return TCP_FLAG_RESET | TCP_FLAG_ACKNOWLEDGE;
1841
1842		case SYNCHRONIZE_SENT:
1843			return TCP_FLAG_SYNCHRONIZE;
1844		case SYNCHRONIZE_RECEIVED:
1845			return TCP_FLAG_SYNCHRONIZE | TCP_FLAG_ACKNOWLEDGE;
1846
1847		case ESTABLISHED:
1848		case FINISH_RECEIVED:
1849		case FINISH_ACKNOWLEDGED:
1850		case TIME_WAIT:
1851		case WAIT_FOR_FINISH_ACKNOWLEDGE:
1852		case FINISH_SENT:
1853		case CLOSING:
1854			return TCP_FLAG_ACKNOWLEDGE;
1855
1856		default:
1857			return 0;
1858	}
1859}
1860
1861
1862inline bool
1863TCPEndpoint::_ShouldSendSegment(tcp_segment_header& segment, uint32 length,
1864	uint32 segmentMaxSize, uint32 flightSize)
1865{
1866	if (length > 0) {
1867		// Avoid the silly window syndrome - we only send a segment in case:
1868		// - we have a full segment to send, or
1869		// - we're at the end of our buffer queue, or
1870		// - the buffer is at least larger than half of the maximum send window,
1871		//   or
1872		// - we're retransmitting data
1873		if (length == segmentMaxSize
1874			|| (fOptions & TCP_NODELAY) != 0
1875			|| tcp_sequence(fSendNext + length) == fSendQueue.LastSequence()
1876			|| (fSendMaxWindow > 0 && length >= fSendMaxWindow / 2))
1877			return true;
1878	}
1879
1880	// check if we need to send a window update to the peer
1881	if (segment.advertised_window > 0) {
1882		// correct the window to take into account what already has been advertised
1883		uint32 window = (segment.advertised_window << fReceiveWindowShift)
1884			- (fReceiveMaxAdvertised - fReceiveNext).Number();
1885
1886		// if we can advertise a window larger than twice the maximum segment
1887		// size, or half the maximum buffer size we send a window update
1888		if (window >= (fReceiveMaxSegmentSize << 1)
1889			|| window >= (socket->receive.buffer_size >> 1))
1890			return true;
1891	}
1892
1893	if ((segment.flags & (TCP_FLAG_SYNCHRONIZE | TCP_FLAG_FINISH
1894			| TCP_FLAG_RESET)) != 0)
1895		return true;
1896
1897	// We do have urgent data pending
1898	if (fSendUrgentOffset > fSendNext)
1899		return true;
1900
1901	// there is no reason to send a segment just now
1902	return false;
1903}
1904
1905
1906status_t
1907TCPEndpoint::_SendQueued(bool force)
1908{
1909	return _SendQueued(force, fSendWindow);
1910}
1911
1912
/*!	Sends one or more TCP segments with the data waiting in the queue, or some
	specific flags that need to be sent.

	\param force If \c true, the segment is sent without consulting
		_ShouldSendSegment(); if the window is closed while data is still
		queued, a single byte is sent nevertheless.
	\param sendWindow The send window to use; it is further limited by the
		congestion window.
	\return \c B_ERROR if there is no route or the endpoint is in LISTEN
		state, \c B_NO_MEMORY if no buffer could be created, the error of the
		failed step if retrieving the data, adding the header, or sending
		failed, \c B_OK otherwise.
*/
status_t
TCPEndpoint::_SendQueued(bool force, uint32 sendWindow)
{
	if (fRoute == NULL)
		return B_ERROR;

	// in passive state?
	if (fState == LISTEN)
		return B_ERROR;

	// The flags depend on the current state only; FIN and PUSH are added in
	// the loop below as needed.
	tcp_segment_header segment(_CurrentFlags());

	if ((fOptions & TCP_NOOPT) == 0) {
		if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0) {
			segment.options |= TCP_HAS_TIMESTAMPS;
			segment.timestamp_reply = fReceivedTimestamp;
			segment.timestamp_value = tcp_now();
		}

		// only the initial SYN carries the connection establishment options
		if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
			&& fSendNext == fInitialSendSequence) {
			// add connection establishment options
			segment.max_segment_size = fReceiveMaxSegmentSize;
			if (fFlags & FLAG_OPTION_WINDOW_SCALE) {
				segment.options |= TCP_HAS_WINDOW_SCALE;
				segment.window_shift = fReceiveWindowShift;
			}
		}
	}

	// advertise the free space of the receive queue as our window
	size_t availableBytes = fReceiveQueue.Free();
	if (fFlags & FLAG_OPTION_WINDOW_SCALE)
		segment.advertised_window = availableBytes >> fReceiveWindowShift;
	else
		segment.advertised_window = min_c(TCP_MAX_WINDOW, availableBytes);

	segment.acknowledge = fReceiveNext.Number();

	// Process urgent data
	if (fSendUrgentOffset > fSendNext) {
		segment.flags |= TCP_FLAG_URGENT;
		segment.urgent_offset = (fSendUrgentOffset - fSendNext).Number();
	} else {
		fSendUrgentOffset = fSendUnacknowledged.Number();
			// Keep urgent offset updated, so that it doesn't reach into our
			// send window on overlap
		segment.urgent_offset = 0;
	}

	// the congestion window limits the window we may use
	if (fCongestionWindow > 0 && fCongestionWindow < sendWindow)
		sendWindow = fCongestionWindow;

	// fSendUnacknowledged
	//  |    fSendNext      fSendMax
	//  |        |              |
	//  v        v              v
	//  -----------------------------------
	//  | effective window           |
	//  -----------------------------------

	// Flight size represents the window of data which is currently in the
	// ether. We should never send data such as the flight size becomes larger
	// than the effective window. Note however that the effective window may be
	// reduced (by congestion for instance), so at some point in time flight
	// size may be larger than the currently calculated window.

	uint32 flightSize = (fSendMax - fSendUnacknowledged).Number();
	uint32 consumedWindow = (fSendNext - fSendUnacknowledged).Number();

	// subtract what has already been sent from the window
	if (consumedWindow > sendWindow) {
		sendWindow = 0;
		// TODO: enter persist state? try to get a window update.
	} else
		sendWindow -= consumedWindow;

	if (force && sendWindow == 0 && fSendNext <= fSendQueue.LastSequence()) {
		// send one byte of data to ask for a window update
		// (triggered by the persist timer)
		sendWindow = 1;
	}

	// the number of bytes to be sent by the loop below
	uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow);
	tcp_sequence previousSendNext = fSendNext;

	do {
		// the options reduce the space available for data
		uint32 segmentMaxSize = fSendMaxSegmentSize
			- tcp_options_length(segment);
		uint32 segmentLength = min_c(length, segmentMaxSize);

		// Is this the segment that reaches the end of the send queue?
		if (fSendNext + segmentLength == fSendQueue.LastSequence()) {
			if (state_needs_finish(fState))
				segment.flags |= TCP_FLAG_FINISH;
			if (length > 0)
				segment.flags |= TCP_FLAG_PUSH;
		}

		// Determine if we should really send this segment
		if (!force && !_ShouldSendSegment(segment, segmentLength,
				segmentMaxSize, flightSize)) {
			// Data is left in the queue, but nothing is sent now - make sure
			// one of the timers eventually triggers sending it.
			if (fSendQueue.Available()
				&& !gStackModule->is_timer_active(&fPersistTimer)
				&& !gStackModule->is_timer_active(&fRetransmitTimer))
				_StartPersistTimer();
			break;
		}

		net_buffer *buffer = gBufferModule->create(256);
		if (buffer == NULL)
			return B_NO_MEMORY;

		status_t status = B_OK;
		if (segmentLength > 0)
			status = fSendQueue.Get(buffer, fSendNext, segmentLength);
		if (status < B_OK) {
			gBufferModule->free(buffer);
			return status;
		}

		LocalAddress().CopyTo(buffer->source);
		PeerAddress().CopyTo(buffer->destination);

		// the size of the data only, taken before the header is added
		uint32 size = buffer->size;
		segment.sequence = fSendNext.Number();

		TRACE("SendQueued(): buffer %p (%lu bytes) address %s to %s\n"
			"\tflags 0x%x, seq %lu, ack %lu, rwnd %hu, cwnd %lu, ssthresh %lu\n"
			"\tlen %lu first %lu last %lu",
			buffer, buffer->size, PrintAddress(buffer->source),
			PrintAddress(buffer->destination), segment.flags, segment.sequence,
			segment.acknowledge, segment.advertised_window,
			fCongestionWindow, fSlowStartThreshold, segmentLength,
			fSendQueue.FirstSequence().Number(),
			fSendQueue.LastSequence().Number());
		T(Send(this, segment, buffer, fSendQueue.FirstSequence(),
			fSendQueue.LastSequence()));

		PROBE(buffer, sendWindow);
		sendWindow -= buffer->size;

		status = add_tcp_header(AddressModule(), segment, buffer);
		if (status != B_OK) {
			gBufferModule->free(buffer);
			return status;
		}

		// Update send status - we need to do this before we send the data
		// for local connections as the answer is directly handled

		if (segment.flags & TCP_FLAG_SYNCHRONIZE) {
			// the establishment options must not be repeated in any further
			// segment; the SYN itself occupies one sequence number
			segment.options &= ~TCP_HAS_WINDOW_SCALE;
			segment.max_segment_size = 0;
			size++;
		}

		// the FIN occupies one sequence number as well
		if (segment.flags & TCP_FLAG_FINISH)
			size++;

		// remember the old fSendMax, so that it can be restored on failure
		uint32 sendMax = fSendMax.Number();
		fSendNext += size;
		if (fSendMax < fSendNext)
			fSendMax = fSendNext;

		fReceiveMaxAdvertised = fReceiveNext
			+ ((uint32)segment.advertised_window << fReceiveWindowShift);

		status = next->module->send_routed_data(next, fRoute, buffer);
		if (status < B_OK) {
			gBufferModule->free(buffer);

			fSendNext = segment.sequence;
			fSendMax = sendMax;
				// restore send status
			return status;
		}

		if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
			fLastAcknowledgeSent = segment.acknowledge;

		length -= segmentLength;
		// these flags are sent only once
		segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET
			| TCP_FLAG_FINISH);
	} while (length > 0);

	// if we sent data from the beginning of the send queue,
	// start the retransmission timer
	if (previousSendNext == fSendUnacknowledged
		&& fSendNext > previousSendNext) {
		TRACE("  SendQueue(): set retransmit timer with rto %llu",
			fRetransmitTimeout);

		gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout);
	}

	return B_OK;
}
2111
2112
2113int
2114TCPEndpoint::_MaxSegmentSize(const sockaddr* address) const
2115{
2116	return next->module->get_mtu(next, address) - sizeof(tcp_header);
2117}
2118
2119
2120status_t
2121TCPEndpoint::_PrepareSendPath(const sockaddr* peer)
2122{
2123	if (fRoute == NULL) {
2124		fRoute = gDatalinkModule->get_route(Domain(), peer);
2125		if (fRoute == NULL)
2126			return ENETUNREACH;
2127
2128		if ((fRoute->flags & RTF_LOCAL) != 0)
2129			fFlags |= FLAG_LOCAL;
2130	}
2131
2132	// make sure connection does not already exist
2133	status_t status = fManager->SetConnection(this, *LocalAddress(), peer,
2134		fRoute->interface_address->local);
2135	if (status < B_OK)
2136		return status;
2137
2138	fInitialSendSequence = system_time() >> 4;
2139	fSendNext = fInitialSendSequence;
2140	fSendUnacknowledged = fInitialSendSequence;
2141	fSendMax = fInitialSendSequence;
2142	fSendUrgentOffset = fInitialSendSequence;
2143
2144	// we are counting the SYN here
2145	fSendQueue.SetInitialSequence(fSendNext + 1);
2146
2147	fReceiveMaxSegmentSize = _MaxSegmentSize(peer);
2148
2149	// Compute the window shift we advertise to our peer - if it doesn't support
2150	// this option, this will be reset to 0 (when its SYN is received)
2151	fReceiveWindowShift = 0;
2152	while (fReceiveWindowShift < TCP_MAX_WINDOW_SHIFT
2153		&& (0xffffUL << fReceiveWindowShift) < socket->receive.buffer_size) {
2154		fReceiveWindowShift++;
2155	}
2156
2157	return B_OK;
2158}
2159
2160
2161void
2162TCPEndpoint::_Acknowledged(tcp_segment_header& segment)
2163{
2164	size_t previouslyUsed = fSendQueue.Used();
2165
2166	fSendQueue.RemoveUntil(segment.acknowledge);
2167	fSendUnacknowledged = segment.acknowledge;
2168
2169	if (fSendNext < fSendUnacknowledged)
2170		fSendNext = fSendUnacknowledged;
2171
2172	if (fSendUnacknowledged == fSendMax)
2173		gStackModule->cancel_timer(&fRetransmitTimer);
2174
2175	if (fSendQueue.Used() < previouslyUsed) {
2176		// this ACK acknowledged data
2177
2178		if (segment.options & TCP_HAS_TIMESTAMPS)
2179			_UpdateRoundTripTime(tcp_diff_timestamp(segment.timestamp_reply));
2180		else {
2181			// TODO: Fallback to RFC 793 type estimation
2182		}
2183
2184		if (is_writable(fState)) {
2185			// notify threads waiting on the socket to become writable again
2186			fSendList.Signal();
2187			gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Used());
2188		}
2189
2190		if (fCongestionWindow < fSlowStartThreshold)
2191			fCongestionWindow += fSendMaxSegmentSize;
2192	}
2193
2194	if (fCongestionWindow >= fSlowStartThreshold) {
2195		uint32 increment = fSendMaxSegmentSize * fSendMaxSegmentSize;
2196
2197		if (increment < fCongestionWindow)
2198			increment = 1;
2199		else
2200			increment /= fCongestionWindow;
2201
2202		fCongestionWindow += increment;
2203	}
2204
2205	// if there is data left to be send, send it now
2206	if (fSendQueue.Used() > 0)
2207		_SendQueued();
2208}
2209
2210
2211void
2212TCPEndpoint::_Retransmit()
2213{
2214	TRACE("Retransmit()");
2215	_ResetSlowStart();
2216	fSendNext = fSendUnacknowledged;
2217	_SendQueued();
2218}
2219
2220
2221void
2222TCPEndpoint::_UpdateRoundTripTime(int32 roundTripTime)
2223{
2224	int32 rtt = roundTripTime;
2225
2226	// "smooth" round trip time as per Van Jacobson
2227	rtt -= fRoundTripTime / 8;
2228	fRoundTripTime += rtt;
2229	if (rtt < 0)
2230		rtt = -rtt;
2231	rtt -= fRoundTripDeviation / 4;
2232	fRoundTripDeviation += rtt;
2233
2234	fRetransmitTimeout = ((fRoundTripTime / 4 + fRoundTripDeviation) / 2)
2235		* kTimestampFactor;
2236
2237	TRACE("  RTO is now %llu (after rtt %ldms)", fRetransmitTimeout,
2238		roundTripTime);
2239}
2240
2241
2242void
2243TCPEndpoint::_ResetSlowStart()
2244{
2245	fSlowStartThreshold = max_c((fSendMax - fSendUnacknowledged).Number() / 2,
2246		2 * fSendMaxSegmentSize);
2247	fCongestionWindow = fSendMaxSegmentSize;
2248}
2249
2250
2251//	#pragma mark - timer
2252
2253
2254/*static*/ void
2255TCPEndpoint::_RetransmitTimer(net_timer* timer, void* _endpoint)
2256{
2257	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2258	T(Timer(endpoint, "retransmit"));
2259
2260	MutexLocker locker(endpoint->fLock);
2261	if (!locker.IsLocked())
2262		return;
2263
2264	endpoint->_Retransmit();
2265}
2266
2267
2268/*static*/ void
2269TCPEndpoint::_PersistTimer(net_timer* timer, void* _endpoint)
2270{
2271	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2272	T(Timer(endpoint, "persist"));
2273
2274	MutexLocker locker(endpoint->fLock);
2275	if (!locker.IsLocked())
2276		return;
2277
2278	// the timer might not have been canceled early enough
2279	if (endpoint->State() == CLOSED)
2280		return;
2281
2282	endpoint->_SendQueued(true);
2283}
2284
2285
2286/*static*/ void
2287TCPEndpoint::_DelayedAcknowledgeTimer(net_timer* timer, void* _endpoint)
2288{
2289	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2290	T(Timer(endpoint, "delayed ack"));
2291
2292	MutexLocker locker(endpoint->fLock);
2293	if (!locker.IsLocked())
2294		return;
2295
2296	// the timer might not have been canceled early enough
2297	if (endpoint->State() == CLOSED)
2298		return;
2299
2300	endpoint->SendAcknowledge(true);
2301}
2302
2303
2304/*static*/ void
2305TCPEndpoint::_TimeWaitTimer(net_timer* timer, void* _endpoint)
2306{
2307	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2308	T(Timer(endpoint, "time-wait"));
2309
2310	MutexLocker locker(endpoint->fLock);
2311	if (!locker.IsLocked())
2312		return;
2313
2314	if ((endpoint->fFlags & FLAG_CLOSED) == 0) {
2315		endpoint->fFlags |= FLAG_DELETE_ON_CLOSE;
2316		return;
2317	}
2318
2319	locker.Unlock();
2320
2321	gSocketModule->release_socket(endpoint->socket);
2322}
2323
2324
2325//	#pragma mark -
2326
2327
/*!	Prints a human-readable summary of this endpoint using kprintf():
	general state (state name, flags, accept semaphore, options), followed
	by the send side, the receive side, and finally the round trip/timeout
	and congestion values.
	The method is const and only reads members.
*/
void
TCPEndpoint::Dump() const
{
	// general endpoint state
	kprintf("TCP endpoint %p\n", this);
	kprintf("  state: %s\n", name_for_state(fState));
	kprintf("  flags: 0x%" B_PRIx32 "\n", fFlags);
#if KDEBUG
	// the lock holder is only printed when KDEBUG is set
	kprintf("  lock: { %p, holder: %" B_PRId32 " }\n", &fLock, fLock.holder);
#endif
	kprintf("  accept sem: %" B_PRId32 "\n", fAcceptSemaphore);
	kprintf("  options: 0x%" B_PRIx32 "\n", (uint32)fOptions);

	// send side: sequence numbers, window, and queue usage
	kprintf("  send\n");
	kprintf("    window shift: %u\n", fSendWindowShift);
	kprintf("    unacknowledged: %" B_PRIu32 "\n",
		fSendUnacknowledged.Number());
	kprintf("    next: %" B_PRIu32 "\n", fSendNext.Number());
	kprintf("    max: %" B_PRIu32 "\n", fSendMax.Number());
	kprintf("    urgent offset: %" B_PRIu32 "\n", fSendUrgentOffset.Number());
	kprintf("    window: %" B_PRIu32 "\n", fSendWindow);
	kprintf("    max window: %" B_PRIu32 "\n", fSendMaxWindow);
	kprintf("    max segment size: %" B_PRIu32 "\n", fSendMaxSegmentSize);
	// printed as "Used() / Size()" of the send queue
	kprintf("    queue: %lu / %lu\n", fSendQueue.Used(), fSendQueue.Size());
#if DEBUG_BUFFER_QUEUE
	// additionally dump the queue itself when DEBUG_BUFFER_QUEUE is set
	fSendQueue.Dump();
#endif
	kprintf("    last acknowledge sent: %" B_PRIu32 "\n",
		fLastAcknowledgeSent.Number());
	kprintf("    initial sequence: %" B_PRIu32 "\n",
		fInitialSendSequence.Number());

	// receive side: sequence numbers, window, and queue usage
	kprintf("  receive\n");
	kprintf("    window shift: %u\n", fReceiveWindowShift);
	kprintf("    next: %" B_PRIu32 "\n", fReceiveNext.Number());
	kprintf("    max advertised: %" B_PRIu32 "\n",
		fReceiveMaxAdvertised.Number());
	kprintf("    window: %" B_PRIu32 "\n", fReceiveWindow);
	kprintf("    max segment size: %" B_PRIu32 "\n", fReceiveMaxSegmentSize);
	// printed as "Available() / Size()" of the receive queue (the send
	// queue line above uses Used() instead)
	kprintf("    queue: %lu / %lu\n", fReceiveQueue.Available(),
		fReceiveQueue.Size());
#if DEBUG_BUFFER_QUEUE
	// additionally dump the queue itself when DEBUG_BUFFER_QUEUE is set
	fReceiveQueue.Dump();
#endif
	kprintf("    initial sequence: %" B_PRIu32 "\n",
		fInitialReceiveSequence.Number());
	kprintf("    duplicate acknowledge count: %" B_PRIu32 "\n",
		fDuplicateAcknowledgeCount);

	// round trip, retransmit timeout, and congestion values
	// NOTE(review): the units of these values are not visible here -- confirm
	// before interpreting the output.
	kprintf("  round trip time: %" B_PRId32 " (deviation %" B_PRId32 ")\n",
		fRoundTripTime, fRoundTripDeviation);
	kprintf("  retransmit timeout: %" B_PRId64 "\n", fRetransmitTimeout);
	kprintf("  congestion window: %" B_PRIu32 "\n", fCongestionWindow);
	kprintf("  slow start threshold: %" B_PRIu32 "\n", fSlowStartThreshold);
}
2379
2380