1/*
2 * Copyright 2006-2010, Haiku, Inc. All Rights Reserved.
3 * Distributed under the terms of the MIT License.
4 *
5 * Authors:
6 *		Andrew Galante, haiku.galante@gmail.com
 *		Axel Dörfler, axeld@pinc-software.de
8 *		Hugo Santos, hugosantos@gmail.com
9 */
10
11
12#include "TCPEndpoint.h"
13
14#include <netinet/in.h>
15#include <netinet/ip.h>
16#include <netinet/tcp.h>
17#include <new>
18#include <signal.h>
19#include <stdlib.h>
20#include <string.h>
21#include <stdint.h>
22
23#include <KernelExport.h>
24#include <Select.h>
25
26#include <net_buffer.h>
27#include <net_datalink.h>
28#include <net_stat.h>
29#include <NetBufferUtilities.h>
30#include <NetUtilities.h>
31
32#include <lock.h>
33#include <tracing.h>
34#include <util/AutoLock.h>
35#include <util/list.h>
36
37#include "EndpointManager.h"
38
39
40// References:
41//	- RFC 793 - Transmission Control Protocol
42//	- RFC 813 - Window and Acknowledgement Strategy in TCP
43//	- RFC 1337 - TIME_WAIT Assassination Hazards in TCP
44//
45// Things incomplete in this implementation:
46//	- TCP Extensions for High Performance, RFC 1323 - RTTM, PAWS
47//	- Congestion Control, RFC 5681
//	- Limited Transmit, RFC 3042
49//	- SACK, Selective Acknowledgment; RFC 2018, RFC 2883, RFC 6675
50//	- NewReno Modification to TCP's Fast Recovery, RFC 2582
51//
52// Things this implementation currently doesn't implement:
53//	- Explicit Congestion Notification (ECN), RFC 3168
54//	- SYN-Cache
55//	- Forward RTO-Recovery, RFC 4138
56//	- Time-Wait hash instead of keeping sockets alive
57
// Helper for debug output: renders \a address through a temporary
// AddressString created for this endpoint's Domain() and yields its Data().
// The AddressString only lives until the end of the enclosing full
// expression, so the result must be consumed right there (eg. as a
// printf-style argument).
// NOTE(review): the `true` argument presumably requests that the port is
// printed as well - confirm against AddressString.
#define PrintAddress(address) \
	AddressString(Domain(), address, true).Data()
60
61//#define TRACE_TCP
62//#define PROBE_TCP
63
// TRACE() emits a dprintf() line that is prefixed with the calling thread,
// the current system time, the endpoint pointer and the name of the
// endpoint's current state. Since it refers to `this` and `fState`, it can
// only be used from within TCPEndpoint member functions.
// Without TRACE_TCP it expands to an empty statement and its arguments are
// not evaluated.
#ifdef TRACE_TCP
// the space before ', ##args' is important in order for this to work with cpp 2.95
#	define TRACE(format, args...)	dprintf("%" B_PRId32 ": TCP [%" \
		B_PRIdBIGTIME "] %p (%12s) " format "\n", find_thread(NULL), \
		system_time(), this, name_for_state(fState) , ##args)
#else
#	define TRACE(args...)			do { } while (0)
#endif
72
// PROBE() prints a single "TCP PROBE" line via dprintf() consisting of the
// system time, source and destination address and size of \a buffer, and a
// snapshot of the endpoint's send side: next/unacknowledged sequence,
// congestion window, slow start threshold, the passed in \a window, the send
// window, the amount of data in flight (fSendMax - fSendUnacknowledged), the
// send queue fill levels and the retransmit timeout.
// It reads TCPEndpoint members directly and therefore only works inside
// TCPEndpoint member functions. Without PROBE_TCP it is an empty statement.
#ifdef PROBE_TCP
#	define PROBE(buffer, window) \
	dprintf("TCP PROBE %" B_PRIdBIGTIME " %s %s %" B_PRIu32 " snxt %" B_PRIu32 \
		" suna %" B_PRIu32 " cw %" B_PRIu32 " sst %" B_PRIu32 " win %" \
		B_PRIu32 " swin %" B_PRIu32 " smax-suna %" B_PRIu32 " savail %" \
		B_PRIuSIZE " sqused %" B_PRIuSIZE " rto %" B_PRIdBIGTIME "\n", \
		system_time(), PrintAddress(buffer->source), \
		PrintAddress(buffer->destination), buffer->size, fSendNext.Number(), \
		fSendUnacknowledged.Number(), fCongestionWindow, fSlowStartThreshold, \
		window, fSendWindow, (fSendMax - fSendUnacknowledged).Number(), \
		fSendQueue.Available(fSendNext), fSendQueue.Used(), fRetransmitTimeout)
#else
#	define PROBE(buffer, window)	do { } while (0)
#endif
87
88#if TCP_TRACING
89namespace TCPTracing {
90
91class Receive : public AbstractTraceEntry {
92public:
93	Receive(TCPEndpoint* endpoint, tcp_segment_header& segment, uint32 window,
94			net_buffer* buffer)
95		:
96		fEndpoint(endpoint),
97		fBuffer(buffer),
98		fBufferSize(buffer->size),
99		fSequence(segment.sequence),
100		fAcknowledge(segment.acknowledge),
101		fWindow(window),
102		fState(endpoint->State()),
103		fFlags(segment.flags)
104	{
105		Initialized();
106	}
107
108	virtual void AddDump(TraceOutput& out)
109	{
110		out.Print("tcp:%p (%12s) receive buffer %p (%" B_PRIu32 " bytes), "
111			"flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
112			", wnd %" B_PRIu32, fEndpoint, name_for_state(fState), fBuffer,
113			fBufferSize, fFlags, fSequence, fAcknowledge, fWindow);
114	}
115
116protected:
117	TCPEndpoint*	fEndpoint;
118	net_buffer*		fBuffer;
119	uint32			fBufferSize;
120	uint32			fSequence;
121	uint32			fAcknowledge;
122	uint32			fWindow;
123	tcp_state		fState;
124	uint8			fFlags;
125};
126
127class Send : public AbstractTraceEntry {
128public:
129	Send(TCPEndpoint* endpoint, tcp_segment_header& segment, net_buffer* buffer,
130			tcp_sequence firstSequence, tcp_sequence lastSequence)
131		:
132		fEndpoint(endpoint),
133		fBuffer(buffer),
134		fBufferSize(buffer->size),
135		fSequence(segment.sequence),
136		fAcknowledge(segment.acknowledge),
137		fFirstSequence(firstSequence.Number()),
138		fLastSequence(lastSequence.Number()),
139		fState(endpoint->State()),
140		fFlags(segment.flags)
141	{
142		Initialized();
143	}
144
145	virtual void AddDump(TraceOutput& out)
146	{
147		out.Print("tcp:%p (%12s) send buffer %p (%" B_PRIu32 " bytes), "
148			"flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
149			", first %" B_PRIu32 ", last %" B_PRIu32, fEndpoint,
150			name_for_state(fState), fBuffer, fBufferSize, fFlags, fSequence,
151			fAcknowledge, fFirstSequence, fLastSequence);
152	}
153
154protected:
155	TCPEndpoint*	fEndpoint;
156	net_buffer*		fBuffer;
157	uint32			fBufferSize;
158	uint32			fSequence;
159	uint32			fAcknowledge;
160	uint32			fFirstSequence;
161	uint32			fLastSequence;
162	tcp_state		fState;
163	uint8			fFlags;
164};
165
166class State : public AbstractTraceEntry {
167public:
168	State(TCPEndpoint* endpoint)
169		:
170		fEndpoint(endpoint),
171		fState(endpoint->State())
172	{
173		Initialized();
174	}
175
176	virtual void AddDump(TraceOutput& out)
177	{
178		out.Print("tcp:%p (%12s) state change", fEndpoint,
179			name_for_state(fState));
180	}
181
182protected:
183	TCPEndpoint*	fEndpoint;
184	tcp_state		fState;
185};
186
187class Spawn : public AbstractTraceEntry {
188public:
189	Spawn(TCPEndpoint* listeningEndpoint, TCPEndpoint* spawnedEndpoint)
190		:
191		fListeningEndpoint(listeningEndpoint),
192		fSpawnedEndpoint(spawnedEndpoint)
193	{
194		Initialized();
195	}
196
197	virtual void AddDump(TraceOutput& out)
198	{
199		out.Print("tcp:%p spawns %p", fListeningEndpoint, fSpawnedEndpoint);
200	}
201
202protected:
203	TCPEndpoint*	fListeningEndpoint;
204	TCPEndpoint*	fSpawnedEndpoint;
205};
206
207class Error : public AbstractTraceEntry {
208public:
209	Error(TCPEndpoint* endpoint, const char* error, int32 line)
210		:
211		fEndpoint(endpoint),
212		fLine(line),
213		fError(error),
214		fState(endpoint->State())
215	{
216		Initialized();
217	}
218
219	virtual void AddDump(TraceOutput& out)
220	{
221		out.Print("tcp:%p (%12s) error at line %" B_PRId32 ": %s", fEndpoint,
222			name_for_state(fState), fLine, fError);
223	}
224
225protected:
226	TCPEndpoint*	fEndpoint;
227	int32			fLine;
228	const char*		fError;
229	tcp_state		fState;
230};
231
232class TimerSet : public AbstractTraceEntry {
233public:
234	TimerSet(TCPEndpoint* endpoint, const char* which, bigtime_t timeout)
235		:
236		fEndpoint(endpoint),
237		fWhich(which),
238		fTimeout(timeout),
239		fState(endpoint->State())
240	{
241		Initialized();
242	}
243
244	virtual void AddDump(TraceOutput& out)
245	{
246		out.Print("tcp:%p (%12s) %s timer set to %" B_PRIdBIGTIME, fEndpoint,
247			name_for_state(fState), fWhich, fTimeout);
248	}
249
250protected:
251	TCPEndpoint*	fEndpoint;
252	const char*		fWhich;
253	bigtime_t		fTimeout;
254	tcp_state		fState;
255};
256
257class TimerTriggered : public AbstractTraceEntry {
258public:
259	TimerTriggered(TCPEndpoint* endpoint, const char* which)
260		:
261		fEndpoint(endpoint),
262		fWhich(which),
263		fState(endpoint->State())
264	{
265		Initialized();
266	}
267
268	virtual void AddDump(TraceOutput& out)
269	{
270		out.Print("tcp:%p (%12s) %s timer triggered", fEndpoint,
271			name_for_state(fState), fWhich);
272	}
273
274protected:
275	TCPEndpoint*	fEndpoint;
276	const char*		fWhich;
277	tcp_state		fState;
278};
279
280class APICall : public AbstractTraceEntry {
281public:
282	APICall(TCPEndpoint* endpoint, const char* which)
283		:
284		fEndpoint(endpoint),
285		fWhich(which),
286		fState(endpoint->State())
287	{
288		Initialized();
289	}
290
291	virtual void AddDump(TraceOutput& out)
292	{
293		out.Print("tcp:%p (%12s) api call: %s", fEndpoint,
294			name_for_state(fState), fWhich);
295	}
296
297protected:
298	TCPEndpoint*	fEndpoint;
299	const char*		fWhich;
300	tcp_state		fState;
301};
302
303}	// namespace TCPTracing
304
// T(x) creates a TCPTracing entry of kind x. The entry is allocated with
// new(std::nothrow), so a failed allocation does not throw. When TCP_TRACING
// is off, T(x) expands to nothing and its arguments are never evaluated.
#	define T(x)	new(std::nothrow) TCPTracing::x
#else
#	define T(x)
308#endif	// TCP_TRACING
309
310
// constants for the fFlags field
// The three FLAG_OPTION_* bits are all set by default in the TCPEndpoint
// constructor.
enum {
	FLAG_OPTION_WINDOW_SCALE	= 0x01,
	FLAG_OPTION_TIMESTAMP		= 0x02,
	// Set by Shutdown() for SHUT_RD/SHUT_RDWR; ReadData() then returns B_OK
	// instead of waiting for more data.
	// TODO: Should FLAG_NO_RECEIVE apply as well to received connections?
	//       That is, what is expected from accept() after a shutdown()
	//       is performed on a listen()ing socket.
	FLAG_NO_RECEIVE				= 0x04,
	// Set by Free().
	FLAG_CLOSED					= 0x08,
	// Set by _Close(); if it is set, Free() does not acquire an extra socket
	// reference for the time-wait period.
	FLAG_DELETE_ON_CLOSE		= 0x10,
	// Reported by IsLocal().
	FLAG_LOCAL					= 0x20,
	// NOTE(review): presumably marks loss recovery being in progress; it is
	// not used in the part of the file above the receive path - confirm.
	FLAG_RECOVERY				= 0x40,
	FLAG_OPTION_SACK_PERMITTED	= 0x80,
};
325
326
// Divisor applied to system_time() by tcp_now().
static const int kTimestampFactor = 1000;
	// conversion factor between usec system time and msec tcp time
329
330
331static inline bigtime_t
332absolute_timeout(bigtime_t timeout)
333{
334	if (timeout == 0 || timeout == B_INFINITE_TIMEOUT)
335		return timeout;
336
337	return timeout + system_time();
338}
339
340
341static inline status_t
342posix_error(status_t error)
343{
344	if (error == B_TIMED_OUT)
345		return B_WOULD_BLOCK;
346
347	return error;
348}
349
350
351static inline bool
352in_window(const tcp_sequence& sequence, const tcp_sequence& receiveNext,
353	uint32 receiveWindow)
354{
355	return sequence >= receiveNext && sequence < (receiveNext + receiveWindow);
356}
357
358
359static inline bool
360segment_in_sequence(const tcp_segment_header& segment, int size,
361	const tcp_sequence& receiveNext, uint32 receiveWindow)
362{
363	tcp_sequence sequence(segment.sequence);
364	if (size == 0) {
365		if (receiveWindow == 0)
366			return sequence == receiveNext;
367		return in_window(sequence, receiveNext, receiveWindow);
368	} else {
369		if (receiveWindow == 0)
370			return false;
371		return in_window(sequence, receiveNext, receiveWindow)
372			|| in_window(sequence + size - 1, receiveNext, receiveWindow);
373	}
374}
375
376
377static inline bool
378is_writable(tcp_state state)
379{
380	return state == ESTABLISHED || state == FINISH_RECEIVED;
381}
382
383
384static inline bool
385is_establishing(tcp_state state)
386{
387	return state == SYNCHRONIZE_SENT || state == SYNCHRONIZE_RECEIVED;
388}
389
390
391static inline uint32 tcp_now()
392{
393	return system_time() / kTimestampFactor;
394}
395
396
397static inline uint32 tcp_diff_timestamp(uint32 base)
398{
399	uint32 now = tcp_now();
400
401	if (now > base)
402		return now - base;
403
404	return now + UINT_MAX - base;
405}
406
407
408static inline bool
409state_needs_finish(int32 state)
410{
411	return state == WAIT_FOR_FINISH_ACKNOWLEDGE
412		|| state == FINISH_SENT || state == CLOSING;
413}
414
415
416//	#pragma mark -
417
418
419TCPEndpoint::TCPEndpoint(net_socket* socket)
420	:
421	ProtocolSocket(socket),
422	fManager(NULL),
423	fOptions(0),
424	fSendWindowShift(0),
425	fReceiveWindowShift(0),
426	fSendUnacknowledged(0),
427	fSendNext(0),
428	fSendMax(0),
429	fSendUrgentOffset(0),
430	fSendWindow(0),
431	fSendMaxWindow(0),
432	fSendMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
433	fSendMaxSegments(0),
434	fSendQueue(socket->send.buffer_size),
435	fInitialSendSequence(0),
436	fPreviousHighestAcknowledge(0),
437	fDuplicateAcknowledgeCount(0),
438	fPreviousFlightSize(0),
439	fRecover(0),
440	fRoute(NULL),
441	fReceiveNext(0),
442	fReceiveMaxAdvertised(0),
443	fReceiveWindow(socket->receive.buffer_size),
444	fReceiveMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
445	fReceiveQueue(socket->receive.buffer_size),
446	fSmoothedRoundTripTime(0),
447	fRoundTripVariation(0),
448	fSendTime(0),
449	fRoundTripStartSequence(0),
450	fRetransmitTimeout(TCP_INITIAL_RTT),
451	fReceivedTimestamp(0),
452	fCongestionWindow(0),
453	fSlowStartThreshold(0),
454	fState(CLOSED),
455	fFlags(FLAG_OPTION_WINDOW_SCALE | FLAG_OPTION_TIMESTAMP | FLAG_OPTION_SACK_PERMITTED)
456{
457	// TODO: to be replaced with a real read/write locking strategy!
458	mutex_init(&fLock, "tcp lock");
459
460	fReceiveCondition.Init(this, "tcp receive");
461	fSendCondition.Init(this, "tcp send");
462
463	gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer, this);
464	gStackModule->init_timer(&fRetransmitTimer, TCPEndpoint::_RetransmitTimer,
465		this);
466	gStackModule->init_timer(&fDelayedAcknowledgeTimer,
467		TCPEndpoint::_DelayedAcknowledgeTimer, this);
468	gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer,
469		this);
470
471	T(APICall(this, "constructor"));
472}
473
474
/*!	Tears the endpoint down: cancels all of its timers, unbinds it from the
	endpoint manager and puts the manager reference back, destroys the lock,
	waits for the timer hooks to return, and finally puts the route.
*/
TCPEndpoint::~TCPEndpoint()
{
	// The lock is acquired here and not released again; it is destroyed
	// while held further below.
	mutex_lock(&fLock);

	T(APICall(this, "destructor"));

	_CancelConnectionTimers();
	gStackModule->cancel_timer(&fTimeWaitTimer);
	T(TimerSet(this, "time-wait", -1));
		// a timeout of -1 denotes a cancelled timer in the trace

	// fManager is only set by a successful Open()
	if (fManager != NULL) {
		fManager->Unbind(this);
		put_endpoint_manager(fManager);
	}

	mutex_destroy(&fLock);

	// we need to wait for all timers to return
	gStackModule->wait_for_timer(&fRetransmitTimer);
	gStackModule->wait_for_timer(&fPersistTimer);
	gStackModule->wait_for_timer(&fDelayedAcknowledgeTimer);
	gStackModule->wait_for_timer(&fTimeWaitTimer);

	// NOTE(review): fRoute is still NULL if no send path was ever prepared;
	// put_route() presumably copes with that - confirm.
	gDatalinkModule->put_route(Domain(), fRoute);
}
500
501
//!	Always returns B_OK.
status_t
TCPEndpoint::InitCheck() const
{
	return B_OK;
}
507
508
509//	#pragma mark - protocol API
510
511
512status_t
513TCPEndpoint::Open()
514{
515	TRACE("Open()");
516	T(APICall(this, "open"));
517
518	status_t status = ProtocolSocket::Open();
519	if (status < B_OK)
520		return status;
521
522	fManager = get_endpoint_manager(Domain());
523	if (fManager == NULL)
524		return EAFNOSUPPORT;
525
526	return B_OK;
527}
528
529
/*!	Closes the endpoint.
	A listening endpoint gets its accept semaphore deleted; endpoints in the
	LISTEN or SYNCHRONIZE_SENT state go straight to CLOSED. If SO_LINGER is
	set with a linger time of zero, the state is set to CLOSED and
	_SendQueued(true) is invoked right away. Otherwise _Disconnect() is
	called and, if SO_LINGER is set, the call waits on the send condition
	until the send queue is empty or the linger time is over.
	\return B_OK on success, otherwise the error returned by _SendQueued(),
		_Disconnect(), or by waiting on the send condition.
*/
status_t
TCPEndpoint::Close()
{
	MutexLocker locker(fLock);

	TRACE("Close()");
	T(APICall(this, "close"));

	// the accept semaphore only exists while listening (see Listen())
	if (fState == LISTEN)
		delete_sem(fAcceptSemaphore);

	if (fState == SYNCHRONIZE_SENT || fState == LISTEN) {
		// TODO: what about linger in case of SYNCHRONIZE_SENT?
		fState = CLOSED;
		T(State(this));
		return B_OK;
	}

	// handle linger with zero timeout
	if ((socket->options & SO_LINGER) != 0 && socket->linger == 0) {
		fState = CLOSED;
		T(State(this));
		return _SendQueued(true);
	}

	status_t status = _Disconnect(true);
	if (status != B_OK)
		return status;

	if ((socket->options & SO_LINGER) != 0) {
		TRACE("Close(): Lingering for %i secs", socket->linger);

		// socket->linger is given in seconds, timeouts are in microseconds
		bigtime_t maximum = absolute_timeout(socket->linger * 1000000LL);

		while (fSendQueue.Used() > 0) {
			status = _WaitForCondition(fSendCondition, locker, maximum);
			// running out of linger time is not an error; we just stop
			// waiting
			if (status == B_TIMED_OUT || status == B_WOULD_BLOCK)
				break;
			else if (status < B_OK)
				return status;
		}

		TRACE("Close(): after waiting, the SendQ was left with %" B_PRIuSIZE
			" bytes.", fSendQueue.Used());
	}
	return B_OK;
}
577
578
579void
580TCPEndpoint::Free()
581{
582	MutexLocker _(fLock);
583
584	TRACE("Free()");
585	T(APICall(this, "free"));
586
587	if (fState <= SYNCHRONIZE_SENT)
588		return;
589
590	// we are only interested in the timer, not in changing state
591	_EnterTimeWait();
592
593	fFlags |= FLAG_CLOSED;
594	if ((fFlags & FLAG_DELETE_ON_CLOSE) == 0) {
595		// we'll be freed later when the 2MSL timer expires
596		gSocketModule->acquire_socket(socket);
597	}
598}
599
600
/*!	Creates and sends a synchronize packet to \a address, and then waits
	until the connection has been established or refused.

	An empty \a address is replaced by the loopback address (the port is
	kept). For a restarted syscall, only the wait for the connection is
	resumed with the timeout that was stored by the original call.

	\return B_OK when the connection is established, EAFNOSUPPORT if
		\a address belongs to another address family, EISCONN if the endpoint
		is already connected, EALREADY if it is in any state other than
		CLOSED, LISTEN, or ESTABLISHED, EINPROGRESS if the resulting timeout
		is zero (non-blocking socket), or the error of preparing the send
		path, sending the SYN, or waiting for the connection (with
		B_TIMED_OUT being reported as B_WOULD_BLOCK).
*/
status_t
TCPEndpoint::Connect(const sockaddr* address)
{
	if (!AddressModule()->is_same_family(address))
		return EAFNOSUPPORT;

	MutexLocker locker(fLock);

	TRACE("Connect() on address %s", PrintAddress(address));
	T(APICall(this, "connect"));

	if (gStackModule->is_restarted_syscall()) {
		// The handshake was already started by the interrupted call; just
		// continue waiting with the absolute timeout stored back then.
		bigtime_t timeout = gStackModule->restore_syscall_restart_timeout();
		status_t status = _WaitForEstablished(locker, timeout);
		TRACE("  Connect(): Connection complete: %s (timeout was %"
			B_PRIdBIGTIME ")", strerror(status), timeout);
		return posix_error(status);
	}

	// Can only call connect() from CLOSED or LISTEN states
	// otherwise endpoint is considered already connected
	if (fState == LISTEN) {
		// this socket is about to connect; remove pending connections in the backlog
		gSocketModule->set_max_backlog(socket, 0);
	} else if (fState == ESTABLISHED) {
		return EISCONN;
	} else if (fState != CLOSED)
		return EALREADY;

	// consider destination address INADDR_ANY as INADDR_LOOPBACK
	sockaddr_storage _address;
	if (AddressModule()->is_empty_address(address, false)) {
		AddressModule()->get_loopback_address((sockaddr *)&_address);
		// for IPv4 and IPv6 the port is at the same offset
		((sockaddr_in &)_address).sin_port = ((sockaddr_in *)address)->sin_port;
		address = (sockaddr *)&_address;
	}

	status_t status = _PrepareSendPath(address);
	if (status < B_OK)
		return status;

	TRACE("  Connect(): starting 3-way handshake...");

	fState = SYNCHRONIZE_SENT;
	T(State(this));

	// send SYN
	status = _SendAcknowledge();
	if (status != B_OK) {
		_Close();
		return status;
	}

	// If we are running over Loopback, we may be in ESTABLISHED already
	// by the time the SYN has been sent.
	if (fState == ESTABLISHED) {
		TRACE("  Connect() completed after _SendQueued()");
		return B_OK;
	}

	// wait until 3-way handshake is complete (if needed)
	bigtime_t timeout = min_c(socket->send.timeout, TCP_CONNECTION_TIMEOUT);
	if (timeout == 0) {
		// we're a non-blocking socket
		TRACE("  Connect() delayed, return EINPROGRESS");
		return EINPROGRESS;
	}

	// remember the absolute timeout, so that a restarted syscall does not
	// start over with the full timeout
	bigtime_t absoluteTimeout = absolute_timeout(timeout);
	gStackModule->store_syscall_restart_timeout(absoluteTimeout);

	status = _WaitForEstablished(locker, absoluteTimeout);
	TRACE("  Connect(): Connection complete: %s (timeout was %" B_PRIdBIGTIME
		")", strerror(status), timeout);
	return posix_error(status);
}
681
682
/*!	Waits on the accept semaphore until a connected socket can be dequeued
	from this socket, and returns it in \a _acceptedSocket.
	The wait uses the socket's receive timeout (turned into an absolute
	timeout, which is preserved across syscall restarts) and can be
	interrupted.
	\return B_OK on success, B_WOULD_BLOCK if the receive timeout is zero and
		nothing is available, otherwise the error of acquiring the
		semaphore.
*/
status_t
TCPEndpoint::Accept(struct net_socket** _acceptedSocket)
{
	MutexLocker locker(fLock);

	TRACE("Accept()");
	T(APICall(this, "accept"));

	status_t status;
	bigtime_t timeout = absolute_timeout(socket->receive.timeout);
	if (gStackModule->is_restarted_syscall())
		timeout = gStackModule->restore_syscall_restart_timeout();
	else
		gStackModule->store_syscall_restart_timeout(timeout);

	do {
		// the endpoint lock must not be held while blocking on the semaphore
		locker.Unlock();

		status = acquire_sem_etc(fAcceptSemaphore, 1, B_ABSOLUTE_TIMEOUT
			| B_CAN_INTERRUPT, timeout);
		if (status != B_OK) {
			// The locker is unlocked at this point, which is fine for
			// returning.
			if (status == B_TIMED_OUT && socket->receive.timeout == 0)
				return B_WOULD_BLOCK;

			return status;
		}

		locker.Lock();
		status = gSocketModule->dequeue_connected(socket, _acceptedSocket);
#ifdef TRACE_TCP
		if (status == B_OK)
			TRACE("  Accept() returning %p", (*_acceptedSocket)->first_protocol);
#endif
		// if nothing could be dequeued, wait for the semaphore again
	} while (status != B_OK);

	return status;
}
720
721
722status_t
723TCPEndpoint::Bind(const sockaddr *address)
724{
725	if (address == NULL)
726		return B_BAD_VALUE;
727
728	MutexLocker lock(fLock);
729
730	TRACE("Bind() on address %s", PrintAddress(address));
731	T(APICall(this, "bind"));
732
733	if (fState != CLOSED)
734		return EISCONN;
735
736	return fManager->Bind(this, address);
737}
738
739
740status_t
741TCPEndpoint::Unbind(struct sockaddr *address)
742{
743	MutexLocker _(fLock);
744
745	TRACE("Unbind()");
746	T(APICall(this, "unbind"));
747
748	return fManager->Unbind(this);
749}
750
751
752status_t
753TCPEndpoint::Listen(int count)
754{
755	MutexLocker _(fLock);
756
757	TRACE("Listen()");
758	T(APICall(this, "listen"));
759
760	if (fState != CLOSED && fState != LISTEN)
761		return B_BAD_VALUE;
762
763	if (fState == CLOSED) {
764		fAcceptSemaphore = create_sem(0, "tcp accept");
765		if (fAcceptSemaphore < B_OK)
766			return ENOBUFS;
767
768		status_t status = fManager->SetPassive(this);
769		if (status != B_OK) {
770			delete_sem(fAcceptSemaphore);
771			fAcceptSemaphore = -1;
772			return status;
773		}
774	}
775
776	gSocketModule->set_max_backlog(socket, count);
777
778	fState = LISTEN;
779	T(State(this));
780	return B_OK;
781}
782
783
784status_t
785TCPEndpoint::Shutdown(int direction)
786{
787	MutexLocker lock(fLock);
788
789	TRACE("Shutdown(%i)", direction);
790	T(APICall(this, "shutdown"));
791
792	if (direction == SHUT_RD || direction == SHUT_RDWR)
793		fFlags |= FLAG_NO_RECEIVE;
794
795	if (direction == SHUT_WR || direction == SHUT_RDWR) {
796		// TODO: That's not correct. After read/write shutting down the socket
797		// one should still be able to read previously arrived data.
798		_Disconnect(false);
799	}
800
801	return B_OK;
802}
803
804
/*!	Puts data contained in \a buffer into send buffer.

	The data is added to the send queue, waiting for free space as needed
	(unless MSG_DONTWAIT is set in the buffer's flags, in which case the
	timeout is zero). If the queue cannot take all of the data at once, the
	buffer is split: a trimmed clone is queued, and the part that was queued
	is removed from the front of \a buffer. Afterwards, sending is triggered;
	MSG_OOB marks the end of the queue as urgent offset and forces the send,
	MSG_EOF additionally disconnects the sending side.

	\return B_OK on success, EOPNOTSUPP for unsupported flags, ENOTCONN in
		CLOSED state, EDESTADDRREQ in LISTEN state, EPIPE if the state is
		neither writable nor establishing, ENOBUFS if the buffer could not be
		cloned, or the error of trimming the clone or of waiting for space
		(with B_TIMED_OUT being reported as B_WOULD_BLOCK).
*/
status_t
TCPEndpoint::SendData(net_buffer *buffer)
{
	MutexLocker lock(fLock);

	TRACE("SendData(buffer %p, size %" B_PRIu32 ", flags %#" B_PRIx32
		") [total %" B_PRIuSIZE " bytes, has %" B_PRIuSIZE "]", buffer,
		buffer->size, buffer->flags, fSendQueue.Size(), fSendQueue.Free());
	T(APICall(this, "senddata"));

	// The flags are copied up front, as \a buffer may be handed over to the
	// send queue below.
	const uint32 flags = buffer->flags;
	if ((flags & ~(MSG_DONTWAIT | MSG_OOB | MSG_EOF)) != 0)
		return EOPNOTSUPP;

	if (fState == CLOSED)
		return ENOTCONN;
	if (fState == LISTEN)
		return EDESTADDRREQ;
	if (!is_writable(fState) && !is_establishing(fState))
		return EPIPE;

	// number of bytes that still need to be queued
	size_t left = buffer->size;

	// A timeout of zero (MSG_DONTWAIT) means not to wait at all; otherwise
	// the absolute timeout is preserved across syscall restarts.
	bigtime_t timeout = 0;
	if ((flags & MSG_DONTWAIT) == 0) {
		timeout = absolute_timeout(socket->send.timeout);
		if (gStackModule->is_restarted_syscall())
			timeout = gStackModule->restore_syscall_restart_timeout();
		else
			gStackModule->store_syscall_restart_timeout(timeout);
	}

	while (left > 0) {
		while (fSendQueue.Free() < socket->send.low_water_mark) {
			// initiate a send before waiting
			_SendQueued();

			// wait until enough space is available
			status_t status = _WaitForCondition(fSendCondition, lock, timeout);
			if (status < B_OK) {
				TRACE("  SendData() returning %s (%d)",
					strerror(posix_error(status)), (int)posix_error(status));
				return posix_error(status);
			}

			// the state may have changed while we were waiting
			if (!is_writable(fState) && !is_establishing(fState))
				return EPIPE;
		}

		size_t size = fSendQueue.Free();
		if (size < left) {
			// we need to split the original buffer
			net_buffer* clone = gBufferModule->clone(buffer, false);
				// TODO: add offset/size parameter to net_buffer::clone() or
				// even a move_data() function, as this is a bit inefficient
			if (clone == NULL)
				return ENOBUFS;

			// the clone keeps the first \c size bytes only
			status_t status = gBufferModule->trim(clone, size);
			if (status != B_OK) {
				gBufferModule->free(clone);
				return status;
			}

			// ... while those bytes are removed from the original buffer
			gBufferModule->remove_header(buffer, size);
			left -= size;
			fSendQueue.Add(clone);
		} else {
			// the remainder fits completely; queue the buffer itself
			left -= buffer->size;
			fSendQueue.Add(buffer);
		}
	}

	TRACE("  SendData(): %" B_PRIuSIZE " bytes used.", fSendQueue.Used());

	bool force = false;
	if ((flags & MSG_OOB) != 0) {
		fSendUrgentOffset = fSendQueue.LastSequence();
			// RFC 961 specifies that the urgent offset points to the last
			// byte of urgent data. However, this is commonly implemented as
			// here, ie. it points to the first byte after the urgent data.
		force = true;
	}
	if ((flags & MSG_EOF) != 0)
		_Disconnect(false);

	_SendQueued(force);

	return B_OK;
}
896
897
898ssize_t
899TCPEndpoint::SendAvailable()
900{
901	MutexLocker locker(fLock);
902
903	ssize_t available;
904
905	if (is_writable(fState))
906		available = fSendQueue.Free();
907	else if (is_establishing(fState))
908		available = 0;
909	else
910		available = EPIPE;
911
912	TRACE("SendAvailable(): %" B_PRIdSSIZE, available);
913	T(APICall(this, "sendavailable"));
914	return available;
915}
916
917
918status_t
919TCPEndpoint::FillStat(net_stat *stat)
920{
921	MutexLocker _(fLock);
922
923	strlcpy(stat->state, name_for_state(fState), sizeof(stat->state));
924	stat->receive_queue_size = fReceiveQueue.Available();
925	stat->send_queue_size = fSendQueue.Used();
926
927	return B_OK;
928}
929
930
/*!	Retrieves up to \a numBytes bytes from the receive queue and returns them
	in \a _buffer.

	If the connection is still being established, the function waits for
	that first. It then waits until enough data is available: the socket's
	receive low water mark, or \a numBytes if MSG_WAITALL is given. Pushed
	data is delivered early. With MSG_PEEK, the data is left in the queue.
	MSG_DONTWAIT turns the timeout into zero.

	\return The result of the receive queue's Get() when data is delivered.
		B_OK with \a _buffer set to \c NULL when the connection is closing
		and no data is left, or when receiving has been shut down.
		EOPNOTSUPP for unsupported flags, the socket's error or ENOTCONN in
		CLOSED/LISTEN state, B_WOULD_BLOCK if no data is available and the
		timeout is zero, or the error of waiting (with B_TIMED_OUT being
		reported as B_WOULD_BLOCK).
*/
status_t
TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer)
{
	if ((flags & ~(MSG_DONTWAIT | MSG_WAITALL | MSG_PEEK)) != 0)
		return EOPNOTSUPP;

	MutexLocker locker(fLock);

	TRACE("ReadData(%" B_PRIuSIZE " bytes, flags %#" B_PRIx32 ")", numBytes,
		flags);
	T(APICall(this, "readdata"));

	*_buffer = NULL;

	if (fState == CLOSED || fState == LISTEN) {
		// a pending socket error takes precedence over ENOTCONN
		if (socket->error != B_OK)
			return socket->error;
		return ENOTCONN;
	}

	// A timeout of zero (MSG_DONTWAIT) means not to wait at all; otherwise
	// the absolute timeout is preserved across syscall restarts.
	bigtime_t timeout = 0;
	if ((flags & MSG_DONTWAIT) == 0) {
		timeout = absolute_timeout(socket->receive.timeout);
		if (gStackModule->is_restarted_syscall())
			timeout = gStackModule->restore_syscall_restart_timeout();
		else
			gStackModule->store_syscall_restart_timeout(timeout);
	}

	if (fState == SYNCHRONIZE_SENT || fState == SYNCHRONIZE_RECEIVED) {
		if (flags & MSG_DONTWAIT)
			return B_WOULD_BLOCK;

		status_t status = _WaitForEstablished(locker, timeout);
		if (status < B_OK)
			return posix_error(status);
	}

	size_t dataNeeded = socket->receive.low_water_mark;

	// When MSG_WAITALL is set then the function should block
	// until the full amount of data can be returned.
	if (flags & MSG_WAITALL)
		dataNeeded = numBytes;

	// TODO: add support for urgent data (MSG_OOB)

	// Wait until there is enough data to deliver; leaving the loop via
	// "break" means that data is to be retrieved from the queue.
	while (true) {
		if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE
			|| fState == TIME_WAIT) {
			// ``Connection closing''.
			if (fReceiveQueue.Available() > 0)
				break;
			return B_OK;
		}

		if (fReceiveQueue.Available() > 0) {
			// deliver if there is enough data, or if all of the available
			// data has been pushed
			if (fReceiveQueue.Available() >= dataNeeded
				|| (fReceiveQueue.PushedData() > 0
					&& fReceiveQueue.PushedData() >= fReceiveQueue.Available()))
				break;
		} else if (fState == FINISH_RECEIVED) {
			// ``If no text is awaiting delivery, the RECEIVE will
			//   get a Connection closing''.
			return B_OK;
		}

		if (timeout == 0)
			return B_WOULD_BLOCK;

		if ((fFlags & FLAG_NO_RECEIVE) != 0)
			return B_OK;

		status_t status = _WaitForCondition(fReceiveCondition, locker, timeout);
		if (status < B_OK) {
			// The Open Group base specification mentions that EINTR should be
			// returned if the recv() is interrupted before _any data_ is
			// available. So we actually check if there is data, and if so,
			// push it to the user.
			if ((status == B_TIMED_OUT || status == B_INTERRUPTED)
				&& fReceiveQueue.Available() > 0)
				break;

			return posix_error(status);
		}
	}

	TRACE("  ReadData(): %" B_PRIuSIZE " are available.",
		fReceiveQueue.Available());

	// there is more data available than requested; wake other waiters on the
	// receive condition
	if (numBytes < fReceiveQueue.Available())
		fReceiveCondition.NotifyAll();

	// when peeking, the data is not removed from the queue
	bool clone = (flags & MSG_PEEK) != 0;

	ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer);

	TRACE("  ReadData(): %" B_PRIuSIZE " bytes kept.",
		fReceiveQueue.Available());

	if (fReceiveQueue.Available() == 0 && fState == FINISH_RECEIVED)
		socket->receive.low_water_mark = 0;

	// if we are opening the window, check if we should send an ACK
	if (!clone)
		_SendAcknowledge();

	return receivedBytes;
}
1040
1041
1042ssize_t
1043TCPEndpoint::ReadAvailable()
1044{
1045	MutexLocker locker(fLock);
1046
1047	TRACE("ReadAvailable(): %" B_PRIdSSIZE, _AvailableData());
1048	T(APICall(this, "readavailable"));
1049
1050	return _AvailableData();
1051}
1052
1053
1054status_t
1055TCPEndpoint::SetSendBufferSize(size_t length)
1056{
1057	MutexLocker _(fLock);
1058	fSendQueue.SetMaxBytes(length);
1059	return B_OK;
1060}
1061
1062
1063status_t
1064TCPEndpoint::SetReceiveBufferSize(size_t length)
1065{
1066	MutexLocker _(fLock);
1067	fReceiveQueue.SetMaxBytes(length);
1068	return B_OK;
1069}
1070
1071
1072status_t
1073TCPEndpoint::GetOption(int option, void* _value, int* _length)
1074{
1075	if (*_length != sizeof(int))
1076		return B_BAD_VALUE;
1077
1078	int* value = (int*)_value;
1079
1080	switch (option) {
1081		case TCP_NODELAY:
1082			if ((fOptions & TCP_NODELAY) != 0)
1083				*value = 1;
1084			else
1085				*value = 0;
1086			return B_OK;
1087
1088		case TCP_MAXSEG:
1089			*value = fReceiveMaxSegmentSize;
1090			return B_OK;
1091
1092		default:
1093			return B_BAD_VALUE;
1094	}
1095}
1096
1097
1098status_t
1099TCPEndpoint::SetOption(int option, const void* _value, int length)
1100{
1101	if (option != TCP_NODELAY)
1102		return B_BAD_VALUE;
1103
1104	if (length != sizeof(int))
1105		return B_BAD_VALUE;
1106
1107	const int* value = (const int*)_value;
1108
1109	MutexLocker _(fLock);
1110	if (*value)
1111		fOptions |= TCP_NODELAY;
1112	else
1113		fOptions &= ~TCP_NODELAY;
1114
1115	return B_OK;
1116}
1117
1118
1119//	#pragma mark - misc
1120
1121
1122bool
1123TCPEndpoint::IsBound() const
1124{
1125	return !LocalAddress().IsEmpty(true);
1126}
1127
1128
1129bool
1130TCPEndpoint::IsLocal() const
1131{
1132	return (fFlags & FLAG_LOCAL) != 0;
1133}
1134
1135
1136status_t
1137TCPEndpoint::DelayedAcknowledge()
1138{
1139	// ACKs "MUST" be generated within 500ms of the first unACKed packet, and
1140	// "SHOULD" be for at least every second full-size segment. (RFC 5681 �� 4.2)
1141
1142	bigtime_t delay = TCP_DELAYED_ACKNOWLEDGE_TIMEOUT;
1143	if ((fReceiveNext - fLastAcknowledgeSent) >= (fReceiveMaxSegmentSize * 2)) {
1144		// Trigger an immediate timeout rather than invoking Send directly,
1145		// allowing multiple invocations to be coalesced.
1146		delay = 0;
1147	} else if (gStackModule->is_timer_active(&fDelayedAcknowledgeTimer)) {
1148		return B_OK;
1149	}
1150
1151	gStackModule->set_timer(&fDelayedAcknowledgeTimer, delay);
1152	T(TimerSet(this, "delayed ack", TCP_DELAYED_ACKNOWLEDGE_TIMEOUT));
1153	return B_OK;
1154}
1155
1156
//!	Sets the persist timer to TCP_PERSIST_TIMEOUT and traces that.
void
TCPEndpoint::_StartPersistTimer()
{
	gStackModule->set_timer(&fPersistTimer, TCP_PERSIST_TIMEOUT);
	T(TimerSet(this, "persist", TCP_PERSIST_TIMEOUT));
}
1163
1164
1165void
1166TCPEndpoint::_EnterTimeWait()
1167{
1168	TRACE("_EnterTimeWait()");
1169
1170	if (fState == TIME_WAIT) {
1171		_CancelConnectionTimers();
1172	}
1173
1174	_UpdateTimeWait();
1175}
1176
1177
1178void
1179TCPEndpoint::_UpdateTimeWait()
1180{
1181	gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1);
1182	T(TimerSet(this, "time-wait", TCP_MAX_SEGMENT_LIFETIME << 1));
1183}
1184
1185
/*!	Cancels the retransmit, persist, and delayed acknowledge timers. The
	time-wait timer is not touched. Each cancellation is recorded as a
	TimerSet trace entry with a timeout of -1.
*/
void
TCPEndpoint::_CancelConnectionTimers()
{
	gStackModule->cancel_timer(&fRetransmitTimer);
	T(TimerSet(this, "retransmit", -1));
	gStackModule->cancel_timer(&fPersistTimer);
	T(TimerSet(this, "persist", -1));
	gStackModule->cancel_timer(&fDelayedAcknowledgeTimer);
	T(TimerSet(this, "delayed ack", -1));
}
1196
1197
1198/*!	Sends the FIN flag to the peer when the connection is still open.
1199	Moves the endpoint to the next state depending on where it was.
1200*/
1201status_t
1202TCPEndpoint::_Disconnect(bool closing)
1203{
1204	tcp_state previousState = fState;
1205
1206	if (fState == SYNCHRONIZE_RECEIVED || fState == ESTABLISHED)
1207		fState = FINISH_SENT;
1208	else if (fState == FINISH_RECEIVED)
1209		fState = WAIT_FOR_FINISH_ACKNOWLEDGE;
1210	else
1211		return B_OK;
1212
1213	T(State(this));
1214
1215	status_t status = _SendQueued();
1216	if (status != B_OK) {
1217		fState = previousState;
1218		T(State(this));
1219		return status;
1220	}
1221
1222	return B_OK;
1223}
1224
1225
/*!	Switches the endpoint to the ESTABLISHED state, marks the socket as
	connected, and wakes up everyone interested: the accept semaphore is
	released if the socket has a parent, all waiters on the send condition
	are notified, and B_SELECT_WRITE is signalled with the free space of the
	send queue.
*/
void
TCPEndpoint::_MarkEstablished()
{
	fState = ESTABLISHED;
	T(State(this));

	gSocketModule->set_connected(socket);
	// A socket with a parent has not been accepted yet (see _Close()).
	if (gSocketModule->has_parent(socket))
		release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE);

	// _WaitForEstablished() waits on the send condition
	fSendCondition.NotifyAll();
	gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Free());
}
1239
1240
1241status_t
1242TCPEndpoint::_WaitForEstablished(MutexLocker &locker, bigtime_t timeout)
1243{
1244	// TODO: Checking for CLOSED seems correct, but breaks several neon tests.
1245	// When investigating this, also have a look at _Close() and _HandleReset().
1246	while (fState < ESTABLISHED/* && fState != CLOSED*/) {
1247		if (socket->error != B_OK)
1248			return socket->error;
1249
1250		status_t status = _WaitForCondition(fSendCondition, locker, timeout);
1251		if (status < B_OK)
1252			return status;
1253	}
1254
1255	return B_OK;
1256}
1257
1258
1259//	#pragma mark - receive
1260
1261
1262void
1263TCPEndpoint::_Close()
1264{
1265	_CancelConnectionTimers();
1266	fState = CLOSED;
1267	T(State(this));
1268
1269	fFlags |= FLAG_DELETE_ON_CLOSE;
1270
1271	fSendCondition.NotifyAll();
1272	_NotifyReader();
1273
1274	if (gSocketModule->has_parent(socket)) {
1275		// We still have a parent - obviously, we haven't been accepted yet,
1276		// so no one could ever close us.
1277		_CancelConnectionTimers();
1278		gSocketModule->set_aborted(socket);
1279	}
1280}
1281
1282
1283void
1284TCPEndpoint::_HandleReset(status_t error)
1285{
1286	socket->error = error;
1287	_Close();
1288
1289	gSocketModule->notify(socket, B_SELECT_WRITE, error);
1290	gSocketModule->notify(socket, B_SELECT_ERROR, error);
1291}
1292
1293
1294void
1295TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment)
1296{
1297	if (fDuplicateAcknowledgeCount == 0)
1298		fPreviousFlightSize = (fSendMax - fSendUnacknowledged).Number();
1299
1300	if (++fDuplicateAcknowledgeCount < 3) {
1301		if (fSendQueue.Available(fSendMax) != 0  && fSendWindow != 0) {
1302			fSendNext = fSendMax;
1303			fCongestionWindow += fDuplicateAcknowledgeCount * fSendMaxSegmentSize;
1304			_SendQueued();
1305			TRACE("_DuplicateAcknowledge(): packet sent under limited transmit on receipt of dup ack");
1306			fCongestionWindow -= fDuplicateAcknowledgeCount * fSendMaxSegmentSize;
1307		}
1308	}
1309
1310	if (fDuplicateAcknowledgeCount == 3) {
1311		if ((segment.acknowledge - 1) > fRecover || (fCongestionWindow > fSendMaxSegmentSize &&
1312			(fSendUnacknowledged - fPreviousHighestAcknowledge) <= 4 * fSendMaxSegmentSize)) {
1313			fFlags |= FLAG_RECOVERY;
1314			fRecover = fSendMax.Number() - 1;
1315			fSlowStartThreshold = max_c(fPreviousFlightSize / 2, 2 * fSendMaxSegmentSize);
1316			fCongestionWindow = fSlowStartThreshold + 3 * fSendMaxSegmentSize;
1317			fSendNext = segment.acknowledge;
1318			_SendQueued();
1319			TRACE("_DuplicateAcknowledge(): packet sent under fast restransmit on the receipt of 3rd dup ack");
1320		}
1321	} else if (fDuplicateAcknowledgeCount > 3) {
1322		uint32 flightSize = (fSendMax - fSendUnacknowledged).Number();
1323		if ((fDuplicateAcknowledgeCount - 3) * fSendMaxSegmentSize <= flightSize)
1324			fCongestionWindow += fSendMaxSegmentSize;
1325		if (fSendQueue.Available(fSendMax) != 0) {
1326			fSendNext = fSendMax;
1327			_SendQueued();
1328		}
1329	}
1330}
1331
1332
1333void
1334TCPEndpoint::_UpdateTimestamps(tcp_segment_header& segment,
1335	size_t segmentLength)
1336{
1337	if (fFlags & FLAG_OPTION_TIMESTAMP) {
1338		tcp_sequence sequence(segment.sequence);
1339
1340		if (fLastAcknowledgeSent >= sequence
1341			&& fLastAcknowledgeSent < (sequence + segmentLength))
1342			fReceivedTimestamp = segment.timestamp_value;
1343	}
1344}
1345
1346
1347ssize_t
1348TCPEndpoint::_AvailableData() const
1349{
1350	// TODO: Refer to the FLAG_NO_RECEIVE comment above regarding
1351	//       the application of FLAG_NO_RECEIVE in listen()ing
1352	//       sockets.
1353	if (fState == LISTEN)
1354		return gSocketModule->count_connected(socket);
1355	if (fState == SYNCHRONIZE_SENT)
1356		return 0;
1357
1358	ssize_t availableData = fReceiveQueue.Available();
1359
1360	if (availableData == 0 && !_ShouldReceive())
1361		return ENOTCONN;
1362	if (availableData == 0 && (fState == FINISH_RECEIVED || fState == WAIT_FOR_FINISH_ACKNOWLEDGE))
1363		return ESHUTDOWN;
1364	return availableData;
1365}
1366
1367
1368void
1369TCPEndpoint::_NotifyReader()
1370{
1371	fReceiveCondition.NotifyAll();
1372	gSocketModule->notify(socket, B_SELECT_READ, _AvailableData());
1373}
1374
1375
1376bool
1377TCPEndpoint::_AddData(tcp_segment_header& segment, net_buffer* buffer)
1378{
1379	if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1380		// Remember the position of the finish received flag
1381		fFinishReceived = true;
1382		fFinishReceivedAt = segment.sequence + buffer->size;
1383	}
1384
1385	fReceiveQueue.Add(buffer, segment.sequence);
1386	fReceiveNext = fReceiveQueue.NextSequence();
1387
1388	if (fFinishReceived) {
1389		// Set or reset the finish flag on the current segment
1390		if (fReceiveNext < fFinishReceivedAt)
1391			segment.flags &= ~TCP_FLAG_FINISH;
1392		else
1393			segment.flags |= TCP_FLAG_FINISH;
1394	}
1395
1396	TRACE("  _AddData(): adding data, receive next = %" B_PRIu32 ". Now have %"
1397		B_PRIuSIZE " bytes.", fReceiveNext.Number(), fReceiveQueue.Available());
1398
1399	if ((segment.flags & TCP_FLAG_PUSH) != 0)
1400		fReceiveQueue.SetPushPointer();
1401
1402	return fReceiveQueue.Available() > 0;
1403}
1404
1405
1406void
1407TCPEndpoint::_PrepareReceivePath(tcp_segment_header& segment)
1408{
1409	fInitialReceiveSequence = segment.sequence;
1410	fFinishReceived = false;
1411
1412	// count the received SYN
1413	segment.sequence++;
1414
1415	fReceiveNext = segment.sequence;
1416	fReceiveQueue.SetInitialSequence(segment.sequence);
1417
1418	if ((fOptions & TCP_NOOPT) == 0) {
1419		if (segment.max_segment_size > 0) {
1420			// The maximum size of a segment that a TCP endpoint really sends,
1421			// the "effective send MSS", MUST be the smaller of the send MSS and
1422			// the largest transmission size permitted by the IP layer:
1423			fSendMaxSegmentSize = min_c(segment.max_segment_size,
1424				_MaxSegmentSize(*PeerAddress()));
1425		}
1426
1427		if (segment.options & TCP_HAS_WINDOW_SCALE) {
1428			fFlags |= FLAG_OPTION_WINDOW_SCALE;
1429			fSendWindowShift = segment.window_shift;
1430		} else {
1431			fFlags &= ~FLAG_OPTION_WINDOW_SCALE;
1432			fReceiveWindowShift = 0;
1433		}
1434
1435		if (segment.options & TCP_HAS_TIMESTAMPS) {
1436			fFlags |= FLAG_OPTION_TIMESTAMP;
1437			fReceivedTimestamp = segment.timestamp_value;
1438		} else
1439			fFlags &= ~FLAG_OPTION_TIMESTAMP;
1440
1441		if ((segment.options & TCP_SACK_PERMITTED) == 0)
1442			fFlags &= ~FLAG_OPTION_SACK_PERMITTED;
1443	}
1444
1445	if (fSendMaxSegmentSize > 2190)
1446		fCongestionWindow = 2 * fSendMaxSegmentSize;
1447	else if (fSendMaxSegmentSize > 1095)
1448		fCongestionWindow = 3 * fSendMaxSegmentSize;
1449	else
1450		fCongestionWindow = 4 * fSendMaxSegmentSize;
1451
1452	fSendMaxSegments = fCongestionWindow / fSendMaxSegmentSize;
1453	fSlowStartThreshold = (uint32)segment.advertised_window << fSendWindowShift;
1454}
1455
1456
1457bool
1458TCPEndpoint::_ShouldReceive() const
1459{
1460	if ((fFlags & FLAG_NO_RECEIVE) != 0)
1461		return false;
1462
1463	return fState == ESTABLISHED || fState == FINISH_SENT
1464		|| fState == FINISH_ACKNOWLEDGED || fState == FINISH_RECEIVED;
1465}
1466
1467
1468int32
1469TCPEndpoint::_Spawn(TCPEndpoint* parent, tcp_segment_header& segment,
1470	net_buffer* buffer)
1471{
1472	MutexLocker _(fLock);
1473
1474	TRACE("Spawn()");
1475
1476	// TODO: proper error handling!
1477	if (ProtocolSocket::Open() != B_OK) {
1478		T(Error(this, "opening failed", __LINE__));
1479		return DROP;
1480	}
1481
1482	fState = SYNCHRONIZE_RECEIVED;
1483	T(Spawn(parent, this));
1484
1485	fManager = parent->fManager;
1486
1487	if (fManager->BindChild(this, buffer->destination) != B_OK) {
1488		T(Error(this, "binding failed", __LINE__));
1489		return DROP;
1490	}
1491	if (_PrepareSendPath(buffer->source) != B_OK) {
1492		T(Error(this, "prepare send faild", __LINE__));
1493		return DROP;
1494	}
1495
1496	fOptions = parent->fOptions;
1497	fAcceptSemaphore = parent->fAcceptSemaphore;
1498
1499	_PrepareReceivePath(segment);
1500
1501	// send SYN+ACK
1502	if (_SendAcknowledge() != B_OK) {
1503		T(Error(this, "sending failed", __LINE__));
1504		return DROP;
1505	}
1506
1507	segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
1508		// we handled this flag now, it must not be set for further processing
1509
1510	return _Receive(segment, buffer);
1511}
1512
1513
1514int32
1515TCPEndpoint::_ListenReceive(tcp_segment_header& segment, net_buffer* buffer)
1516{
1517	TRACE("ListenReceive()");
1518
1519	// Essentially, we accept only TCP_FLAG_SYNCHRONIZE in this state,
1520	// but the error behaviour differs
1521	if (segment.flags & TCP_FLAG_RESET)
1522		return DROP;
1523	if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
1524		return DROP | RESET;
1525	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
1526		return DROP;
1527
1528	// TODO: drop broadcast/multicast
1529
1530	// spawn new endpoint for accept()
1531	net_socket* newSocket;
1532	if (gSocketModule->spawn_pending_socket(socket, &newSocket) < B_OK) {
1533		T(Error(this, "spawning failed", __LINE__));
1534		return DROP;
1535	}
1536
1537	return ((TCPEndpoint *)newSocket->first_protocol)->_Spawn(this,
1538		segment, buffer);
1539}
1540
1541
1542int32
1543TCPEndpoint::_SynchronizeSentReceive(tcp_segment_header &segment,
1544	net_buffer *buffer)
1545{
1546	TRACE("_SynchronizeSentReceive()");
1547
1548	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
1549		&& (fInitialSendSequence >= segment.acknowledge
1550			|| fSendMax < segment.acknowledge))
1551		return DROP | RESET;
1552
1553	if (segment.flags & TCP_FLAG_RESET) {
1554		_HandleReset(ECONNREFUSED);
1555		return DROP;
1556	}
1557
1558	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
1559		return DROP;
1560
1561	fSendUnacknowledged = segment.acknowledge;
1562	_PrepareReceivePath(segment);
1563
1564	if (segment.flags & TCP_FLAG_ACKNOWLEDGE) {
1565		_MarkEstablished();
1566	} else {
1567		// simultaneous open
1568		fState = SYNCHRONIZE_RECEIVED;
1569		T(State(this));
1570	}
1571
1572	segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
1573		// we handled this flag now, it must not be set for further processing
1574
1575	return _Receive(segment, buffer) | IMMEDIATE_ACKNOWLEDGE;
1576}
1577
1578
1579int32
1580TCPEndpoint::_Receive(tcp_segment_header& segment, net_buffer* buffer)
1581{
1582	// PAWS processing takes precedence over regular TCP acceptability check
1583	if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0 && (segment.flags & TCP_FLAG_RESET) == 0) {
1584		if ((segment.options & TCP_HAS_TIMESTAMPS) == 0)
1585			return DROP;
1586		if ((int32)(fReceivedTimestamp - segment.timestamp_value) > 0
1587			&& (fReceivedTimestamp - segment.timestamp_value) <= INT32_MAX)
1588			return DROP | IMMEDIATE_ACKNOWLEDGE;
1589	}
1590
1591	uint32 advertisedWindow = segment.AdvertisedWindow(fSendWindowShift);
1592	size_t segmentLength = buffer->size;
1593
1594	// First, handle the most common case for uni-directional data transfer
1595	// (known as header prediction - the segment must not change the window,
1596	// and must be the expected sequence, and contain no control flags)
1597
1598	if (fState == ESTABLISHED
1599		&& segment.AcknowledgeOnly()
1600		&& fReceiveNext == segment.sequence
1601		&& advertisedWindow > 0 && advertisedWindow == fSendWindow
1602		&& fSendNext == fSendMax) {
1603		_UpdateTimestamps(segment, segmentLength);
1604
1605		if (segmentLength == 0) {
1606			// this is a pure acknowledge segment - we're on the sending end
1607			if (fSendUnacknowledged < segment.acknowledge
1608				&& fSendMax >= segment.acknowledge) {
1609				_Acknowledged(segment);
1610				return DROP;
1611			}
1612		} else if (segment.acknowledge == fSendUnacknowledged
1613			&& fReceiveQueue.IsContiguous()
1614			&& fReceiveQueue.Free() >= segmentLength
1615			&& (fFlags & FLAG_NO_RECEIVE) == 0) {
1616			if (_AddData(segment, buffer))
1617				_NotifyReader();
1618
1619			return KEEP | ((segment.flags & TCP_FLAG_PUSH) != 0
1620				? IMMEDIATE_ACKNOWLEDGE : ACKNOWLEDGE);
1621		}
1622	}
1623
1624	// The fast path was not applicable, so we continue with the standard
1625	// processing of the incoming segment
1626
1627	ASSERT(fState != SYNCHRONIZE_SENT && fState != LISTEN);
1628
1629	if (fState != CLOSED && fState != TIME_WAIT) {
1630		// Check sequence number
1631		if (!segment_in_sequence(segment, segmentLength, fReceiveNext,
1632				fReceiveWindow)) {
1633			TRACE("  Receive(): segment out of window, next: %" B_PRIu32
1634				" wnd: %" B_PRIu32, fReceiveNext.Number(), fReceiveWindow);
1635			if ((segment.flags & TCP_FLAG_RESET) != 0) {
1636				// TODO: this doesn't look right - review!
1637				return DROP;
1638			}
1639			return DROP | IMMEDIATE_ACKNOWLEDGE;
1640		}
1641	}
1642
1643	if ((segment.flags & TCP_FLAG_RESET) != 0) {
1644		// Is this a valid reset?
1645		// We generally ignore resets in time wait state (see RFC 1337)
1646		if (fLastAcknowledgeSent <= segment.sequence
1647			&& tcp_sequence(segment.sequence) < (fLastAcknowledgeSent
1648				+ fReceiveWindow)
1649			&& fState != TIME_WAIT) {
1650			status_t error;
1651			if (fState == SYNCHRONIZE_RECEIVED)
1652				error = ECONNREFUSED;
1653			else if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE)
1654				error = ENOTCONN;
1655			else
1656				error = ECONNRESET;
1657
1658			_HandleReset(error);
1659		}
1660
1661		return DROP;
1662	}
1663
1664	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
1665		|| (fState == SYNCHRONIZE_RECEIVED
1666			&& (fInitialReceiveSequence > segment.sequence
1667				|| ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
1668					&& (fSendUnacknowledged > segment.acknowledge
1669						|| fSendMax < segment.acknowledge))))) {
1670		// reset the connection - either the initial SYN was faulty, or we
1671		// received a SYN within the data stream
1672		return DROP | RESET;
1673	}
1674
1675	// TODO: Check this! Why do we advertize a window outside of what we should
1676	// buffer?
1677	fReceiveWindow = max_c(fReceiveQueue.Free(), fReceiveWindow);
1678		// the window must not shrink
1679
1680	// trim buffer to be within the receive window
1681	int32 drop = (int32)(fReceiveNext - segment.sequence).Number();
1682	if (drop > 0) {
1683		if ((uint32)drop > buffer->size
1684			|| ((uint32)drop == buffer->size
1685				&& (segment.flags & TCP_FLAG_FINISH) == 0)) {
1686			// don't accidently remove a FIN we shouldn't remove
1687			segment.flags &= ~TCP_FLAG_FINISH;
1688			drop = buffer->size;
1689		}
1690
1691		// remove duplicate data at the start
1692		TRACE("* remove %" B_PRId32 " bytes from the start", drop);
1693		gBufferModule->remove_header(buffer, drop);
1694		segment.sequence += drop;
1695	}
1696
1697	int32 action = KEEP;
1698
1699	// immediately acknowledge out-of-order segment to trigger fast-retransmit at the sender
1700	if (drop != 0)
1701		action |= IMMEDIATE_ACKNOWLEDGE;
1702
1703	drop = (int32)(segment.sequence + buffer->size
1704		- (fReceiveNext + fReceiveWindow)).Number();
1705	if (drop > 0) {
1706		// remove data exceeding our window
1707		if ((uint32)drop >= buffer->size) {
1708			// if we can accept data, or the segment is not what we'd expect,
1709			// drop the segment (an immediate acknowledge is always triggered)
1710			if (fReceiveWindow != 0 || segment.sequence != fReceiveNext)
1711				return DROP | IMMEDIATE_ACKNOWLEDGE;
1712
1713			action |= IMMEDIATE_ACKNOWLEDGE;
1714		}
1715
1716		if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1717			// we need to remove the finish, too, as part of the data
1718			drop--;
1719		}
1720
1721		segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH);
1722		TRACE("* remove %" B_PRId32 " bytes from the end", drop);
1723		gBufferModule->remove_trailer(buffer, drop);
1724	}
1725
1726#ifdef TRACE_TCP
1727	if (advertisedWindow > fSendWindow) {
1728		TRACE("  Receive(): Window update %" B_PRIu32 " -> %" B_PRIu32,
1729			fSendWindow, advertisedWindow);
1730	}
1731#endif
1732
1733	if (fSendWindow < fSendMaxSegmentSize
1734			&& advertisedWindow >= fSendMaxSegmentSize) {
1735		// Our current send window is less than a segment wide, and the new one
1736		// is larger, so trigger a send in case there's anything to be sent.
1737		action |= SEND_QUEUED;
1738	}
1739
1740	fSendWindow = advertisedWindow;
1741	if (advertisedWindow > fSendMaxWindow)
1742		fSendMaxWindow = advertisedWindow;
1743
1744	// Then look at the acknowledgement for any updates
1745
1746	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0) {
1747		// process acknowledged data
1748		if (fState == SYNCHRONIZE_RECEIVED)
1749			_MarkEstablished();
1750
1751		if (fSendMax < segment.acknowledge)
1752			return DROP | IMMEDIATE_ACKNOWLEDGE;
1753
1754		if (segment.acknowledge == fSendUnacknowledged) {
1755			if (buffer->size == 0 && advertisedWindow == fSendWindow
1756				&& (segment.flags & TCP_FLAG_FINISH) == 0 && fSendUnacknowledged != fSendMax) {
1757				TRACE("Receive(): duplicate ack!");
1758				_DuplicateAcknowledge(segment);
1759			}
1760		} else if (segment.acknowledge < fSendUnacknowledged) {
1761			return DROP;
1762		} else {
1763			// this segment acknowledges in flight data
1764
1765			if (fDuplicateAcknowledgeCount >= 3) {
1766				// deflate the window.
1767				if (segment.acknowledge > fRecover) {
1768					uint32 flightSize = (fSendMax - fSendUnacknowledged).Number();
1769					fCongestionWindow = min_c(fSlowStartThreshold,
1770						max_c(flightSize, fSendMaxSegmentSize) + fSendMaxSegmentSize);
1771					fFlags &= ~FLAG_RECOVERY;
1772				}
1773			}
1774
1775			if (fSendMax == segment.acknowledge)
1776				TRACE("Receive(): all inflight data ack'd!");
1777
1778			if (segment.acknowledge > fSendQueue.LastSequence()
1779					&& fState > ESTABLISHED) {
1780				TRACE("Receive(): FIN has been acknowledged!");
1781
1782				switch (fState) {
1783					case FINISH_SENT:
1784						fState = FINISH_ACKNOWLEDGED;
1785						T(State(this));
1786						break;
1787					case CLOSING:
1788						fState = TIME_WAIT;
1789						T(State(this));
1790						_EnterTimeWait();
1791						return DROP;
1792					case WAIT_FOR_FINISH_ACKNOWLEDGE:
1793						_Close();
1794						break;
1795
1796					default:
1797						break;
1798				}
1799			}
1800
1801			if (fState != CLOSED) {
1802				tcp_sequence last = fLastAcknowledgeSent;
1803				_Acknowledged(segment);
1804				// we just sent an acknowledge, remove from action
1805				if (last < fLastAcknowledgeSent)
1806					action &= ~IMMEDIATE_ACKNOWLEDGE;
1807			}
1808		}
1809	}
1810
1811	if (segment.flags & TCP_FLAG_URGENT) {
1812		if (fState == ESTABLISHED || fState == FINISH_SENT
1813			|| fState == FINISH_ACKNOWLEDGED) {
1814			// TODO: Handle urgent data:
1815			//  - RCV.UP <- max(RCV.UP, SEG.UP)
1816			//  - signal the user that urgent data is available (SIGURG)
1817		}
1818	}
1819
1820	bool notify = false;
1821
1822	// The buffer may be freed if its data is added to the queue, so cache
1823	// the size as we still need it later.
1824	const uint32 bufferSize = buffer->size;
1825
1826	if ((bufferSize > 0 || (segment.flags & TCP_FLAG_FINISH) != 0)
1827		&& _ShouldReceive())
1828		notify = _AddData(segment, buffer);
1829	else {
1830		if ((fFlags & FLAG_NO_RECEIVE) != 0)
1831			fReceiveNext += buffer->size;
1832
1833		action = (action & ~KEEP) | DROP;
1834	}
1835
1836	if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1837		segmentLength++;
1838		if (fState != CLOSED && fState != LISTEN && fState != SYNCHRONIZE_SENT) {
1839			TRACE("Receive(): peer is finishing connection!");
1840			fReceiveNext++;
1841			notify = true;
1842
1843			// FIN implies PUSH
1844			fReceiveQueue.SetPushPointer();
1845
1846			// we'll reply immediately to the FIN if we are not
1847			// transitioning to TIME WAIT so we immediatly ACK it.
1848			action |= IMMEDIATE_ACKNOWLEDGE;
1849
1850			// other side is closing connection; change states
1851			switch (fState) {
1852				case ESTABLISHED:
1853				case SYNCHRONIZE_RECEIVED:
1854					fState = FINISH_RECEIVED;
1855					T(State(this));
1856					break;
1857				case FINISH_SENT:
1858					// simultaneous close
1859					fState = CLOSING;
1860					T(State(this));
1861					break;
1862				case FINISH_ACKNOWLEDGED:
1863					fState = TIME_WAIT;
1864					T(State(this));
1865					_EnterTimeWait();
1866					break;
1867				case TIME_WAIT:
1868					_UpdateTimeWait();
1869					break;
1870
1871				default:
1872					break;
1873			}
1874		}
1875	}
1876
1877	if (notify)
1878		_NotifyReader();
1879
1880	if (bufferSize > 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0)
1881		action |= ACKNOWLEDGE;
1882
1883	_UpdateTimestamps(segment, segmentLength);
1884
1885	TRACE("Receive() Action %" B_PRId32, action);
1886
1887	return action;
1888}
1889
1890
1891int32
1892TCPEndpoint::SegmentReceived(tcp_segment_header& segment, net_buffer* buffer)
1893{
1894	MutexLocker locker(fLock);
1895
1896	TRACE("SegmentReceived(): buffer %p (%" B_PRIu32 " bytes) address %s "
1897		"to %s flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
1898		", wnd %" B_PRIu32, buffer, buffer->size, PrintAddress(buffer->source),
1899		PrintAddress(buffer->destination), segment.flags, segment.sequence,
1900		segment.acknowledge,
1901		(uint32)segment.advertised_window << fSendWindowShift);
1902	T(Receive(this, segment,
1903		(uint32)segment.advertised_window << fSendWindowShift, buffer));
1904	int32 segmentAction = DROP;
1905
1906	switch (fState) {
1907		case LISTEN:
1908			segmentAction = _ListenReceive(segment, buffer);
1909			break;
1910
1911		case SYNCHRONIZE_SENT:
1912			segmentAction = _SynchronizeSentReceive(segment, buffer);
1913			break;
1914
1915		case SYNCHRONIZE_RECEIVED:
1916		case ESTABLISHED:
1917		case FINISH_RECEIVED:
1918		case WAIT_FOR_FINISH_ACKNOWLEDGE:
1919		case FINISH_SENT:
1920		case FINISH_ACKNOWLEDGED:
1921		case CLOSING:
1922		case TIME_WAIT:
1923		case CLOSED:
1924			segmentAction = _Receive(segment, buffer);
1925			break;
1926	}
1927
1928	// process acknowledge action as asked for by the *Receive() method
1929	if (segmentAction & IMMEDIATE_ACKNOWLEDGE)
1930		_SendAcknowledge(true);
1931	else if (segmentAction & ACKNOWLEDGE)
1932		DelayedAcknowledge();
1933
1934	if (segmentAction & SEND_QUEUED)
1935		_SendQueued();
1936
1937	if ((fFlags & (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE))
1938			== (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE)) {
1939
1940		locker.Unlock();
1941		if (gSocketModule->release_socket(socket))
1942			segmentAction |= DELETED_ENDPOINT;
1943	}
1944
1945	return segmentAction;
1946}
1947
1948
1949//	#pragma mark - send
1950
1951
1952tcp_segment_header
1953TCPEndpoint::_PrepareSendSegment()
1954{
1955	// we don't set FLAG_FINISH here, instead we do it
1956	// conditionally below depending if we are sending
1957	// the last bytes of the send queue.
1958
1959	uint8 flags = 0;
1960	switch (fState) {
1961		case CLOSED:
1962			flags = TCP_FLAG_RESET | TCP_FLAG_ACKNOWLEDGE;
1963			break;
1964
1965		case SYNCHRONIZE_SENT:
1966			flags = TCP_FLAG_SYNCHRONIZE;
1967			break;
1968
1969		case SYNCHRONIZE_RECEIVED:
1970			flags = TCP_FLAG_SYNCHRONIZE | TCP_FLAG_ACKNOWLEDGE;
1971			break;
1972
1973		case ESTABLISHED:
1974		case FINISH_RECEIVED:
1975		case FINISH_ACKNOWLEDGED:
1976		case TIME_WAIT:
1977		case WAIT_FOR_FINISH_ACKNOWLEDGE:
1978		case FINISH_SENT:
1979		case CLOSING:
1980			flags = TCP_FLAG_ACKNOWLEDGE;
1981
1982		default:
1983			break;
1984	}
1985
1986	tcp_segment_header segment(flags);
1987
1988	if ((fOptions & TCP_NOOPT) == 0) {
1989		if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0) {
1990			segment.options |= TCP_HAS_TIMESTAMPS;
1991			segment.timestamp_reply = fReceivedTimestamp;
1992			segment.timestamp_value = tcp_now();
1993		}
1994
1995		if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
1996				&& fSendNext == fInitialSendSequence) {
1997			// add connection establishment options
1998			segment.max_segment_size = fReceiveMaxSegmentSize;
1999			if (fFlags & FLAG_OPTION_WINDOW_SCALE) {
2000				segment.options |= TCP_HAS_WINDOW_SCALE;
2001				segment.window_shift = fReceiveWindowShift;
2002			}
2003			if ((fFlags & FLAG_OPTION_SACK_PERMITTED) != 0)
2004				segment.options |= TCP_SACK_PERMITTED;
2005		}
2006
2007		if (!fReceiveQueue.IsContiguous()
2008				&& (fFlags & FLAG_OPTION_SACK_PERMITTED) != 0) {
2009			segment.options |= TCP_HAS_SACK;
2010			int maxSackCount = MAX_SACK_BLKS
2011				- ((fFlags & FLAG_OPTION_TIMESTAMP) != 0) ? 1 : 0;
2012			memset(segment.sacks, 0, sizeof(segment.sacks));
2013			segment.sackCount = fReceiveQueue.PopulateSackInfo(fReceiveNext,
2014				maxSackCount, segment.sacks);
2015		}
2016	}
2017
2018	size_t availableBytes = fReceiveQueue.Free();
2019	// window size must remain same for duplicate acknowledgements
2020	if (!fReceiveQueue.IsContiguous())
2021		availableBytes = (fReceiveMaxAdvertised - fReceiveNext).Number();
2022	segment.SetAdvertisedWindow(availableBytes, fReceiveWindowShift);
2023
2024	segment.acknowledge = fReceiveNext.Number();
2025
2026	// Process urgent data
2027	if (fSendUrgentOffset > fSendNext) {
2028		segment.flags |= TCP_FLAG_URGENT;
2029		segment.urgent_offset = (fSendUrgentOffset - fSendNext).Number();
2030	} else {
2031		fSendUrgentOffset = fSendUnacknowledged.Number();
2032			// Keep urgent offset updated, so that it doesn't reach into our
2033			// send window on overlap
2034		segment.urgent_offset = 0;
2035	}
2036
2037	return segment;
2038}
2039
2040
2041status_t
2042TCPEndpoint::_PrepareAndSend(tcp_segment_header& segment, net_buffer* buffer,
2043	bool isRetransmit)
2044{
2045	LocalAddress().CopyTo(buffer->source);
2046	PeerAddress().CopyTo(buffer->destination);
2047
2048	uint32 size = buffer->size, segmentLength = size;
2049	segment.sequence = fSendNext.Number();
2050
2051	TRACE("_PrepareAndSend(): buffer %p (%" B_PRIu32 " bytes) address %s to "
2052		"%s flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
2053		", rwnd %" B_PRIu16 ", cwnd %" B_PRIu32 ", ssthresh %" B_PRIu32
2054		", len %" B_PRIu32 ", first %" B_PRIu32 ", last %" B_PRIu32,
2055		buffer, buffer->size, PrintAddress(buffer->source),
2056		PrintAddress(buffer->destination), segment.flags, segment.sequence,
2057		segment.acknowledge, segment.advertised_window,
2058		fCongestionWindow, fSlowStartThreshold, segmentLength,
2059		fSendQueue.FirstSequence().Number(),
2060		fSendQueue.LastSequence().Number());
2061	T(Send(this, segment, buffer, fSendQueue.FirstSequence(),
2062		fSendQueue.LastSequence()));
2063
2064	PROBE(buffer, sendWindow);
2065
2066	status_t status = add_tcp_header(AddressModule(), segment, buffer);
2067	if (status != B_OK) {
2068		gBufferModule->free(buffer);
2069		return status;
2070	}
2071
2072	if (segment.flags & TCP_FLAG_SYNCHRONIZE) {
2073		segment.options &= ~TCP_HAS_WINDOW_SCALE;
2074		segment.max_segment_size = 0;
2075		size++;
2076	}
2077
2078	if (segment.flags & TCP_FLAG_FINISH)
2079		size++;
2080
2081	status = next->module->send_routed_data(next, fRoute, buffer);
2082	if (status < B_OK) {
2083		gBufferModule->free(buffer);
2084		return status;
2085	}
2086
2087	fSendNext += size;
2088	if (fSendMax < fSendNext)
2089		fSendMax = fSendNext;
2090
2091	fReceiveMaxAdvertised = fReceiveNext + segment.AdvertisedWindow(fReceiveWindowShift);
2092
2093	if (segmentLength != 0 && fState == ESTABLISHED)
2094		--fSendMaxSegments;
2095
2096	if (fSendTime == 0 && !isRetransmit
2097			&& (segmentLength != 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0)) {
2098		fSendTime = tcp_now();
2099		fRoundTripStartSequence = segment.sequence;
2100	}
2101
2102	if (segment.flags & TCP_FLAG_ACKNOWLEDGE) {
2103		fLastAcknowledgeSent = segment.acknowledge;
2104		gStackModule->cancel_timer(&fDelayedAcknowledgeTimer);
2105	}
2106
2107	return B_OK;
2108}
2109
2110
2111inline bool
2112TCPEndpoint::_ShouldSendSegment(tcp_segment_header& segment, uint32 length,
2113	uint32 segmentMaxSize, uint32 flightSize)
2114{
2115	if (fState == ESTABLISHED && fSendMaxSegments == 0)
2116		return false;
2117
2118	if (length > 0) {
2119		// Avoid the silly window syndrome - we only send a segment in case:
2120		// - we have a full segment to send, or
2121		// - we're at the end of our buffer queue, or
2122		// - the buffer is at least larger than half of the maximum send window,
2123		//   or
2124		// - we're retransmitting data
2125		if (length == segmentMaxSize
2126			|| (fOptions & TCP_NODELAY) != 0
2127			|| tcp_sequence(fSendNext + length) == fSendQueue.LastSequence()
2128			|| (fSendMaxWindow > 0 && length >= fSendMaxWindow / 2))
2129			return true;
2130	}
2131
2132	// check if we need to send a window update to the peer
2133	if (segment.advertised_window > 0) {
2134		// correct the window to take into account what already has been advertised
2135		uint32 window = segment.AdvertisedWindow(fReceiveWindowShift)
2136			- (fReceiveMaxAdvertised - fReceiveNext).Number();
2137
2138		// if we can advertise a window larger than twice the maximum segment
2139		// size, or half the maximum buffer size we send a window update
2140		if (window >= (fReceiveMaxSegmentSize << 1)
2141			|| window >= (socket->receive.buffer_size >> 1))
2142			return true;
2143	}
2144
2145	if ((segment.flags & (TCP_FLAG_SYNCHRONIZE | TCP_FLAG_FINISH
2146			| TCP_FLAG_RESET)) != 0)
2147		return true;
2148
2149	// We do have urgent data pending
2150	if (fSendUrgentOffset > fSendNext)
2151		return true;
2152
2153	// there is no reason to send a segment just now
2154	return false;
2155}
2156
2157
2158status_t
2159TCPEndpoint::_SendAcknowledge(bool force)
2160{
2161	if (fRoute == NULL || fState == LISTEN)
2162		return B_ERROR;
2163
2164	tcp_segment_header segment = _PrepareSendSegment();
2165
2166	// Is there actually anything to do?
2167	if (!force && fState == ESTABLISHED
2168			&& fLastAcknowledgeSent == fReceiveNext
2169			&& fReceiveQueue.IsContiguous()
2170			&& !_ShouldSendSegment(segment, 0, 0, 0))
2171		return B_OK;
2172
2173	net_buffer* buffer = gBufferModule->create(256);
2174	if (buffer == NULL)
2175		return B_NO_MEMORY;
2176
2177	return _PrepareAndSend(segment, buffer, false);
2178}
2179
2180
/*!	Sends one or more TCP segments with the data waiting in the queue.

	\param force If \c true, the _ShouldSendSegment() check is bypassed and a
		segment is sent unconditionally (this is what the persist timer
		requests). Note that FIN/PUSH are not added to a forced segment.
	\return \c B_ERROR if there is no route or the connection has not reached
		\c ESTABLISHED yet, \c B_NO_MEMORY if no buffer could be created, the
		error of fSendQueue.Get() or _PrepareAndSend() if one of them fails,
		\c B_OK otherwise (also when there was nothing to send).
*/
status_t
TCPEndpoint::_SendQueued(bool force)
{
	if (fRoute == NULL || fState < ESTABLISHED)
		return B_ERROR;

	tcp_segment_header segment = _PrepareSendSegment();

	// The usable window is the send window, further limited by the congestion
	// window whenever the latter is set (non-zero) and smaller.
	uint32 sendWindow = fSendWindow;
	if (fCongestionWindow > 0 && fCongestionWindow < sendWindow)
		sendWindow = fCongestionWindow;

	// fSendUnacknowledged
	//  |    fSendNext      fSendMax
	//  |        |              |
	//  v        v              v
	//  -----------------------------------
	//  | effective window           |
	//  -----------------------------------

	// Flight size represents the window of data which is currently in the
	// ether. We should never send data such that the flight size becomes larger
	// than the effective window. Note however that the effective window may be
	// reduced (by congestion for instance), so at some point in time flight
	// size may be larger than the currently calculated window.

	uint32 flightSize = (fSendMax - fSendUnacknowledged).Number();
	uint32 consumedWindow = (fSendNext - fSendUnacknowledged).Number();

	// Subtract the part of the window that lies before fSendNext; what remains
	// is the room left for new segments starting at fSendNext.
	if (consumedWindow > sendWindow) {
		sendWindow = 0;
		// TODO: enter persist state? try to get a window update.
	} else
		sendWindow -= consumedWindow;

	// Never send more than what is queued from fSendNext on, nor more than the
	// remaining window allows.
	uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow);
	if (length == 0 && !state_needs_finish(fState)) {
		// Nothing to send.
		return B_OK;
	}

	// The retransmit timer is only started by this function if no
	// unacknowledged data precedes fSendNext.
	bool shouldStartRetransmitTimer = fSendNext == fSendUnacknowledged;
	// Everything below fSendMax is treated as a retransmission.
	bool retransmit = fSendNext < fSendMax;

	if (fDuplicateAcknowledgeCount != 0) {
		// send at most 1 SMSS of data when under limited transmit,
		// fast retransmit/recovery
		length = min_c(length, fSendMaxSegmentSize);
	}

	do {
		// The options of this segment reduce the room available for payload.
		uint32 segmentMaxSize = fSendMaxSegmentSize
			- tcp_options_length(segment);
		uint32 segmentLength = min_c(length, segmentMaxSize);

		// If this segment reaches the end of the send queue, it carries the
		// FIN (when the state requires one) and PUSH (when it has payload).
		if ((fSendNext + segmentLength) == fSendQueue.LastSequence() && !force) {
			if (state_needs_finish(fState))
				segment.flags |= TCP_FLAG_FINISH;
			if (length > 0)
				segment.flags |= TCP_FLAG_PUSH;
		}

		// Determine if we should really send this segment
		if (!force && !retransmit && !_ShouldSendSegment(segment, segmentLength,
				segmentMaxSize, flightSize)) {
			// The segment is held back. If data is queued and neither the
			// persist nor the retransmit timer is running, start the persist
			// timer (whose hook calls _SendQueued(true)).
			if (fSendQueue.Available()
				&& !gStackModule->is_timer_active(&fPersistTimer)
				&& !gStackModule->is_timer_active(&fRetransmitTimer))
				_StartPersistTimer();
			break;
		}

		net_buffer *buffer = gBufferModule->create(256);
		if (buffer == NULL)
			return B_NO_MEMORY;

		// Fill the buffer with the payload; a segment without payload (for
		// instance a lone FIN) skips this step.
		status_t status = B_OK;
		if (segmentLength > 0)
			status = fSendQueue.Get(buffer, fSendNext, segmentLength);
		if (status < B_OK) {
			gBufferModule->free(buffer);
			return status;
		}

		// Note: sendWindow is not consulted again within this loop.
		sendWindow -= buffer->size;

		// NOTE(review): the buffer is not freed here on failure - presumably
		// _PrepareAndSend() takes care of that; confirm.
		status = _PrepareAndSend(segment, buffer, retransmit);
		if (status != B_OK)
			return status;

		if (shouldStartRetransmitTimer) {
			TRACE("starting initial retransmit timer of: %" B_PRIdBIGTIME,
				fRetransmitTimeout);
			gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout);
			T(TimerSet(this, "retransmit", fRetransmitTimeout));
			// only the first segment sent by this call starts the timer
			shouldStartRetransmitTimer = false;
		}

		length -= segmentLength;
		// Any further segment sent by this call must not repeat these flags.
		segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET
			| TCP_FLAG_FINISH);

		// A retransmission sends a single segment only.
		if (retransmit)
			break;

	} while (length > 0);

	return B_OK;
}
2290
2291
2292int
2293TCPEndpoint::_MaxSegmentSize(const sockaddr* address) const
2294{
2295	return next->module->get_mtu(next, address) - sizeof(tcp_header);
2296}
2297
2298
2299status_t
2300TCPEndpoint::_PrepareSendPath(const sockaddr* peer)
2301{
2302	if (fRoute == NULL) {
2303		fRoute = gDatalinkModule->get_route(Domain(), peer);
2304		if (fRoute == NULL)
2305			return ENETUNREACH;
2306
2307		if ((fRoute->flags & RTF_LOCAL) != 0)
2308			fFlags |= FLAG_LOCAL;
2309	}
2310
2311	// make sure connection does not already exist
2312	status_t status = fManager->SetConnection(this, *LocalAddress(), peer,
2313		fRoute->interface_address->local);
2314	if (status < B_OK)
2315		return status;
2316
2317	fInitialSendSequence = system_time() >> 4;
2318	fSendNext = fInitialSendSequence;
2319	fSendUnacknowledged = fInitialSendSequence;
2320	fSendMax = fInitialSendSequence;
2321	fSendUrgentOffset = fInitialSendSequence;
2322	fRecover = fInitialSendSequence.Number();
2323
2324	// we are counting the SYN here
2325	fSendQueue.SetInitialSequence(fSendNext + 1);
2326
2327	fReceiveMaxSegmentSize = _MaxSegmentSize(peer);
2328
2329	// Compute the window shift we advertise to our peer - if it doesn't support
2330	// this option, this will be reset to 0 (when its SYN is received)
2331	fReceiveWindowShift = 0;
2332	while (fReceiveWindowShift < TCP_MAX_WINDOW_SHIFT
2333		&& (0xffffUL << fReceiveWindowShift) < socket->receive.buffer_size) {
2334		fReceiveWindowShift++;
2335	}
2336
2337	return B_OK;
2338}
2339
2340
/*!	Processes the acknowledgement carried by \a segment.

	If it advances fSendUnacknowledged, the acknowledged data is removed from
	the send queue, the congestion window and the round trip time estimate are
	updated, the retransmit timer is cancelled or re-armed, and threads waiting
	to write are notified. In any case, _SendQueued() is called afterwards if
	the send queue still holds data.

	\param segment The header of the received segment; its \c acknowledge and
		\c timestamp_reply fields are read here.
*/
void
TCPEndpoint::_Acknowledged(tcp_segment_header& segment)
{
	TRACE("_Acknowledged(): ack %" B_PRIu32 "; uack %" B_PRIu32 "; next %"
		B_PRIu32 "; max %" B_PRIu32, segment.acknowledge,
		fSendUnacknowledged.Number(), fSendNext.Number(), fSendMax.Number());

	ASSERT(fSendUnacknowledged <= segment.acknowledge);

	if (fSendUnacknowledged < segment.acknowledge) {
		// new data has been acknowledged - drop it from the send queue
		fSendQueue.RemoveUntil(segment.acknowledge);

		uint32 bytesAcknowledged = segment.acknowledge - fSendUnacknowledged.Number();
		fPreviousHighestAcknowledge = fSendUnacknowledged;
		fSendUnacknowledged = segment.acknowledge;
		// what is still in flight after this acknowledgement
		uint32 flightSize = (fSendMax - fSendUnacknowledged).Number();
		// one sample per two full-sized segments in flight; used as the
		// weight of the timestamp based round trip time update below
		int32 expectedSamples = flightSize / (fSendMaxSegmentSize << 1);

		if (fPreviousHighestAcknowledge > fSendUnacknowledged) {
			// need to update the recover variable upon a sequence wraparound
			fRecover = segment.acknowledge - 1;
		}

		// the acknowledgment of the SYN/ACK MUST NOT increase the size of the
		// congestion window
		if (fSendUnacknowledged != fInitialSendSequence) {
			if (fCongestionWindow < fSlowStartThreshold)
				// below the slow start threshold: grow by the acknowledged
				// bytes, but by at most one segment size
				fCongestionWindow += min_c(bytesAcknowledged, fSendMaxSegmentSize);
			else {
				// at or above the threshold: grow by MSS * MSS / cwnd, or by
				// a single byte if MSS * MSS is smaller than the window
				uint32 increment = fSendMaxSegmentSize * fSendMaxSegmentSize;

				if (increment < fCongestionWindow)
					increment = 1;
				else
					increment /= fCongestionWindow;

				fCongestionWindow += increment;
			}

			fSendMaxSegments = UINT32_MAX;
		}

		if ((fFlags & FLAG_RECOVERY) != 0) {
			// While in recovery: rewind fSendNext so that _SendQueued()
			// retransmits the first unacknowledged segment, then shrink the
			// congestion window by the acknowledged amount, giving one
			// segment size back if more than that was acknowledged. Finally,
			// continue sending where we left off.
			// NOTE(review): the subtraction is unsigned and would wrap if
			// bytesAcknowledged exceeded fCongestionWindow - confirm that
			// this cannot happen.
			fSendNext = fSendUnacknowledged;
			_SendQueued();
			fCongestionWindow -= bytesAcknowledged;

			if (bytesAcknowledged > fSendMaxSegmentSize)
				fCongestionWindow += fSendMaxSegmentSize;

			fSendNext = fSendMax;
		} else
			fDuplicateAcknowledgeCount = 0;

		// fSendNext must never fall behind the acknowledged data
		if (fSendNext < fSendUnacknowledged)
			fSendNext = fSendUnacknowledged;

		// Update the round trip time: from the echoed timestamp if the
		// timestamp option is in use, otherwise from the send time of the
		// segment being timed, once the acknowledgement has passed its
		// sequence number.
		if (fFlags & FLAG_OPTION_TIMESTAMP) {
			_UpdateRoundTripTime(tcp_diff_timestamp(segment.timestamp_reply),
				expectedSamples > 0 ? expectedSamples : 1);
		} else if (fSendTime != 0 && fRoundTripStartSequence < segment.acknowledge) {
			_UpdateRoundTripTime(tcp_diff_timestamp(fSendTime), 1);
			// the measurement is done
			fSendTime = 0;
		}

		if (fSendUnacknowledged == fSendMax) {
			TRACE("all acknowledged, cancelling retransmission timer.");
			gStackModule->cancel_timer(&fRetransmitTimer);
			T(TimerSet(this, "retransmit", -1));
		} else {
			TRACE("data acknowledged, resetting retransmission timer to: %"
				B_PRIdBIGTIME, fRetransmitTimeout);
			gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout);
			T(TimerSet(this, "retransmit", fRetransmitTimeout));
		}

		if (is_writable(fState)) {
			// notify threads waiting on the socket to become writable again
			fSendCondition.NotifyAll();
			gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Free());
		}
	}

	// if there is data left to be sent, send it now
	if (fSendQueue.Used() > 0)
		_SendQueued();
}
2427
2428
2429void
2430TCPEndpoint::_Retransmit()
2431{
2432	TRACE("Retransmit()");
2433
2434	if (fState < ESTABLISHED) {
2435		fRetransmitTimeout = TCP_SYN_RETRANSMIT_TIMEOUT;
2436		fCongestionWindow = fSendMaxSegmentSize;
2437	} else {
2438		_ResetSlowStart();
2439		fDuplicateAcknowledgeCount = 0;
2440		// Do exponential back off of the retransmit timeout
2441		fRetransmitTimeout *= 2;
2442		if (fRetransmitTimeout > TCP_MAX_RETRANSMIT_TIMEOUT)
2443			fRetransmitTimeout = TCP_MAX_RETRANSMIT_TIMEOUT;
2444	}
2445
2446	fSendNext = fSendUnacknowledged;
2447	_SendQueued();
2448
2449	fRecover = fSendNext.Number() - 1;
2450	if ((fFlags & FLAG_RECOVERY) != 0)
2451		fFlags &= ~FLAG_RECOVERY;
2452}
2453
2454
2455void
2456TCPEndpoint::_UpdateRoundTripTime(int32 roundTripTime, int32 expectedSamples)
2457{
2458	if (fSmoothedRoundTripTime == 0) {
2459		fSmoothedRoundTripTime = roundTripTime;
2460		fRoundTripVariation = roundTripTime / 2;
2461		fRetransmitTimeout = (fSmoothedRoundTripTime + max_c(100, fRoundTripVariation * 4))
2462				* kTimestampFactor;
2463	} else {
2464		int32 delta = fSmoothedRoundTripTime - roundTripTime;
2465		if (delta < 0)
2466			delta = -delta;
2467		fRoundTripVariation += (delta - fRoundTripVariation) / (expectedSamples * 4);
2468		fSmoothedRoundTripTime += (roundTripTime - fSmoothedRoundTripTime) / (expectedSamples * 8);
2469		fRetransmitTimeout = (fSmoothedRoundTripTime + max_c(100, fRoundTripVariation * 4))
2470			* kTimestampFactor;
2471	}
2472
2473	if (fRetransmitTimeout > TCP_MAX_RETRANSMIT_TIMEOUT)
2474		fRetransmitTimeout = TCP_MAX_RETRANSMIT_TIMEOUT;
2475
2476	if (fRetransmitTimeout < TCP_MIN_RETRANSMIT_TIMEOUT)
2477		fRetransmitTimeout = TCP_MIN_RETRANSMIT_TIMEOUT;
2478
2479	TRACE("  RTO is now %" B_PRIdBIGTIME " (after rtt %" B_PRId32 "ms)",
2480		fRetransmitTimeout, roundTripTime);
2481}
2482
2483
2484void
2485TCPEndpoint::_ResetSlowStart()
2486{
2487	fSlowStartThreshold = max_c((fSendMax - fSendUnacknowledged).Number() / 2,
2488		2 * fSendMaxSegmentSize);
2489	fCongestionWindow = fSendMaxSegmentSize;
2490}
2491
2492
2493//	#pragma mark - timer
2494
2495
2496/*static*/ void
2497TCPEndpoint::_RetransmitTimer(net_timer* timer, void* _endpoint)
2498{
2499	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2500	T(TimerTriggered(endpoint, "retransmit"));
2501
2502	MutexLocker locker(endpoint->fLock);
2503	if (!locker.IsLocked() || gStackModule->is_timer_active(timer))
2504		return;
2505
2506	endpoint->_Retransmit();
2507}
2508
2509
2510/*static*/ void
2511TCPEndpoint::_PersistTimer(net_timer* timer, void* _endpoint)
2512{
2513	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2514	T(TimerTriggered(endpoint, "persist"));
2515
2516	MutexLocker locker(endpoint->fLock);
2517	if (!locker.IsLocked())
2518		return;
2519
2520	// the timer might not have been canceled early enough
2521	if (endpoint->State() == CLOSED)
2522		return;
2523
2524	endpoint->_SendQueued(true);
2525}
2526
2527
2528/*static*/ void
2529TCPEndpoint::_DelayedAcknowledgeTimer(net_timer* timer, void* _endpoint)
2530{
2531	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2532	T(TimerTriggered(endpoint, "delayed ack"));
2533
2534	MutexLocker locker(endpoint->fLock);
2535	if (!locker.IsLocked())
2536		return;
2537
2538	// the timer might not have been canceled early enough
2539	if (endpoint->State() == CLOSED)
2540		return;
2541
2542	endpoint->_SendAcknowledge();
2543}
2544
2545
2546/*static*/ void
2547TCPEndpoint::_TimeWaitTimer(net_timer* timer, void* _endpoint)
2548{
2549	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2550	T(TimerTriggered(endpoint, "time-wait"));
2551
2552	MutexLocker locker(endpoint->fLock);
2553	if (!locker.IsLocked())
2554		return;
2555
2556	if ((endpoint->fFlags & FLAG_CLOSED) == 0) {
2557		endpoint->fFlags |= FLAG_DELETE_ON_CLOSE;
2558		return;
2559	}
2560
2561	locker.Unlock();
2562
2563	gSocketModule->release_socket(endpoint->socket);
2564}
2565
2566
2567/*static*/ status_t
2568TCPEndpoint::_WaitForCondition(ConditionVariable& condition,
2569	MutexLocker& locker, bigtime_t timeout)
2570{
2571	ConditionVariableEntry entry;
2572	condition.Add(&entry);
2573
2574	locker.Unlock();
2575	status_t result = entry.Wait(B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT, timeout);
2576	locker.Lock();
2577
2578	return result;
2579}
2580
2581
2582//	#pragma mark -
2583
2584
2585void
2586TCPEndpoint::Dump() const
2587{
2588	kprintf("TCP endpoint %p\n", this);
2589	kprintf("  state: %s\n", name_for_state(fState));
2590	kprintf("  flags: 0x%" B_PRIx32 "\n", fFlags);
2591#if KDEBUG
2592	kprintf("  lock: { %p, holder: %" B_PRId32 " }\n", &fLock, fLock.holder);
2593#endif
2594	kprintf("  accept sem: %" B_PRId32 "\n", fAcceptSemaphore);
2595	kprintf("  options: 0x%" B_PRIx32 "\n", (uint32)fOptions);
2596	kprintf("  send\n");
2597	kprintf("    window shift: %" B_PRIu8 "\n", fSendWindowShift);
2598	kprintf("    unacknowledged: %" B_PRIu32 "\n",
2599		fSendUnacknowledged.Number());
2600	kprintf("    next: %" B_PRIu32 "\n", fSendNext.Number());
2601	kprintf("    max: %" B_PRIu32 "\n", fSendMax.Number());
2602	kprintf("    urgent offset: %" B_PRIu32 "\n", fSendUrgentOffset.Number());
2603	kprintf("    window: %" B_PRIu32 "\n", fSendWindow);
2604	kprintf("    max window: %" B_PRIu32 "\n", fSendMaxWindow);
2605	kprintf("    max segment size: %" B_PRIu32 "\n", fSendMaxSegmentSize);
2606	kprintf("    queue: %" B_PRIuSIZE " / %" B_PRIuSIZE "\n", fSendQueue.Used(),
2607		fSendQueue.Size());
2608#if DEBUG_TCP_BUFFER_QUEUE
2609	fSendQueue.Dump();
2610#endif
2611	kprintf("    last acknowledge sent: %" B_PRIu32 "\n",
2612		fLastAcknowledgeSent.Number());
2613	kprintf("    initial sequence: %" B_PRIu32 "\n",
2614		fInitialSendSequence.Number());
2615	kprintf("  receive\n");
2616	kprintf("    window shift: %" B_PRIu8 "\n", fReceiveWindowShift);
2617	kprintf("    next: %" B_PRIu32 "\n", fReceiveNext.Number());
2618	kprintf("    max advertised: %" B_PRIu32 "\n",
2619		fReceiveMaxAdvertised.Number());
2620	kprintf("    window: %" B_PRIu32 "\n", fReceiveWindow);
2621	kprintf("    max segment size: %" B_PRIu32 "\n", fReceiveMaxSegmentSize);
2622	kprintf("    queue: %" B_PRIuSIZE " / %" B_PRIuSIZE "\n",
2623		fReceiveQueue.Available(), fReceiveQueue.Size());
2624#if DEBUG_TCP_BUFFER_QUEUE
2625	fReceiveQueue.Dump();
2626#endif
2627	kprintf("    initial sequence: %" B_PRIu32 "\n",
2628		fInitialReceiveSequence.Number());
2629	kprintf("    duplicate acknowledge count: %" B_PRIu32 "\n",
2630		fDuplicateAcknowledgeCount);
2631	kprintf("  smoothed round trip time: %" B_PRId32 " (deviation %" B_PRId32 ")\n",
2632		fSmoothedRoundTripTime, fRoundTripVariation);
2633	kprintf("  retransmit timeout: %" B_PRId64 "\n", fRetransmitTimeout);
2634	kprintf("  congestion window: %" B_PRIu32 "\n", fCongestionWindow);
2635	kprintf("  slow start threshold: %" B_PRIu32 "\n", fSlowStartThreshold);
2636}
2637
2638