archive_read_support_format_warc.c revision 358088
1/*-
2 * Copyright (c) 2014 Sebastian Freundt
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#include "archive_platform.h"
27__FBSDID("$FreeBSD: stable/11/contrib/libarchive/libarchive/archive_read_support_format_warc.c 358088 2020-02-19 01:50:47Z mm $");
28
29/**
30 * WARC is standardised by ISO TC46/SC4/WG12 and currently available as
31 * ISO 28500:2009.
32 * For the purposes of this file we used the final draft from:
33 * http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf
34 *
35 * Todo:
36 * [ ] real-world warcs can contain resources at endpoints ending in /
37 *     e.g. http://bibnum.bnf.fr/warc/
38 *     if you're lucky their response contains a Content-Location: header
39 *     pointing to a unix-compliant filename, in the example above it's
40 *     Content-Location: http://bibnum.bnf.fr/warc/index.html
41 *     however, that's not mandated and github for example doesn't follow
42 *     this convention.
43 *     We need a set of archive options to control what to do with
44 *     entries like these, at the moment care is taken to skip them.
45 *
46 **/
47
48#ifdef HAVE_SYS_STAT_H
49#include <sys/stat.h>
50#endif
51#ifdef HAVE_ERRNO_H
52#include <errno.h>
53#endif
54#ifdef HAVE_STDLIB_H
55#include <stdlib.h>
56#endif
57#ifdef HAVE_STRING_H
58#include <string.h>
59#endif
60#ifdef HAVE_LIMITS_H
61#include <limits.h>
62#endif
63#ifdef HAVE_CTYPE_H
64#include <ctype.h>
65#endif
66#ifdef HAVE_TIME_H
67#include <time.h>
68#endif
69
70#include "archive.h"
71#include "archive_entry.h"
72#include "archive_private.h"
73#include "archive_read_private.h"
74
/*
 * Record types as named by the WARC-Type header.
 * Only WT_RSRC and WT_RSP are turned into archive entries by the header
 * reader; records of every other type are skipped.
 */
typedef enum {
	WT_NONE,	/* no or unrecognised type */
	WT_INFO,	/* warcinfo */
	WT_META,	/* metadata */
	WT_RSRC,	/* resource */
	WT_REQ,		/* request, unsupported */
	WT_RSP,		/* response */
	WT_RVIS,	/* revisit, unsupported */
	WT_CONV,	/* conversion, unsupported */
	WT_CONT,	/* continuation, unsupported at the moment */
	LAST_WT		/* invalid type, one past the last valid value */
} warc_type_t;
96
/* Read-only, length-delimited view of a string; STR need not be
 * NUL-terminated and is not owned by the view. */
typedef struct {
	/* number of bytes STR refers to */
	size_t len;
	/* first byte of the string, NULL for the empty view */
	const char *str;
} warc_string_t;
101
/* Writable counterpart of warc_string_t: a buffer together with its
 * capacity in bytes. */
typedef struct {
	/* capacity of STR in bytes */
	size_t len;
	/* start of the writable storage */
	char *str;
} warc_strbuf_t;
106
107struct warc_s {
108	/* content length ahead */
109	size_t cntlen;
110	/* and how much we've processed so far */
111	size_t cntoff;
112	/* and how much we need to consume between calls */
113	size_t unconsumed;
114
115	/* string pool */
116	warc_strbuf_t pool;
117	/* previous version */
118	unsigned int pver;
119	/* stringified format name */
120	struct archive_string sver;
121};
122
123static int _warc_bid(struct archive_read *a, int);
124static int _warc_cleanup(struct archive_read *a);
125static int _warc_read(struct archive_read*, const void**, size_t*, int64_t*);
126static int _warc_skip(struct archive_read *a);
127static int _warc_rdhdr(struct archive_read *a, struct archive_entry *e);
128
129/* private routines */
130static unsigned int _warc_rdver(const char buf[10], size_t bsz);
131static unsigned int _warc_rdtyp(const char *buf, size_t bsz);
132static warc_string_t _warc_rduri(const char *buf, size_t bsz);
133static ssize_t _warc_rdlen(const char *buf, size_t bsz);
134static time_t _warc_rdrtm(const char *buf, size_t bsz);
135static time_t _warc_rdmtm(const char *buf, size_t bsz);
136static const char *_warc_find_eoh(const char *buf, size_t bsz);
137static const char *_warc_find_eol(const char *buf, size_t bsz);
138
139int
140archive_read_support_format_warc(struct archive *_a)
141{
142	struct archive_read *a = (struct archive_read *)_a;
143	struct warc_s *w;
144	int r;
145
146	archive_check_magic(_a, ARCHIVE_READ_MAGIC,
147	    ARCHIVE_STATE_NEW, "archive_read_support_format_warc");
148
149	if ((w = calloc(1, sizeof(*w))) == NULL) {
150		archive_set_error(&a->archive, ENOMEM,
151		    "Can't allocate warc data");
152		return (ARCHIVE_FATAL);
153	}
154
155	r = __archive_read_register_format(
156		a, w, "warc",
157		_warc_bid, NULL, _warc_rdhdr, _warc_read,
158		_warc_skip, NULL, _warc_cleanup, NULL, NULL);
159
160	if (r != ARCHIVE_OK) {
161		free(w);
162		return (r);
163	}
164	return (ARCHIVE_OK);
165}
166
167static int
168_warc_cleanup(struct archive_read *a)
169{
170	struct warc_s *w = a->format->data;
171
172	if (w->pool.len > 0U) {
173		free(w->pool.str);
174	}
175	archive_string_free(&w->sver);
176	free(w);
177	a->format->data = NULL;
178	return (ARCHIVE_OK);
179}
180
/*
 * Bid for the input: look at the first 12 bytes and check that they form
 * a WARC version line of a supported version (0.12 up to 1.0).
 * Returns 64 when they do and -1 otherwise.  BEST_BID is not used.
 */
static int
_warc_bid(struct archive_read *a, int best_bid)
{
	const char *probe;
	ssize_t avail;
	unsigned int ver;

	(void)best_bid; /* UNUSED */

	/* the very first line of the file should already be a record;
	 * our magic cookie is at least 12 bytes long */
	probe = __archive_read_ahead(a, 12U, &avail);
	if (probe == NULL || avail < 12) {
		return -1;
	}

	/* versions are encoded as major * 10000 + minor * 100 */
	ver = _warc_rdver(probe, avail);
	if (ver >= 1200U && ver <= 10000U) {
		/* within WARC 0.12 .. 1.0, be confident */
		return (64);
	}
	return -1;
}
209
210static int
211_warc_rdhdr(struct archive_read *a, struct archive_entry *entry)
212{
213#define HDR_PROBE_LEN		(12U)
214	struct warc_s *w = a->format->data;
215	unsigned int ver;
216	const char *buf;
217	ssize_t nrd;
218	const char *eoh;
219	/* for the file name, saves some strndup()'ing */
220	warc_string_t fnam;
221	/* warc record type, not that we really use it a lot */
222	warc_type_t ftyp;
223	/* content-length+error monad */
224	ssize_t cntlen;
225	/* record time is the WARC-Date time we reinterpret it as ctime */
226	time_t rtime;
227	/* mtime is the Last-Modified time which will be the entry's mtime */
228	time_t mtime;
229
230start_over:
231	/* just use read_ahead() they keep track of unconsumed
232	 * bits and bobs for us; no need to put an extra shift in
233	 * and reproduce that functionality here */
234	buf = __archive_read_ahead(a, HDR_PROBE_LEN, &nrd);
235
236	if (nrd < 0) {
237		/* no good */
238		archive_set_error(
239			&a->archive, ARCHIVE_ERRNO_MISC,
240			"Bad record header");
241		return (ARCHIVE_FATAL);
242	} else if (buf == NULL) {
243		/* there should be room for at least WARC/bla\r\n
244		 * must be EOF therefore */
245		return (ARCHIVE_EOF);
246	}
247 	/* looks good so far, try and find the end of the header now */
248	eoh = _warc_find_eoh(buf, nrd);
249	if (eoh == NULL) {
250		/* still no good, the header end might be beyond the
251		 * probe we've requested, but then again who'd cram
252		 * so much stuff into the header *and* be 28500-compliant */
253		archive_set_error(
254			&a->archive, ARCHIVE_ERRNO_MISC,
255			"Bad record header");
256		return (ARCHIVE_FATAL);
257	}
258	ver = _warc_rdver(buf, eoh - buf);
259	/* we currently support WARC 0.12 to 1.0 */
260	if (ver == 0U) {
261		archive_set_error(
262			&a->archive, ARCHIVE_ERRNO_MISC,
263			"Invalid record version");
264		return (ARCHIVE_FATAL);
265	} else if (ver < 1200U || ver > 10000U) {
266		archive_set_error(
267			&a->archive, ARCHIVE_ERRNO_MISC,
268			"Unsupported record version: %u.%u",
269			ver / 10000, (ver % 10000) / 100);
270		return (ARCHIVE_FATAL);
271	}
272	cntlen = _warc_rdlen(buf, eoh - buf);
273	if (cntlen < 0) {
274		/* nightmare!  the specs say content-length is mandatory
275		 * so I don't feel overly bad stopping the reader here */
276		archive_set_error(
277			&a->archive, EINVAL,
278			"Bad content length");
279		return (ARCHIVE_FATAL);
280	}
281	rtime = _warc_rdrtm(buf, eoh - buf);
282	if (rtime == (time_t)-1) {
283		/* record time is mandatory as per WARC/1.0,
284		 * so just barf here, fast and loud */
285		archive_set_error(
286			&a->archive, EINVAL,
287			"Bad record time");
288		return (ARCHIVE_FATAL);
289	}
290
291	/* let the world know we're a WARC archive */
292	a->archive.archive_format = ARCHIVE_FORMAT_WARC;
293	if (ver != w->pver) {
294		/* stringify this entry's version */
295		archive_string_sprintf(&w->sver,
296			"WARC/%u.%u", ver / 10000, (ver % 10000) / 100);
297		/* remember the version */
298		w->pver = ver;
299	}
300	/* start off with the type */
301	ftyp = _warc_rdtyp(buf, eoh - buf);
302	/* and let future calls know about the content */
303	w->cntlen = cntlen;
304	w->cntoff = 0U;
305	mtime = 0;/* Avoid compiling error on some platform. */
306
307	switch (ftyp) {
308	case WT_RSRC:
309	case WT_RSP:
310		/* only try and read the filename in the cases that are
311		 * guaranteed to have one */
312		fnam = _warc_rduri(buf, eoh - buf);
313		/* check the last character in the URI to avoid creating
314		 * directory endpoints as files, see Todo above */
315		if (fnam.len == 0 || fnam.str[fnam.len - 1] == '/') {
316			/* break here for now */
317			fnam.len = 0U;
318			fnam.str = NULL;
319			break;
320		}
321		/* bang to our string pool, so we save a
322		 * malloc()+free() roundtrip */
323		if (fnam.len + 1U > w->pool.len) {
324			w->pool.len = ((fnam.len + 64U) / 64U) * 64U;
325			w->pool.str = realloc(w->pool.str, w->pool.len);
326		}
327		memcpy(w->pool.str, fnam.str, fnam.len);
328		w->pool.str[fnam.len] = '\0';
329		/* let no one else know about the pool, it's a secret, shhh */
330		fnam.str = w->pool.str;
331
332		/* snarf mtime or deduce from rtime
333		 * this is a custom header added by our writer, it's quite
334		 * hard to believe anyone else would go through with it
335		 * (apart from being part of some http responses of course) */
336		if ((mtime = _warc_rdmtm(buf, eoh - buf)) == (time_t)-1) {
337			mtime = rtime;
338		}
339		break;
340	default:
341		fnam.len = 0U;
342		fnam.str = NULL;
343		break;
344	}
345
346	/* now eat some of those delicious buffer bits */
347	__archive_read_consume(a, eoh - buf);
348
349	switch (ftyp) {
350	case WT_RSRC:
351	case WT_RSP:
352		if (fnam.len > 0U) {
353			/* populate entry object */
354			archive_entry_set_filetype(entry, AE_IFREG);
355			archive_entry_copy_pathname(entry, fnam.str);
356			archive_entry_set_size(entry, cntlen);
357			archive_entry_set_perm(entry, 0644);
358			/* rtime is the new ctime, mtime stays mtime */
359			archive_entry_set_ctime(entry, rtime, 0L);
360			archive_entry_set_mtime(entry, mtime, 0L);
361			break;
362		}
363		/* FALLTHROUGH */
364	default:
365		/* consume the content and start over */
366		_warc_skip(a);
367		goto start_over;
368	}
369	return (ARCHIVE_OK);
370}
371
372static int
373_warc_read(struct archive_read *a, const void **buf, size_t *bsz, int64_t *off)
374{
375	struct warc_s *w = a->format->data;
376	const char *rab;
377	ssize_t nrd;
378
379	if (w->cntoff >= w->cntlen) {
380	eof:
381		/* it's our lucky day, no work, we can leave early */
382		*buf = NULL;
383		*bsz = 0U;
384		*off = w->cntoff + 4U/*for \r\n\r\n separator*/;
385		w->unconsumed = 0U;
386		return (ARCHIVE_EOF);
387	}
388
389	if (w->unconsumed) {
390		__archive_read_consume(a, w->unconsumed);
391		w->unconsumed = 0U;
392	}
393
394	rab = __archive_read_ahead(a, 1U, &nrd);
395	if (nrd < 0) {
396		*bsz = 0U;
397		/* big catastrophe */
398		return (int)nrd;
399	} else if (nrd == 0) {
400		goto eof;
401	} else if ((size_t)nrd > w->cntlen - w->cntoff) {
402		/* clamp to content-length */
403		nrd = w->cntlen - w->cntoff;
404	}
405	*off = w->cntoff;
406	*bsz = nrd;
407	*buf = rab;
408
409	w->cntoff += nrd;
410	w->unconsumed = (size_t)nrd;
411	return (ARCHIVE_OK);
412}
413
414static int
415_warc_skip(struct archive_read *a)
416{
417	struct warc_s *w = a->format->data;
418
419	__archive_read_consume(a, w->cntlen + 4U/*\r\n\r\n separator*/);
420	w->cntlen = 0U;
421	w->cntoff = 0U;
422	return (ARCHIVE_OK);
423}
424
425
426/* private routines */
/*
 * Return C with its const qualifier removed.
 *
 * The conversion goes through a union rather than through pointer
 * arithmetic on a made-up address: the latter is undefined behaviour,
 * whereas reading the other member of a union of two pointer types that
 * differ only in qualification is well defined.  It also avoids the
 * cast-qual warning a plain cast would trigger.
 */
static void*
deconst(const void *c)
{
	union {
		const void *in;
		void *out;
	} conv;

	conv.in = c;
	return conv.out;
}
432
/*
 * Locate the first occurrence of the NEEDLESIZE bytes at NEEDLE within
 * the HAYSIZE bytes at HAY, like memmem(3).
 *
 * Returns a pointer to the start of the match, HAY itself for an empty
 * needle, or NULL if there is no match.
 *
 * The search keeps the XOR of all bytes in a sliding window of the
 * haystack and only compares bytes when that value equals the XOR of the
 * needle's bytes.
 */
static char*
xmemmem(const char *hay, const size_t haysize,
	const char *needle, const size_t needlesize)
{
	const char *const hay_end = hay + haysize;
	const char *const needle_end = needle + needlesize;
	const char *hcur;
	const char *ncur;
	const char *win;
	unsigned int hsum;
	unsigned int nsum;
	unsigned int same;

	/* an empty needle is defined to be found anywhere in the haystack */
	if (needlesize == 0UL) {
		return deconst(hay);
	}
	/* jump to the first byte that could possibly start a match */
	hay = memchr(hay, *needle, haysize);
	if (hay == NULL) {
		return NULL;
	}

	/* *HAY and *NEEDLE are equal now, so both checksums start out with
	 * that byte.  Fold in the rest of the needle alongside the same
	 * number of haystack bytes, remembering whether they all agreed. */
	hcur = hay + 1U;
	ncur = needle + 1U;
	hsum = *hay;
	nsum = *hay;
	same = 1U;
	while (hcur < hay_end && ncur < needle_end) {
		hsum ^= *hcur;
		nsum ^= *ncur;
		same &= *hcur == *ncur;
		hcur++;
		ncur++;
	}

	/* HCUR now references the (NEEDLESIZE + 1)-th byte of the window */
	if (ncur < needle_end) {
		/* what is left of the haystack is shorter than the needle */
		return NULL;
	}
	if (same) {
		/* matched right at the candidate */
		return deconst(hay);
	}

	/* slide the window over the rest of the haystack, one byte at a
	 * time, updating the checksum as we go */
	win = hay;
	while (hcur < hay_end) {
		/* drop the byte leaving the window, add the one entering */
		hsum ^= *win;
		win++;
		hsum ^= *hcur;

		/* With equal checksums it suffices to compare the first
		 * NEEDLESIZE - 1 bytes, the last one then follows.
		 * WIN is always < HCUR, so no range checks are needed. */
		if (hsum == nsum &&
		    memcmp(win, needle, needlesize - 1U) == 0) {
			return deconst(win);
		}
		hcur++;
	}
	return NULL;
}
490
/*
 * Parse an unsigned decimal number at STR that is expected to lie within
 * [LLIM, ULIM].
 *
 * At most as many digits are read as max(ULIM, 10) has, and reading stops
 * early once another digit would necessarily exceed ULIM.  *EP is set to
 * the first character that was not read.
 *
 * Returns the number, -1 if STR does not start with a digit, or -2 if the
 * digits read denote a value outside the range.
 */
static int
strtoi_lim(const char *str, const char **ep, int llim, int ulim)
{
	const char *cur = str;
	/* loses one decimal place per digit read, hence doubles as a
	 * counter for the digits we may still accept */
	int budget = (ulim > 10) ? ulim : 10;
	int val = 0;

	while (val * 10 <= ulim && budget != 0 &&
	    *cur >= '0' && *cur <= '9') {
		val = val * 10 + (*cur - '0');
		cur++;
		budget /= 10;
	}

	if (cur == str) {
		/* not a single digit */
		val = -1;
	} else if (val < llim || val > ulim) {
		/* out of range */
		val = -2;
	}
	*ep = cur;
	return val;
}
513
/*
 * Convert the broken-down UTC time T into seconds since the epoch.
 *
 * Uses timegm() or _mkgmtime64() where the build configuration says they
 * exist.  Otherwise mktime() is called to have tm_yday filled in (and may
 * normalise T in the process) and the result is computed from first
 * principles.  That computation is carried out in long long because, done
 * in int, the year term already overflows for dates after 2038.
 *
 * Returns (time_t)-1 if mktime() fails in the fallback path.
 */
static time_t
time_from_tm(struct tm *t)
{
#if HAVE_TIMEGM
	/* Use platform timegm() if available. */
	return (timegm(t));
#elif HAVE__MKGMTIME64
	return (_mkgmtime64(t));
#else
	long long year;
	long long secs;

	/* Else use direct calculation using POSIX assumptions. */
	/* First, fix up tm_yday based on the year/month/day. */
	if (mktime(t) == (time_t)-1)
		return ((time_t)-1);
	/* Then we can compute timegm() from first principles. */
	year = t->tm_year;
	secs = t->tm_sec
	    + t->tm_min * 60LL
	    + t->tm_hour * 3600LL
	    + t->tm_yday * 86400LL
	    + (year - 70) * 31536000LL
	    + ((year - 69) / 4) * 86400LL
	    - ((year - 1) / 100) * 86400LL
	    + ((year + 299) / 400) * 86400LL;
	return ((time_t)secs);
#endif
}
538
/*
 * Like strptime() but strictly for ISO 8601 Zulu strings of the form
 * YYYY-MM-DDThh:mm:ssZ.  Leading blanks and tabs are skipped as a
 * courtesy to our callers.
 *
 * Returns the time stamp in seconds since the epoch, or (time_t)-1 if S
 * does not follow the format.  If ENDPTR is not NULL, *ENDPTR is set to
 * where parsing stopped; after a successful parse that is the character
 * following the 'Z'.
 */
static time_t
xstrpisotime(const char *s, char **endptr)
{
	/* the six numeric fields in order of appearance, each with its
	 * valid range and the character that has to follow it */
	static const struct {
		int lo;
		int hi;
		char term;
	} spec[] = {
		{ 1583, 4095, '-' },	/* year */
		{ 1, 12, '-' },		/* month */
		{ 1, 31, 'T' },		/* day of month */
		{ 0, 23, ':' },		/* hour */
		{ 0, 59, ':' },		/* minute */
		{ 0, 60, 'Z' }		/* second */
	};
	int fld[sizeof(spec) / sizeof(spec[0])];
	struct tm tm;
	time_t res = (time_t)-1;
	size_t i;

	while (*s == ' ' || *s == '\t')
		++s;

	for (i = 0U; i < sizeof(spec) / sizeof(spec[0]); i++) {
		fld[i] = strtoi_lim(s, &s, spec[i].lo, spec[i].hi);
		if (fld[i] < 0) {
			/* missing or out of range */
			goto out;
		}
		if (*s++ != spec[i].term) {
			/* wrong separator, S has moved past it anyway */
			goto out;
		}
	}

	/* make sure tm is clean, then massage the fields to fulfill some
	 * of POSIX' constraints */
	memset(&tm, 0, sizeof(tm));
	tm.tm_year = fld[0] - 1900;
	tm.tm_mon = fld[1] - 1;
	tm.tm_mday = fld[2];
	tm.tm_hour = fld[3];
	tm.tm_min = fld[4];
	tm.tm_sec = fld[5];

	/* now convert our custom tm struct to a unix stamp using UTC */
	res = time_from_tm(&tm);

out:
	if (endptr != NULL) {
		*endptr = deconst(s);
	}
	return res;
}
592
/*
 * Read the version off a record's first line, "WARC/<major>.<minor>".
 *
 * MAJOR is a single digit, MINOR one or two digits.  Versions from 0.12
 * onwards must be followed by CRLF, earlier ones by a blank or tab.
 *
 * Returns major * 10000 + minor * 100 with a one-digit minor, or
 * major * 10000 + d1 * 1000 + d2 * 100 for the two minor digits d1 d2
 * (so "0.12" gives 1200 and "1.0" gives 10000).  Returns 0 if BSZ is
 * below 12 or the line is not a valid version line.
 */
static unsigned int
_warc_rdver(const char *buf, size_t bsz)
{
	static const char magic[] = "WARC/";
	const size_t mlen = sizeof(magic) - 1U;
	const char *num;
	const char *term;
	unsigned int ver;

	if (bsz < 12 || memcmp(buf, magic, mlen) != 0) {
		/* buffer too small or invalid magic */
		return 0U;
	}
	/* looks good so far, read the version number for a laugh */
	num = buf + mlen;

	if (!isdigit((unsigned char)num[0U]) || num[1U] != '.' ||
	    !isdigit((unsigned char)num[2U])) {
		return 0U;
	}

	/* set up major version */
	ver = (num[0U] - '0') * 10000U;
	/* set up minor version, two digits at most */
	if (isdigit((unsigned char)num[3U])) {
		ver += (num[2U] - '0') * 1000U;
		ver += (num[3U] - '0') * 100U;
		term = num + 4U;
	} else {
		ver += (num[2U] - '0') * 100U;
		term = num + 3U;
	}

	/*
	 * WARC below version 0.12 has a space-separated header
	 * WARC 0.12 and above terminates the version with a CRLF
	 */
	if (ver >= 1200U) {
		if (memcmp(term, "\r\n", 2U) != 0)
			return 0U;
	} else if (*term != ' ' && *term != '\t') {
		return 0U;
	}
	return ver;
}
637
638static unsigned int
639_warc_rdtyp(const char *buf, size_t bsz)
640{
641	static const char _key[] = "\r\nWARC-Type:";
642	const char *val, *eol;
643
644	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
645		/* no bother */
646		return WT_NONE;
647	}
648	val += sizeof(_key) - 1U;
649	if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
650		/* no end of line */
651		return WT_NONE;
652	}
653
654	/* overread whitespace */
655	while (val < eol && (*val == ' ' || *val == '\t'))
656		++val;
657
658	if (val + 8U == eol) {
659		if (memcmp(val, "resource", 8U) == 0)
660			return WT_RSRC;
661		else if (memcmp(val, "response", 8U) == 0)
662			return WT_RSP;
663	}
664	return WT_NONE;
665}
666
667static warc_string_t
668_warc_rduri(const char *buf, size_t bsz)
669{
670	static const char _key[] = "\r\nWARC-Target-URI:";
671	const char *val, *uri, *eol, *p;
672	warc_string_t res = {0U, NULL};
673
674	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
675		/* no bother */
676		return res;
677	}
678	/* overread whitespace */
679	val += sizeof(_key) - 1U;
680	if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
681		/* no end of line */
682		return res;
683	}
684
685	while (val < eol && (*val == ' ' || *val == '\t'))
686		++val;
687
688	/* overread URL designators */
689	if ((uri = xmemmem(val, eol - val, "://", 3U)) == NULL) {
690		/* not touching that! */
691		return res;
692	}
693
694	/* spaces inside uri are not allowed, CRLF should follow */
695	for (p = val; p < eol; p++) {
696		if (isspace((unsigned char)*p))
697			return res;
698	}
699
700	/* there must be at least space for ftp */
701	if (uri < (val + 3U))
702		return res;
703
704	/* move uri to point to after :// */
705	uri += 3U;
706
707	/* now then, inspect the URI */
708	if (memcmp(val, "file", 4U) == 0) {
709		/* perfect, nothing left to do here */
710
711	} else if (memcmp(val, "http", 4U) == 0 ||
712		   memcmp(val, "ftp", 3U) == 0) {
713		/* overread domain, and the first / */
714		while (uri < eol && *uri++ != '/');
715	} else {
716		/* not sure what to do? best to bugger off */
717		return res;
718	}
719	res.str = uri;
720	res.len = eol - uri;
721	return res;
722}
723
/*
 * Read the value of the Content-Length header within the BSZ header
 * bytes at BUF.
 *
 * Blanks and tabs in front of the number are ignored; the number must
 * start with a digit and run right up to the end of the line.
 *
 * Returns the length, or -1 if the header is missing, unterminated,
 * not a number of that form, or if strtol() reports an error.
 */
static ssize_t
_warc_rdlen(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nContent-Length:";
	const size_t klen = sizeof(_key) - 1U;
	const char *val;
	const char *eol;
	char *stop = NULL;
	long int len;

	val = xmemmem(buf, bsz, _key, klen);
	if (val == NULL) {
		/* no bother */
		return -1;
	}
	val += klen;

	eol = _warc_find_eol(val, buf + bsz - val);
	if (eol == NULL) {
		/* no end of line */
		return -1;
	}

	/* skip leading whitespace */
	for (; val < eol; val++) {
		if (*val != ' ' && *val != '\t')
			break;
	}
	/* there must be at least one digit; this also keeps strtol()
	 * from accepting a sign */
	if (!isdigit((unsigned char)*val)) {
		return -1;
	}

	errno = 0;
	len = strtol(val, &stop, 10);
	if (errno != 0 || stop != eol) {
		/* line must end here */
		return -1;
	}
	return (size_t)len;
}
757
/*
 * Read the record time, i.e. the value of the WARC-Date header within
 * the BSZ header bytes at BUF.
 *
 * Returns the time stamp, or (time_t)-1 if the header is missing,
 * unterminated, not an ISO 8601 Zulu time, or followed by anything
 * before the end of the line.
 */
static time_t
_warc_rdrtm(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nWARC-Date:";
	const size_t klen = sizeof(_key) - 1U;
	const char *val;
	const char *eol;
	char *stop = NULL;
	time_t stamp;

	val = xmemmem(buf, bsz, _key, klen);
	if (val == NULL) {
		/* no bother */
		return (time_t)-1;
	}
	val += klen;

	eol = _warc_find_eol(val, buf + bsz - val);
	if (eol == NULL) {
		/* no end of line */
		return (time_t)-1;
	}

	/* xstrpisotime() kindly overreads whitespace for us, so use that */
	stamp = xstrpisotime(val, &stop);

	/* line must end right after the time stamp */
	return (stop == eol) ? stamp : (time_t)-1;
}
784
/*
 * Read the modification time, i.e. the value of the Last-Modified header
 * within the BSZ header bytes at BUF.
 *
 * Returns the time stamp, or (time_t)-1 if the header is missing,
 * unterminated, not an ISO 8601 Zulu time, or followed by anything
 * before the end of the line.
 */
static time_t
_warc_rdmtm(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nLast-Modified:";
	const size_t klen = sizeof(_key) - 1U;
	const char *val;
	const char *eol;
	char *stop = NULL;
	time_t stamp;

	val = xmemmem(buf, bsz, _key, klen);
	if (val == NULL) {
		/* no bother */
		return (time_t)-1;
	}
	val += klen;

	eol = _warc_find_eol(val, buf + bsz - val);
	if (eol == NULL) {
		/* no end of line */
		return (time_t)-1;
	}

	/* xstrpisotime() kindly overreads whitespace for us, so use that */
	stamp = xstrpisotime(val, &stop);

	/* line must end right after the time stamp */
	return (stop == eol) ? stamp : (time_t)-1;
}
811
/*
 * Find the end of the header within the BSZ bytes at BUF.
 * Returns a pointer to the first byte after the blank line (CRLF CRLF)
 * that closes the header, or NULL if there is no such line.
 */
static const char*
_warc_find_eoh(const char *buf, size_t bsz)
{
	static const char _marker[] = "\r\n\r\n";
	const size_t mlen = sizeof(_marker) - 1U;
	const char *hit = xmemmem(buf, bsz, _marker, mlen);

	/* step over the marker, callers want to know where content begins */
	return (hit == NULL) ? NULL : hit + mlen;
}
823
/*
 * Find the end of the line within the BSZ bytes at BUF.
 * Returns a pointer to the CR of the first CRLF, or NULL if there is
 * none.
 */
static const char*
_warc_find_eol(const char *buf, size_t bsz)
{
	static const char _marker[] = "\r\n";

	return xmemmem(buf, bsz, _marker, sizeof(_marker) - 1U);
}
832/* archive_read_support_format_warc.c ends here */
833