archive_read_support_format_warc.c revision 305192
1/*-
2 * Copyright (c) 2014 Sebastian Freundt
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#include "archive_platform.h"
27__FBSDID("$FreeBSD: stable/10/contrib/libarchive/libarchive/archive_read_support_format_warc.c 305192 2016-09-01 12:01:23Z mm $");
28
29/**
30 * WARC is standardised by ISO TC46/SC4/WG12 and currently available as
31 * ISO 28500:2009.
32 * For the purposes of this file we used the final draft from:
33 * http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf
34 *
35 * Todo:
36 * [ ] real-world warcs can contain resources at endpoints ending in /
37 *     e.g. http://bibnum.bnf.fr/warc/
38 *     if you're lucky their response contains a Content-Location: header
39 *     pointing to a unix-compliant filename, in the example above it's
40 *     Content-Location: http://bibnum.bnf.fr/warc/index.html
41 *     however, that's not mandated and github for example doesn't follow
42 *     this convention.
43 *     We need a set of archive options to control what to do with
44 *     entries like these, at the moment care is taken to skip them.
45 *
46 **/
47
48#ifdef HAVE_SYS_STAT_H
49#include <sys/stat.h>
50#endif
51#ifdef HAVE_ERRNO_H
52#include <errno.h>
53#endif
54#ifdef HAVE_STDLIB_H
55#include <stdlib.h>
56#endif
57#ifdef HAVE_STRING_H
58#include <string.h>
59#endif
60#ifdef HAVE_LIMITS_H
61#include <limits.h>
62#endif
63#ifdef HAVE_CTYPE_H
64#include <ctype.h>
65#endif
66#ifdef HAVE_TIME_H
67#include <time.h>
68#endif
69
70#include "archive.h"
71#include "archive_entry.h"
72#include "archive_private.h"
73#include "archive_read_private.h"
74
/* Record types as named by the WARC-Type header, see _warc_rdtyp(). */
typedef enum {
	/* no WARC-Type header found, or its value was not recognised */
	WT_NONE,
	/* warcinfo */
	WT_INFO,
	/* metadata */
	WT_META,
	/* resource */
	WT_RSRC,
	/* request, unsupported */
	WT_REQ,
	/* response, handled by _warc_rdhdr() the same way as a resource */
	WT_RSP,
	/* revisit, unsupported (_warc_rdtyp() never reports this value) */
	WT_RVIS,
	/* conversion, unsupported */
	WT_CONV,
	/* continuation, unsupported at the moment */
	WT_CONT,
	/* invalid type */
	LAST_WT
} warc_type_t;
96
/* A read-only, length-counted string.  As filled in by _warc_rduri() the
 * pointer refers into the header buffer and the bytes are NOT followed
 * by a NUL terminator, so always honour LEN. */
typedef struct {
	/* number of bytes at STR */
	size_t len;
	/* first byte of the string, NULL when LEN is 0 */
	const char *str;
} warc_string_t;
101
/* A writable, growable byte buffer; _warc_rdhdr() uses one as the
 * string pool that holds the current entry's pathname. */
typedef struct {
	/* number of bytes allocated at STR */
	size_t len;
	/* heap storage, grown with realloc() */
	char *str;
} warc_strbuf_t;
106
/* Per-archive reader state, allocated zero-filled in
 * archive_read_support_format_warc() and released in _warc_cleanup(). */
struct warc_s {
	/* content length ahead (Content-Length of the current record) */
	size_t cntlen;
	/* and how much we've processed so far (advanced by _warc_read()) */
	size_t cntoff;
	/* and how much we need to consume between calls
	 * (size of the chunk most recently returned by _warc_read()) */
	size_t unconsumed;

	/* string pool (backs the pathname of the current entry) */
	warc_strbuf_t pool;
	/* previous version (value of _warc_rdver() for the last record) */
	unsigned int pver;
	/* stringified format name ("WARC/<major>.<minor>") */
	struct archive_string sver;
};
122
/* format callbacks, registered via __archive_read_register_format() */
static int _warc_bid(struct archive_read *a, int);
static int _warc_cleanup(struct archive_read *a);
static int _warc_read(struct archive_read*, const void**, size_t*, int64_t*);
static int _warc_skip(struct archive_read *a);
static int _warc_rdhdr(struct archive_read *a, struct archive_entry *e);

/* private routines */
/* each of these inspects the BSZ bytes of record header at BUF */
static unsigned int _warc_rdver(const char buf[10], size_t bsz);
static unsigned int _warc_rdtyp(const char *buf, size_t bsz);
static warc_string_t _warc_rduri(const char *buf, size_t bsz);
static ssize_t _warc_rdlen(const char *buf, size_t bsz);
static time_t _warc_rdrtm(const char *buf, size_t bsz);
static time_t _warc_rdmtm(const char *buf, size_t bsz);
static const char *_warc_find_eoh(const char *buf, size_t bsz);
137
138
139int
140archive_read_support_format_warc(struct archive *_a)
141{
142	struct archive_read *a = (struct archive_read *)_a;
143	struct warc_s *w;
144	int r;
145
146	archive_check_magic(_a, ARCHIVE_READ_MAGIC,
147	    ARCHIVE_STATE_NEW, "archive_read_support_format_warc");
148
149	if ((w = malloc(sizeof(*w))) == NULL) {
150		archive_set_error(&a->archive, ENOMEM,
151		    "Can't allocate warc data");
152		return (ARCHIVE_FATAL);
153	}
154	memset(w, 0, sizeof(*w));
155
156	r = __archive_read_register_format(
157		a, w, "warc",
158		_warc_bid, NULL, _warc_rdhdr, _warc_read,
159		_warc_skip, NULL, _warc_cleanup, NULL, NULL);
160
161	if (r != ARCHIVE_OK) {
162		free(w);
163		return (r);
164	}
165	return (ARCHIVE_OK);
166}
167
168static int
169_warc_cleanup(struct archive_read *a)
170{
171	struct warc_s *w = a->format->data;
172
173	if (w->pool.len > 0U) {
174		free(w->pool.str);
175	}
176	archive_string_free(&w->sver);
177	free(w);
178	a->format->data = NULL;
179	return (ARCHIVE_OK);
180}
181
/*
 * Bid on the stream: the very first line has to be a WARC version line.
 * Returns 64 when the first 12 bytes carry a version that is neither 0
 * nor newer than 1.0, and -1 otherwise (including short or unreadable
 * input).  BEST_BID is ignored.
 */
static int
_warc_bid(struct archive_read *a, int best_bid)
{
	const char *probe;
	ssize_t avail;
	unsigned int version;

	(void)best_bid; /* UNUSED */

	/* our magic cookie is at least 12 bytes, anything less is not ours */
	probe = __archive_read_ahead(a, 12U, &avail);
	if (probe == NULL || avail < 12) {
		return -1;
	}

	/* the version number decides whether we dare to wager */
	version = _warc_rdver(probe, avail);
	if (version == 0U || version > 10000U) {
		return -1;
	}

	/* looks like a record, be confident */
	return (64);
}
210
211static int
212_warc_rdhdr(struct archive_read *a, struct archive_entry *entry)
213{
214#define HDR_PROBE_LEN		(12U)
215	struct warc_s *w = a->format->data;
216	unsigned int ver;
217	const char *buf;
218	ssize_t nrd;
219	const char *eoh;
220	/* for the file name, saves some strndup()'ing */
221	warc_string_t fnam;
222	/* warc record type, not that we really use it a lot */
223	warc_type_t ftyp;
224	/* content-length+error monad */
225	ssize_t cntlen;
226	/* record time is the WARC-Date time we reinterpret it as ctime */
227	time_t rtime;
228	/* mtime is the Last-Modified time which will be the entry's mtime */
229	time_t mtime;
230
231start_over:
232	/* just use read_ahead() they keep track of unconsumed
233	 * bits and bobs for us; no need to put an extra shift in
234	 * and reproduce that functionality here */
235	buf = __archive_read_ahead(a, HDR_PROBE_LEN, &nrd);
236
237	if (nrd < 0) {
238		/* no good */
239		archive_set_error(
240			&a->archive, ARCHIVE_ERRNO_MISC,
241			"Bad record header");
242		return (ARCHIVE_FATAL);
243	} else if (buf == NULL) {
244		/* there should be room for at least WARC/bla\r\n
245		 * must be EOF therefore */
246		return (ARCHIVE_EOF);
247	}
248 	/* looks good so far, try and find the end of the header now */
249	eoh = _warc_find_eoh(buf, nrd);
250	if (eoh == NULL) {
251		/* still no good, the header end might be beyond the
252		 * probe we've requested, but then again who'd cram
253		 * so much stuff into the header *and* be 28500-compliant */
254		archive_set_error(
255			&a->archive, ARCHIVE_ERRNO_MISC,
256			"Bad record header");
257		return (ARCHIVE_FATAL);
258	} else if ((ver = _warc_rdver(buf, eoh - buf)) > 10000U) {
259		/* nawww, I wish they promised backward compatibility
260		 * anyhoo, in their infinite wisdom the 28500 guys might
261		 * come up with something we can't possibly handle so
262		 * best end things here */
263		archive_set_error(
264			&a->archive, ARCHIVE_ERRNO_MISC,
265			"Unsupported record version");
266		return (ARCHIVE_FATAL);
267	} else if ((cntlen = _warc_rdlen(buf, eoh - buf)) < 0) {
268		/* nightmare!  the specs say content-length is mandatory
269		 * so I don't feel overly bad stopping the reader here */
270		archive_set_error(
271			&a->archive, EINVAL,
272			"Bad content length");
273		return (ARCHIVE_FATAL);
274	} else if ((rtime = _warc_rdrtm(buf, eoh - buf)) == (time_t)-1) {
275		/* record time is mandatory as per WARC/1.0,
276		 * so just barf here, fast and loud */
277		archive_set_error(
278			&a->archive, EINVAL,
279			"Bad record time");
280		return (ARCHIVE_FATAL);
281	}
282
283	/* let the world know we're a WARC archive */
284	a->archive.archive_format = ARCHIVE_FORMAT_WARC;
285	if (ver != w->pver) {
286		/* stringify this entry's version */
287		archive_string_sprintf(&w->sver,
288			"WARC/%u.%u", ver / 10000, ver % 10000);
289		/* remember the version */
290		w->pver = ver;
291	}
292	/* start off with the type */
293	ftyp = _warc_rdtyp(buf, eoh - buf);
294	/* and let future calls know about the content */
295	w->cntlen = cntlen;
296	w->cntoff = 0U;
297	mtime = 0;/* Avoid compiling error on some platform. */
298
299	switch (ftyp) {
300	case WT_RSRC:
301	case WT_RSP:
302		/* only try and read the filename in the cases that are
303		 * guaranteed to have one */
304		fnam = _warc_rduri(buf, eoh - buf);
305		/* check the last character in the URI to avoid creating
306		 * directory endpoints as files, see Todo above */
307		if (fnam.len == 0 || fnam.str[fnam.len - 1] == '/') {
308			/* break here for now */
309			fnam.len = 0U;
310			fnam.str = NULL;
311			break;
312		}
313		/* bang to our string pool, so we save a
314		 * malloc()+free() roundtrip */
315		if (fnam.len + 1U > w->pool.len) {
316			w->pool.len = ((fnam.len + 64U) / 64U) * 64U;
317			w->pool.str = realloc(w->pool.str, w->pool.len);
318		}
319		memcpy(w->pool.str, fnam.str, fnam.len);
320		w->pool.str[fnam.len] = '\0';
321		/* let no one else know about the pool, it's a secret, shhh */
322		fnam.str = w->pool.str;
323
324		/* snarf mtime or deduce from rtime
325		 * this is a custom header added by our writer, it's quite
326		 * hard to believe anyone else would go through with it
327		 * (apart from being part of some http responses of course) */
328		if ((mtime = _warc_rdmtm(buf, eoh - buf)) == (time_t)-1) {
329			mtime = rtime;
330		}
331		break;
332	default:
333		fnam.len = 0U;
334		fnam.str = NULL;
335		break;
336	}
337
338	/* now eat some of those delicious buffer bits */
339	__archive_read_consume(a, eoh - buf);
340
341	switch (ftyp) {
342	case WT_RSRC:
343	case WT_RSP:
344		if (fnam.len > 0U) {
345			/* populate entry object */
346			archive_entry_set_filetype(entry, AE_IFREG);
347			archive_entry_copy_pathname(entry, fnam.str);
348			archive_entry_set_size(entry, cntlen);
349			archive_entry_set_perm(entry, 0644);
350			/* rtime is the new ctime, mtime stays mtime */
351			archive_entry_set_ctime(entry, rtime, 0L);
352			archive_entry_set_mtime(entry, mtime, 0L);
353			break;
354		}
355		/* FALLTHROUGH */
356	default:
357		/* consume the content and start over */
358		_warc_skip(a);
359		goto start_over;
360	}
361	return (ARCHIVE_OK);
362}
363
364static int
365_warc_read(struct archive_read *a, const void **buf, size_t *bsz, int64_t *off)
366{
367	struct warc_s *w = a->format->data;
368	const char *rab;
369	ssize_t nrd;
370
371	if (w->cntoff >= w->cntlen) {
372	eof:
373		/* it's our lucky day, no work, we can leave early */
374		*buf = NULL;
375		*bsz = 0U;
376		*off = w->cntoff + 4U/*for \r\n\r\n separator*/;
377		w->unconsumed = 0U;
378		return (ARCHIVE_EOF);
379	}
380
381	rab = __archive_read_ahead(a, 1U, &nrd);
382	if (nrd < 0) {
383		*bsz = 0U;
384		/* big catastrophe */
385		return (int)nrd;
386	} else if (nrd == 0) {
387		goto eof;
388	} else if ((size_t)nrd > w->cntlen - w->cntoff) {
389		/* clamp to content-length */
390		nrd = w->cntlen - w->cntoff;
391	}
392	*off = w->cntoff;
393	*bsz = nrd;
394	*buf = rab;
395
396	w->cntoff += nrd;
397	w->unconsumed = (size_t)nrd;
398	return (ARCHIVE_OK);
399}
400
401static int
402_warc_skip(struct archive_read *a)
403{
404	struct warc_s *w = a->format->data;
405
406	__archive_read_consume(a, w->cntlen + 4U/*\r\n\r\n separator*/);
407	w->cntlen = 0U;
408	w->cntoff = 0U;
409	return (ARCHIVE_OK);
410}
411
412
413/* private routines */
/*
 * Return C with its const qualifier removed, without a qualifier-
 * dropping cast.  The conversion goes through a union of the two
 * pointer types instead of pointer arithmetic on a made-up address,
 * which is not defined for pointers into different objects.
 */
static void*
deconst(const void *c)
{
	union {
		const void *ro;
		void *rw;
	} pun;

	pun.ro = c;
	return pun.rw;
}
419
/*
 * Locate the first occurrence of the NEEDLESIZE bytes at NEEDLE within
 * the HAYSIZE bytes at HAY, like memmem().  Returns a pointer to the
 * match, HAY itself for an empty needle, or NULL when there is none.
 *
 * The search keeps an XOR checksum of the current window and only
 * falls back to memcmp() when it equals the needle's checksum.
 */
static char*
xmemmem(const char *hay, const size_t haysize,
	const char *needle, const size_t needlesize)
{
	const char *const hay_end = hay + haysize;
	const char *const ndl_end = needle + needlesize;
	const char *h;
	const char *n;
	const char *win;
	unsigned int hay_x;
	unsigned int ndl_x;
	unsigned int same = 1U;

	/* a 0-sized needle is defined to be found anywhere in haystack */
	if (needlesize == 0UL) {
		return deconst(hay);
	}
	/* jump to the first spot that begins with *NEEDLE, if any */
	hay = memchr(hay, *needle, haysize);
	if (hay == NULL) {
		return NULL;
	}

	/* The first bytes agree, so both checksums start from that byte.
	 * Fold in the remaining bytes of the needle and of the first
	 * window, noting on the way whether the two are identical. */
	hay_x = *hay;
	ndl_x = *hay;
	h = hay + 1U;
	n = needle + 1U;
	while (h < hay_end && n < ndl_end) {
		hay_x ^= *h;
		ndl_x ^= *n;
		same &= *h == *n;
		h++;
		n++;
	}

	if (n < ndl_end) {
		/* ran out of haystack before the needle was covered */
		return NULL;
	}
	if (same) {
		/* the very first window is the match */
		return deconst(hay);
	}

	/* Slide the window one byte at a time: drop the byte leaving at
	 * the front, take in the byte H arriving at the back. */
	win = hay;
	while (h < hay_end) {
		hay_x ^= *win++;
		hay_x ^= *h;

		/* With equal checksums, NEEDLESIZE - 1 equal bytes imply
		 * that the last byte is equal too.  WIN is always < H, so
		 * the comparison stays inside the haystack. */
		if (hay_x == ndl_x &&
		    memcmp(win, needle, needlesize - 1U) == 0) {
			return deconst(win);
		}
		h++;
	}
	return NULL;
}
477
/*
 * Parse an unsigned decimal number at STR that is expected to lie in
 * [LLIM, ULIM].  At most as many digits are read as ULIM has (at least
 * two), and reading also stops once another digit would necessarily
 * exceed ULIM.  *EP is set to the first byte that was not read.
 *
 * Returns the number, -1 when STR does not start with a digit, or -2
 * when the number read is outside [LLIM, ULIM].
 */
static int
strtoi_lim(const char *str, const char **ep, int llim, int ulim)
{
	const char *cur = str;
	/* loses one decimal digit per digit read, 0 means no more digits */
	int budget = (ulim > 10) ? ulim : 10;
	int val = 0;

	while (val * 10 <= ulim && budget != 0 &&
	    *cur >= '0' && *cur <= '9') {
		val = val * 10 + (*cur - '0');
		cur++;
		budget /= 10;
	}

	*ep = cur;
	if (cur == str) {
		/* not a single digit */
		return -1;
	}
	if (val < llim || val > ulim) {
		/* out of range */
		return -2;
	}
	return val;
}
500
/*
 * Convert the broken-down UTC time T to seconds since the epoch.
 *
 * timegm() or _mkgmtime64() is used where the build configuration says
 * it exists.  Otherwise mktime() is called for its side effect of
 * filling in tm_yday (T is modified) and the result is computed by
 * hand.  That arithmetic is carried out in time_t: in plain int the
 * year term alone exceeds INT_MAX for dates from 2039 on.
 *
 * Returns (time_t)-1 when the fallback's mktime() call fails.
 */
static time_t
time_from_tm(struct tm *t)
{
#if HAVE_TIMEGM
        /* Use platform timegm() if available. */
        return (timegm(t));
#elif HAVE__MKGMTIME64
        return (_mkgmtime64(t));
#else
        /* Else use direct calculation using POSIX assumptions. */
        /* First, fix up tm_yday based on the year/month/day. */
        if (mktime(t) == (time_t)-1)
                return ((time_t)-1);
        /* Then we can compute timegm() from first principles:
         * 365-day years since 1970, plus one day per leap year passed
         * (every 4th, except centuries not divisible by 400). */
        return ((time_t)t->tm_sec
            + (time_t)t->tm_min * 60
            + (time_t)t->tm_hour * 3600
            + (time_t)t->tm_yday * 86400
            + (time_t)(t->tm_year - 70) * 31536000
            + (time_t)((t->tm_year - 69) / 4) * 86400
            - (time_t)((t->tm_year - 1) / 100) * 86400
            + (time_t)((t->tm_year + 299) / 400) * 86400);
#endif
}
525
/*
 * Read one numeric field of a time stamp at *SP, expected to lie in
 * [LLIM, ULIM] and to be followed by DELIM.  The value (or the negative
 * error code of strtoi_lim()) is stored in *OUT.  *SP is advanced past
 * the digits and, when the number was acceptable, one byte further
 * whether or not that byte is DELIM.  Returns non-zero when both the
 * number and the delimiter were fine.
 */
static int
xstrpisotime_field(const char **sp, int llim, int ulim, char delim, int *out)
{
	const char *cur = *sp;
	int delim_ok;

	*out = strtoi_lim(cur, &cur, llim, ulim);
	if (*out < 0) {
		*sp = cur;
		return 0;
	}
	delim_ok = (*cur == delim);
	*sp = cur + 1;
	return delim_ok;
}

/*
 * Like strptime() but strictly for ISO 8601 Zulu strings of the form
 * YYYY-MM-DDThh:mm:ssZ; leading whitespace is skipped.
 *
 * Returns the time stamp in seconds since the epoch, or (time_t)-1 when
 * S does not parse.  If ENDPTR is not NULL it receives the position at
 * which parsing stopped.
 */
static time_t
xstrpisotime(const char *s, char **endptr)
{
	struct tm tm;
	time_t res = (time_t)-1;

	/* make sure tm is clean */
	memset(&tm, 0, sizeof(tm));

	/* as a courtesy to our callers, and since this is a non-standard
	 * routine, we skip leading whitespace */
	for (; isspace((unsigned char)*s); ++s)
		;

	/* year, month, day-of-month, hour, minute, second, in that order;
	 * the first field that fails ends the parse */
	if (xstrpisotime_field(&s, 1583, 4095, '-', &tm.tm_year) &&
	    xstrpisotime_field(&s, 1, 12, '-', &tm.tm_mon) &&
	    xstrpisotime_field(&s, 1, 31, 'T', &tm.tm_mday) &&
	    xstrpisotime_field(&s, 0, 23, ':', &tm.tm_hour) &&
	    xstrpisotime_field(&s, 0, 59, ':', &tm.tm_min) &&
	    xstrpisotime_field(&s, 0, 60, 'Z', &tm.tm_sec)) {
		/* struct tm counts years from 1900 and months from 0 */
		tm.tm_year -= 1900;
		tm.tm_mon--;

		/* now convert our tm struct to a unix stamp using UTC */
		res = time_from_tm(&tm);
	}

	if (endptr != NULL) {
		*endptr = deconst(s);
	}
	return res;
}
579
/*
 * Read the version from a record's first line, "WARC/<major>.<minor>".
 *
 * BUF holds BSZ readable bytes and nothing beyond them is inspected.
 * Returns major * 10000 + minor * 100 (so 10000 for WARC/1.0 and 1800
 * for WARC/0.18), 99999 when BUF does not start with "WARC/", and
 * 999999 when the version is not a single major digit 0..8, a dot and
 * one or two minor digits.  Signs and whitespace in the minor part are
 * rejected, so "1.-5" cannot pass for a version older than 1.0.
 */
static unsigned int
_warc_rdver(const char buf[10], size_t bsz)
{
	static const char magic[] = "WARC/";
	const size_t mlen = sizeof(magic) - 1U;
	const char *const eob = buf + bsz;
	const char *p;
	unsigned int major;
	unsigned int minor = 0U;
	unsigned int ndigits = 0U;

	if (bsz < mlen || memcmp(buf, magic, mlen) != 0) {
		/* nope */
		return 99999U;
	}
	/* looks good so far, read the version number */
	p = buf + mlen;

	/* most common case gets a quick-check here */
	if ((size_t)(eob - p) >= 5U && memcmp(p, "1.0\r\n", 5U) == 0) {
		return 10000U;
	}

	/* we need a major digit, the dot and at least one more byte;
	 * a major version of 9 is declared unsupported outright */
	if ((size_t)(eob - p) < 3U ||
	    p[0U] < '0' || p[0U] > '8' || p[1U] != '.') {
		/* just make the version ridiculously high */
		return 999999U;
	}
	major = (unsigned int)(p[0U] - '0');

	/* minor version: plain digits only, two at most */
	for (p += 2U; p < eob && *p >= '0' && *p <= '9'; p++) {
		if (++ndigits > 2U) {
			return 999999U;
		}
		minor = minor * 10U + (unsigned int)(*p - '0');
	}
	if (ndigits == 0U) {
		return 999999U;
	}
	return major * 10000U + minor * 100U;
}
630
631static unsigned int
632_warc_rdtyp(const char *buf, size_t bsz)
633{
634	static const char _key[] = "\r\nWARC-Type:";
635	const char *const eob = buf + bsz;
636	const char *val;
637
638	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
639		/* no bother */
640		return WT_NONE;
641	}
642	/* overread whitespace */
643	val += sizeof(_key) - 1U;
644	while (val < eob && isspace((unsigned char)*val))
645		++val;
646
647	if (val + 8U > eob) {
648		;
649	} else if (memcmp(val, "resource", 8U) == 0) {
650		return WT_RSRC;
651	} else if (memcmp(val, "warcinfo", 8U) == 0) {
652		return WT_INFO;
653	} else if (memcmp(val, "metadata", 8U) == 0) {
654		return WT_META;
655	} else if (memcmp(val, "request", 7U) == 0) {
656		return WT_REQ;
657	} else if (memcmp(val, "response", 8U) == 0) {
658		return WT_RSP;
659	} else if (memcmp(val, "conversi", 8U) == 0) {
660		return WT_CONV;
661	} else if (memcmp(val, "continua", 8U) == 0) {
662		return WT_CONT;
663	}
664	return WT_NONE;
665}
666
667static warc_string_t
668_warc_rduri(const char *buf, size_t bsz)
669{
670	static const char _key[] = "\r\nWARC-Target-URI:";
671	const char *const eob = buf + bsz;
672	const char *val;
673	const char *uri;
674	const char *eol;
675	warc_string_t res = {0U, NULL};
676
677	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
678		/* no bother */
679		return res;
680	}
681	/* overread whitespace */
682	val += sizeof(_key) - 1U;
683	while (val < eob && isspace((unsigned char)*val))
684		++val;
685
686	/* overread URL designators */
687	if ((uri = xmemmem(val, eob - val, "://", 3U)) == NULL) {
688		/* not touching that! */
689		return res;
690	} else if ((eol = memchr(uri, '\n', eob - uri)) == NULL) {
691		/* no end of line? :O */
692		return res;
693	}
694
695	/* massage uri to point to after :// */
696	uri += 3U;
697	/* also massage eol to point to the first whitespace
698	 * after the last non-whitespace character before
699	 * the end of the line */
700	while (eol > uri && isspace((unsigned char)eol[-1]))
701		--eol;
702
703	/* now then, inspect the URI */
704	if (memcmp(val, "file", 4U) == 0) {
705		/* perfect, nothing left to do here */
706
707	} else if (memcmp(val, "http", 4U) == 0 ||
708		   memcmp(val, "ftp", 3U) == 0) {
709		/* overread domain, and the first / */
710		while (uri < eol && *uri++ != '/');
711	} else {
712		/* not sure what to do? best to bugger off */
713		return res;
714	}
715	res.str = uri;
716	res.len = eol - uri;
717	return res;
718}
719
/*
 * Read the Content-Length header within the BSZ bytes of header at BUF.
 *
 * The value has to consist of at least one decimal digit, optionally
 * preceded by whitespace, and has to be followed by whitespace.
 * Returns the length, or -1 when the header is missing, has no digits,
 * carries a sign or trailing junk, or does not fit the return type.
 * Nothing beyond BUF + BSZ is inspected.
 */
static ssize_t
_warc_rdlen(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nContent-Length:";
	/* largest value the signed return type can carry, assuming
	 * ssize_t is as wide as size_t */
	const size_t maxlen = ((size_t)-1) / 2U;
	const char *const eob = buf + bsz;
	const char *val;
	size_t len = 0U;

	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
		/* no bother */
		return -1;
	}

	/* overread whitespace */
	val += sizeof(_key) - 1U;
	while (val < eob && isspace((unsigned char)*val))
		++val;

	/* an empty value is not a length of 0, it is an error */
	if (val >= eob || *val < '0' || *val > '9') {
		return -1;
	}
	for (; val < eob && *val >= '0' && *val <= '9'; ++val) {
		const size_t digit = (size_t)(*val - '0');

		if (len > (maxlen - digit) / 10U) {
			/* would overflow, surely not a genuine length */
			return -1;
		}
		len = len * 10U + digit;
	}
	if (val >= eob || !isspace((unsigned char)*val)) {
		/* hm, can we trust that number?  Best not. */
		return -1;
	}
	return (ssize_t)len;
}
742
/*
 * Read the record time from the WARC-Date header within the BSZ bytes
 * of header at BUF.  Returns the time stamp, or (time_t)-1 when the
 * header is missing, is not an ISO 8601 Zulu time, or is not followed
 * by whitespace.
 */
static time_t
_warc_rdrtm(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nWARC-Date:";
	const char *hit;
	char *end = NULL;
	time_t stamp;

	hit = xmemmem(buf, bsz, _key, sizeof(_key) - 1U);
	if (hit == NULL) {
		/* no such header */
		return (time_t)-1;
	}

	/* xstrpisotime() skips the whitespace after the colon itself */
	stamp = xstrpisotime(hit + (sizeof(_key) - 1U), &end);
	if (end != NULL && isspace((unsigned char)*end)) {
		return stamp;
	}
	/* something else follows the time stamp, best not to trust it */
	return (time_t)-1;
}
765
/*
 * Read the modification time from the Last-Modified header within the
 * BSZ bytes of header at BUF.  Returns the time stamp, or (time_t)-1
 * when the header is missing, is not an ISO 8601 Zulu time, or is not
 * followed by whitespace.
 */
static time_t
_warc_rdmtm(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nLast-Modified:";
	const char *hit;
	char *end = NULL;
	time_t stamp;

	hit = xmemmem(buf, bsz, _key, sizeof(_key) - 1U);
	if (hit == NULL) {
		/* no such header */
		return (time_t)-1;
	}

	/* xstrpisotime() skips the whitespace after the colon itself */
	stamp = xstrpisotime(hit + (sizeof(_key) - 1U), &end);
	if (end != NULL && isspace((unsigned char)*end)) {
		return stamp;
	}
	/* something else follows the time stamp, best not to trust it */
	return (time_t)-1;
}
788
/*
 * Find the end of the record header within the BSZ bytes at BUF.
 * Returns a pointer to the first byte after the blank line that
 * terminates the header, or NULL when there is no such line.
 */
static const char*
_warc_find_eoh(const char *buf, size_t bsz)
{
	static const char _marker[] = "\r\n\r\n";
	const size_t mlen = sizeof(_marker) - 1U;
	const char *hit;

	hit = xmemmem(buf, bsz, _marker, mlen);
	if (hit == NULL) {
		return NULL;
	}
	/* the marker still belongs to the header */
	return hit + mlen;
}
800
801/* archive_read_support_format_warc.c ends here */
802