archive_write_set_format_warc.c revision 313570
1/*-
2 * Copyright (c) 2014 Sebastian Freundt
3 * Author: Sebastian Freundt  <devel@fresse.org>
4 *
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "archive_platform.h"
29__FBSDID("$FreeBSD: stable/11/contrib/libarchive/libarchive/archive_write_set_format_warc.c 313570 2017-02-11 00:54:16Z mm $");
30
31#ifdef HAVE_ERRNO_H
32#include <errno.h>
33#endif
34#include <stdio.h>
35#ifdef HAVE_STDLIB_H
36#include <stdlib.h>
37#endif
38#ifdef HAVE_STRING_H
39#include <string.h>
40#endif
41#ifdef HAVE_TIME_H
42#include <time.h>
43#endif
44
45#include "archive.h"
46#include "archive_entry.h"
47#include "archive_entry_locale.h"
48#include "archive_private.h"
49#include "archive_random_private.h"
50#include "archive_write_private.h"
51
/* Private state of the WARC writer, kept in archive_write::format_data. */
struct warc_s {
	/* non-zero once the warcinfo record has been emitted or was
	 * disabled through the "omit-warcinfo" option */
	unsigned int omit_warcinfo:1;

	/* wall-clock time sampled when the format was registered */
	time_t now;
	/* file type of the entry in progress, 0 when there is none */
	mode_t typ;
	/* seeded from `now' when the format is registered */
	unsigned int rng;
	/* content length announced in the header of the current entry */
	uint64_t populz;
};
61
62static const char warcinfo[] =
63    "software: libarchive/" ARCHIVE_VERSION_ONLY_STRING "\r\n"
64    "format: WARC file version 1.0\r\n";
65
/*
 * WARC record types.  Only the types up to and including WT_RSRC can be
 * serialised by _popul_ehdr(); the remaining ones are placeholders.
 */
typedef enum {
	WT_NONE = 0,
	WT_INFO,	/* warcinfo */
	WT_META,	/* metadata */
	WT_RSRC,	/* resource */
	WT_REQ,		/* request, unsupported */
	WT_RSP,		/* response, unsupported */
	WT_RVIS,	/* revisit, unsupported */
	WT_CONV,	/* conversion, unsupported */
	WT_CONT,	/* continuation, unsupported at the moment */
	LAST_WT		/* invalid type, doubles as the number of types */
} warc_type_t;
87
88typedef struct {
89	warc_type_t type;
90	const char *tgturi;
91	const char *recid;
92	time_t rtime;
93	time_t mtime;
94	const char *cnttyp;
95	uint64_t cntlen;
96} warc_essential_hdr_t;
97
/* Storage for a UUID as four unsigned words; see _gen_uuid(). */
typedef struct {
	unsigned int u[4];
} warc_uuid_t;
101
102static int _warc_options(struct archive_write*, const char *key, const char *v);
103static int _warc_header(struct archive_write *a, struct archive_entry *entry);
104static ssize_t _warc_data(struct archive_write *a, const void *buf, size_t sz);
105static int _warc_finish_entry(struct archive_write *a);
106static int _warc_close(struct archive_write *a);
107static int _warc_free(struct archive_write *a);
108
109/* private routines */
110static ssize_t _popul_ehdr(struct archive_string *t, size_t z, warc_essential_hdr_t);
111static int _gen_uuid(warc_uuid_t *tgt);
112
113
114/*
115 * Set output format to ISO 28500 (aka WARC) format.
116 */
117int
118archive_write_set_format_warc(struct archive *_a)
119{
120	struct archive_write *a = (struct archive_write *)_a;
121	struct warc_s *w;
122
123	archive_check_magic(_a, ARCHIVE_WRITE_MAGIC,
124	    ARCHIVE_STATE_NEW, "archive_write_set_format_warc");
125
126	/* If another format was already registered, unregister it. */
127	if (a->format_free != NULL) {
128		(a->format_free)(a);
129	}
130
131	w = malloc(sizeof(*w));
132	if (w == NULL) {
133		archive_set_error(&a->archive, ENOMEM,
134		    "Can't allocate warc data");
135		return (ARCHIVE_FATAL);
136	}
137	/* by default we're emitting a file wide header */
138	w->omit_warcinfo = 0U;
139	/* obtain current time for date fields */
140	w->now = time(NULL);
141	/* reset file type info */
142	w->typ = 0;
143	/* also initialise our rng */
144	w->rng = (unsigned int)w->now;
145
146	a->format_data = w;
147	a->format_name = "WARC/1.0";
148	a->format_options = _warc_options;
149	a->format_write_header = _warc_header;
150	a->format_write_data = _warc_data;
151	a->format_close = _warc_close;
152	a->format_free = _warc_free;
153	a->format_finish_entry = _warc_finish_entry;
154	a->archive.archive_format = ARCHIVE_FORMAT_WARC;
155	a->archive.archive_format_name = "WARC/1.0";
156	return (ARCHIVE_OK);
157}
158
159
160/* archive methods */
161static int
162_warc_options(struct archive_write *a, const char *key, const char *val)
163{
164	struct warc_s *w = a->format_data;
165
166	if (strcmp(key, "omit-warcinfo") == 0) {
167		if (val == NULL || strcmp(val, "true") == 0) {
168			/* great */
169			w->omit_warcinfo = 1U;
170			return (ARCHIVE_OK);
171		}
172	}
173
174	/* Note: The "warn" return is just to inform the options
175	 * supervisor that we didn't handle it.  It will generate
176	 * a suitable error if no one used this option. */
177	return (ARCHIVE_WARN);
178}
179
180static int
181_warc_header(struct archive_write *a, struct archive_entry *entry)
182{
183	struct warc_s *w = a->format_data;
184	struct archive_string hdr;
185#define MAX_HDR_SIZE 512
186
187	/* check whether warcinfo record needs outputting */
188	if (!w->omit_warcinfo) {
189		ssize_t r;
190		warc_essential_hdr_t wi = {
191			WT_INFO,
192			/*uri*/NULL,
193			/*urn*/NULL,
194			/*rtm*/0,
195			/*mtm*/0,
196			/*cty*/"application/warc-fields",
197			/*len*/sizeof(warcinfo) - 1U,
198		};
199		wi.rtime = w->now;
200		wi.mtime = w->now;
201
202		archive_string_init(&hdr);
203		r = _popul_ehdr(&hdr, MAX_HDR_SIZE, wi);
204		if (r >= 0) {
205			/* jackpot! */
206			/* now also use HDR buffer for the actual warcinfo */
207			archive_strncat(&hdr, warcinfo, sizeof(warcinfo) -1);
208
209			/* append end-of-record indicator */
210			archive_strncat(&hdr, "\r\n\r\n", 4);
211
212			/* write to output stream */
213			__archive_write_output(a, hdr.s, archive_strlen(&hdr));
214		}
215		/* indicate we're done with file header writing */
216		w->omit_warcinfo = 1U;
217		archive_string_free(&hdr);
218	}
219
220	if (archive_entry_pathname(entry) == NULL) {
221		archive_set_error(&a->archive, EINVAL,
222		    "Invalid filename");
223		return (ARCHIVE_WARN);
224	}
225
226	w->typ = archive_entry_filetype(entry);
227	w->populz = 0U;
228	if (w->typ == AE_IFREG) {
229		warc_essential_hdr_t rh = {
230			WT_RSRC,
231			/*uri*/NULL,
232			/*urn*/NULL,
233			/*rtm*/0,
234			/*mtm*/0,
235			/*cty*/NULL,
236			/*len*/0,
237		};
238		ssize_t r;
239		rh.tgturi = archive_entry_pathname(entry);
240		rh.rtime = w->now;
241		rh.mtime = archive_entry_mtime(entry);
242		rh.cntlen = (size_t)archive_entry_size(entry);
243
244		archive_string_init(&hdr);
245		r = _popul_ehdr(&hdr, MAX_HDR_SIZE, rh);
246		if (r < 0) {
247			/* don't bother */
248			archive_set_error(
249				&a->archive,
250				ARCHIVE_ERRNO_FILE_FORMAT,
251				"cannot archive file");
252			return (ARCHIVE_WARN);
253		}
254		/* otherwise append to output stream */
255		__archive_write_output(a, hdr.s, r);
256		/* and let subsequent calls to _data() know about the size */
257		w->populz = rh.cntlen;
258		archive_string_free(&hdr);
259		return (ARCHIVE_OK);
260	}
261	/* just resort to erroring as per Tim's advice */
262	archive_set_error(
263		&a->archive,
264		ARCHIVE_ERRNO_FILE_FORMAT,
265		"WARC can only process regular files");
266	return (ARCHIVE_FAILED);
267}
268
269static ssize_t
270_warc_data(struct archive_write *a, const void *buf, size_t len)
271{
272	struct warc_s *w = a->format_data;
273
274	if (w->typ == AE_IFREG) {
275		int rc;
276
277		/* never write more bytes than announced */
278		if (len > w->populz) {
279			len = (size_t)w->populz;
280		}
281
282		/* now then, out we put the whole shebang */
283		rc = __archive_write_output(a, buf, len);
284		if (rc != ARCHIVE_OK) {
285			return rc;
286		}
287	}
288	return len;
289}
290
291static int
292_warc_finish_entry(struct archive_write *a)
293{
294	static const char _eor[] = "\r\n\r\n";
295	struct warc_s *w = a->format_data;
296
297	if (w->typ == AE_IFREG) {
298		int rc = __archive_write_output(a, _eor, sizeof(_eor) - 1U);
299
300		if (rc != ARCHIVE_OK) {
301			return rc;
302		}
303	}
304	/* reset type info */
305	w->typ = 0;
306	return (ARCHIVE_OK);
307}
308
309static int
310_warc_close(struct archive_write *a)
311{
312	(void)a; /* UNUSED */
313	return (ARCHIVE_OK);
314}
315
316static int
317_warc_free(struct archive_write *a)
318{
319	struct warc_s *w = a->format_data;
320
321	free(w);
322	a->format_data = NULL;
323	return (ARCHIVE_OK);
324}
325
326
327/* private routines */
/*
 * Like strftime(3), but takes a time_t, interprets it as UTC and appends
 * the formatted text to AS.
 *
 * Nothing is appended when T cannot be broken down into calendar time.
 * The formatted text is limited by the size of an internal 100 byte
 * buffer.
 */
static void
xstrftime(struct archive_string *as, const char *fmt, time_t t)
{
	struct tm *rt;
#if defined(HAVE_GMTIME_R) || defined(HAVE__GMTIME64_S)
	struct tm timeHere;
#endif
#if !defined(HAVE_GMTIME_R) && defined(HAVE__GMTIME64_S)
	__time64_t tmp_t;
#endif
	char strtime[100];
	size_t len;

#ifdef HAVE_GMTIME_R
	rt = gmtime_r(&t, &timeHere);
#elif defined(HAVE__GMTIME64_S)
	/* _gmtime64_s() reports success as zero and hands back nothing
	 * else, so point rt at the result ourselves */
	tmp_t = t;
	rt = (_gmtime64_s(&timeHere, &tmp_t) == 0) ? &timeHere : NULL;
#else
	rt = gmtime(&t);
#endif
	if (rt == NULL)
		return;

	/* leave the hard yacker to our role model strftime() */
	len = strftime(strtime, sizeof(strtime)-1, fmt, rt);
	archive_strncat(as, strtime, len);
}
352
353static ssize_t
354_popul_ehdr(struct archive_string *tgt, size_t tsz, warc_essential_hdr_t hdr)
355{
356	static const char _ver[] = "WARC/1.0\r\n";
357	static const char *_typ[LAST_WT] = {
358		NULL, "warcinfo", "metadata", "resource", NULL
359	};
360	char std_uuid[48U];
361
362	if (hdr.type == WT_NONE || hdr.type > WT_RSRC) {
363		/* brilliant, how exactly did we get here? */
364		return -1;
365	}
366
367	archive_strcpy(tgt, _ver);
368
369	archive_string_sprintf(tgt, "WARC-Type: %s\r\n", _typ[hdr.type]);
370
371	if (hdr.tgturi != NULL) {
372		/* check if there's a xyz:// */
373		static const char _uri[] = "";
374		static const char _fil[] = "file://";
375		const char *u;
376		char *chk = strchr(hdr.tgturi, ':');
377
378		if (chk != NULL && chk[1U] == '/' && chk[2U] == '/') {
379			/* yep, it's definitely a URI */
380			u = _uri;
381		} else {
382			/* hm, best to prepend file:// then */
383			u = _fil;
384		}
385		archive_string_sprintf(tgt,
386			"WARC-Target-URI: %s%s\r\n", u, hdr.tgturi);
387	}
388
389	/* record time is usually when the http is sent off,
390	 * just treat the archive writing as such for a moment */
391	xstrftime(tgt, "WARC-Date: %Y-%m-%dT%H:%M:%SZ\r\n", hdr.rtime);
392
393	/* while we're at it, record the mtime */
394	xstrftime(tgt, "Last-Modified: %Y-%m-%dT%H:%M:%SZ\r\n", hdr.mtime);
395
396	if (hdr.recid == NULL) {
397		/* generate one, grrrr */
398		warc_uuid_t u;
399
400		_gen_uuid(&u);
401		/* Unfortunately, archive_string_sprintf does not
402		 * handle the minimum number following '%'.
403		 * So we have to use snprintf function here instead
404		 * of archive_string_snprintf function. */
405#if defined(_WIN32) && !defined(__CYGWIN__) && !( defined(_MSC_VER) && _MSC_VER >= 1900)
406#define snprintf _snprintf
407#endif
408		snprintf(
409			std_uuid, sizeof(std_uuid),
410			"<urn:uuid:%08x-%04x-%04x-%04x-%04x%08x>",
411			u.u[0U],
412			u.u[1U] >> 16U, u.u[1U] & 0xffffU,
413			u.u[2U] >> 16U, u.u[2U] & 0xffffU,
414			u.u[3U]);
415		hdr.recid = std_uuid;
416	}
417
418	/* record-id is mandatory, fingers crossed we won't fail */
419	archive_string_sprintf(tgt, "WARC-Record-ID: %s\r\n", hdr.recid);
420
421	if (hdr.cnttyp != NULL) {
422		archive_string_sprintf(tgt, "Content-Type: %s\r\n", hdr.cnttyp);
423	}
424
425	/* next one is mandatory */
426	archive_string_sprintf(tgt, "Content-Length: %ju\r\n", (uintmax_t)hdr.cntlen);
427	/**/
428	archive_strncat(tgt, "\r\n", 2);
429
430	return (archive_strlen(tgt) >= tsz)? -1: (ssize_t)archive_strlen(tgt);
431}
432
433static int
434_gen_uuid(warc_uuid_t *tgt)
435{
436	archive_random(tgt->u, sizeof(tgt->u));
437	/* obey uuid version 4 rules */
438	tgt->u[1U] &= 0xffff0fffU;
439	tgt->u[1U] |= 0x4000U;
440	tgt->u[2U] &= 0x3fffffffU;
441	tgt->u[2U] |= 0x80000000U;
442	return 0;
443}
444
445/* archive_write_set_format_warc.c ends here */
446