1/* Collect URLs from CSS source.
2   Copyright (C) 1998, 2000, 2001, 2002, 2003, 2009 Free Software
3   Foundation, Inc.
4
5This file is part of GNU Wget.
6
7GNU Wget is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 3 of the License, or (at
10your option) any later version.
11
12GNU Wget is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15GNU General Public License for more details.
16
17You should have received a copy of the GNU General Public License
18along with Wget.  If not, see <http://www.gnu.org/licenses/>.
19
20Additional permission under GNU GPL version 3 section 7
21
22If you modify this program, or any covered work, by linking or
23combining it with the OpenSSL project's OpenSSL library (or a
24modified version of that library), containing parts covered by the
25terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26grants you additional permission to convey the resulting work.
27Corresponding Source for a non-source form of such a combination
28shall include the source code for the parts of OpenSSL used as well
29as that of the covered work.  */
30
31/*
32  Note that this is not an actual CSS parser, but just a lexical
33  scanner with a tiny bit more smarts bolted on top.  A full parser
34  is somewhat overkill for this job.  The only things we're interested
35  in are @import rules and url() tokens, so it's easy enough to
36  grab those without truly understanding the input.  The only downside
37  to this is that we might be coerced into downloading files that
38  a browser would ignore.  That might merit some more investigation.
39 */
40
41#include <wget.h>
42
43#include <stdio.h>
44#ifdef HAVE_STRING_H
45# include <string.h>
46#else
47# include <strings.h>
48#endif
49#include <stdlib.h>
50#include <ctype.h>
51#include <errno.h>
52
53#include "wget.h"
54#include "utils.h"
55#include "convert.h"
56#include "html-url.h"
57#include "css-tokens.h"
58
59/* from lex.yy.c */
60extern char *yytext;
61extern int yyleng;
62typedef struct yy_buffer_state *YY_BUFFER_STATE;
63extern YY_BUFFER_STATE yy_scan_bytes (const char *bytes,int len  );
64extern int yylex (void);
65
#if 1
/* Printable names for the scanner's tokens, kept for debugging
   output.  The table is meant to be indexed by the token code that
   yylex returns, so the order of the entries matters (presumably it
   mirrors the codes in css-tokens.h -- confirm before reordering). */
const char *token_names[] = {
  /* end of input, whitespace, SGML comment delimiters */
  "CSSEOF", "S", "CDO", "CDC",
  /* selector and punctuation tokens */
  "INCLUDES", "DASHMATCH", "LBRACE", "PLUS", "GREATER", "COMMA",
  /* strings and names */
  "STRING", "INVALID", "IDENT", "HASH",
  /* at-keywords and !important */
  "IMPORT_SYM", "PAGE_SYM", "MEDIA_SYM", "CHARSET_SYM", "IMPORTANT_SYM",
  /* numeric tokens */
  "EMS", "EXS", "LENGTH", "ANGLE", "TIME", "FREQ",
  "DIMENSION", "PERCENTAGE", "NUMBER",
  /* url(...) and other functions */
  "URI", "FUNCTION"
};
#endif
100
/*
  Given a detected URI token, get only the URI specified within.
  Also adjust the starting position and length of the string.

  AT is the buffer holding the token, *POS is the offset within AT of
  the token's first character (the "u" of "url(") and *LENGTH is the
  length of the whole token, including "url(" and the closing ")".

  A URI can be specified with or without quotes, and the quotes
  can be single or double quotes.  In addition there can be
  whitespace after the opening parenthesis and before the closing
  parenthesis.

  Returns a newly allocated, NUL-terminated copy of the bare URI and
  updates *POS and *LENGTH to describe where the bare URI sits in AT.
  A token with nothing inside it ("url()", "url( )", "url('')")
  yields an empty string with *LENGTH set to 0.

  Returns NULL, leaving *POS and *LENGTH untouched, if the token is
  too short to be "url()" or does not start with "url(".
*/
char *
get_uri_string (const char *at, int *pos, int *length)
{
  const char *start;
  int len;
  char *uri;

  /* The shortest well-formed token is "url()". */
  if (*length < 5)
    return NULL;

  if (0 != strncasecmp (at + *pos, "url(", 4))
    return NULL;

  start = at + *pos + 4;
  len = *length - 5; /* url() */

  /* Skip leading space.  The casts keep bytes >= 0x80 from being
     passed to isspace as negative values where char is signed. */
  while (len > 0 && isspace ((unsigned char) *start))
    {
      start++;
      len--;
    }
  /* Skip trailing space.  Bounding on LEN stops an all-whitespace
     URI from walking backwards past "url(" and going negative. */
  while (len > 0 && isspace ((unsigned char) start[len - 1]))
    len--;

  /* Trim off quotes, but only if there is room for a pair. */
  if (len >= 2 && (*start == '\'' || *start == '"'))
    {
      start++;
      len -= 2;
    }

  *pos = (int) (start - at);
  *length = len;

  uri = xmalloc (len + 1);
  if (uri)
    {
      memcpy (uri, start, len);
      uri[len] = '\0';
    }

  return uri;
}
151
152void
153get_urls_css (struct map_context *ctx, int offset, int buf_length)
154{
155  int token;
156  /*char tmp[2048];*/
157  int buffer_pos = 0;
158  int pos, length;
159  char *uri;
160
161  /*
162  strncpy(tmp,ctx->text + offset, buf_length);
163  tmp[buf_length] = '\0';
164  DEBUGP (("get_urls_css: \"%s\"\n", tmp));
165  */
166
167  /* tell flex to scan from this buffer */
168  yy_scan_bytes (ctx->text + offset, buf_length);
169
170  while((token = yylex()) != CSSEOF)
171    {
172      /*DEBUGP (("%s ", token_names[token]));*/
173      /* @import "foo.css"
174         or @import url(foo.css)
175      */
176      if(token == IMPORT_SYM)
177        {
178          do {
179            buffer_pos += yyleng;
180          } while((token = yylex()) == S);
181
182          /*DEBUGP (("%s ", token_names[token]));*/
183
184          if (token == STRING || token == URI)
185            {
186              /*DEBUGP (("Got URI "));*/
187              pos = buffer_pos + offset;
188              length = yyleng;
189
190              if (token == URI)
191                {
192                  uri = get_uri_string (ctx->text, &pos, &length);
193                }
194              else
195                {
196                  /* cut out quote characters */
197                  pos++;
198                  length -= 2;
199                  uri = xmalloc (length + 1);
200                  strncpy (uri, yytext + 1, length);
201                  uri[length] = '\0';
202                }
203
204              if (uri)
205                {
206                  struct urlpos *up = append_url (uri, pos, length, ctx);
207                  DEBUGP (("Found @import: [%s] at %d [%s]\n", yytext, buffer_pos, uri));
208
209                  if (up)
210                    {
211                      up->link_inline_p = 1;
212                      up->link_css_p = 1;
213                      up->link_expect_css = 1;
214                    }
215
216                  xfree(uri);
217                }
218            }
219        }
220      /* background-image: url(foo.png)
221         note that we don't care what
222         property this is actually on.
223      */
224      else if(token == URI)
225        {
226          pos = buffer_pos + offset;
227          length = yyleng;
228          uri = get_uri_string (ctx->text, &pos, &length);
229
230          if (uri)
231            {
232              struct urlpos *up = append_url (uri, pos, length, ctx);
233              DEBUGP (("Found URI: [%s] at %d [%s]\n", yytext, buffer_pos, uri));
234              if (up)
235                {
236                  up->link_inline_p = 1;
237                  up->link_css_p = 1;
238                }
239
240              xfree (uri);
241            }
242        }
243      buffer_pos += yyleng;
244    }
245  DEBUGP (("\n"));
246}
247
248struct urlpos *
249get_urls_css_file (const char *file, const char *url)
250{
251  struct file_memory *fm;
252  struct map_context ctx;
253
254  /* Load the file. */
255  fm = read_file (file);
256  if (!fm)
257    {
258      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
259      return NULL;
260    }
261  DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
262
263  ctx.text = fm->content;
264  ctx.head = ctx.tail = NULL;
265  ctx.base = NULL;
266  ctx.parent_base = url ? url : opt.base_href;
267  ctx.document_file = file;
268  ctx.nofollow = 0;
269
270  get_urls_css (&ctx, 0, fm->length);
271  read_file_free (fm);
272  return ctx.head;
273}
274