1#!/usr/bin/env python
2#===- gen_std.py -  ------------------------------------------*- python -*--===#
3#
4# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5# See https://llvm.org/LICENSE.txt for license information.
6# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7#
8#===------------------------------------------------------------------------===#
9
10"""gen_std.py is a tool to generate a lookup table (from qualified names to
11include headers) for C/C++ Standard Library symbols by parsing archived HTML
12files from cppreference.
13
14The generated files are located in clang/include/Tooling/Inclusions.
15
16Caveats and FIXMEs:
  - only symbols directly in "std" and in a fixed set of its sub-namespaces
    (chrono, filesystem, pmr, regex_constants, this_thread) are added; std
    literal operators and std::placeholders symbols are not indexed yet.
19  - symbols with multiple variants or defined in multiple headers aren't added,
20    e.g. std::move, std::swap
21
22Usage:
  1. Install the BeautifulSoup dependency; see the instructions at:
24       https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-beautiful-soup
25  2. Download cppreference offline HTML files (e.g. html_book_20181028.zip) at
26       https://en.cppreference.com/w/Cppreference:Archives
27  3. Unzip the zip file from step 2 (e.g., to a "cppreference" directory). You should
28     get a "cppreference/reference" directory.
29  4. Run the command:
30       // Generate C++ symbols
31       python3 gen_std.py -cppreference cppreference/reference -symbols=cpp > StdSymbolMap.inc
32       // Generate C++ removed symbols
33       python3 gen_std.py -cppreference cppreference/reference -symbols=cpp_removed > RemovedSymbolMap.inc
34       // Generate C symbols
35       python3 gen_std.py -cppreference cppreference/reference -symbols=c > CSymbolMap.inc
36"""
37
38
39import cppreference_parser
40import argparse
41import datetime
42import os
43import sys
44
# Header comment emitted at the top of every generated .inc file.
# It takes two %-format arguments, in order:
#   1. the upper-cased value of the -symbols option (e.g. "CPP", "C"),
#   2. the modification date of the cppreference index page, as YYYY-MM-DD.
CODE_PREFIX = """\
//===-- gen_std.py generated file -------------------------------*- C++ -*-===//
//
// Used to build a lookup table (qualified names => include headers) for %s
// Standard Library symbols.
//
// This file was generated automatically by
// clang/tools/include-mapping/gen_std.py, DO NOT EDIT!
//
// Generated from cppreference offline HTML book (modified on %s).
//===----------------------------------------------------------------------===//
"""
57
def ParseArg():
  """Parse the command-line arguments of gen_std.py from sys.argv.

  Returns:
    The argparse.Namespace returned by parse_args(), with two attributes:
      cppreference: path to the cppreference offline HTML directory.
      symbols: the symbol set to generate; one of 'cpp', 'c', 'cpp_removed'.

  Both options are required. If one is missing, or -symbols is given an
  unsupported value, argparse prints a usage message to stderr and exits.
  """
  parser = argparse.ArgumentParser(description='Generate StdGen file')
  # No defaults are given: both options are required, so a default value
  # could never take effect.
  parser.add_argument('-cppreference', metavar='PATH',
                      help='path to the cppreference offline HTML directory',
                      required=True)
  # main() only knows how to handle these three values, so reject anything
  # else here with a clear usage error instead of failing later.
  parser.add_argument('-symbols',
                      choices=['cpp', 'c', 'cpp_removed'],
                      help='Generate c or cpp (removed) symbols. One of {cpp, c, cpp_removed}.',
                      required=True)
  return parser.parse_args()
70
71
def main():
  """Generate a symbol => header lookup table and print it to stdout.

  Reads the command-line options via ParseArg(), selects the cppreference
  index pages that correspond to the requested symbol set, hands them to
  cppreference_parser.GetSymbols() and prints:
    - the CODE_PREFIX header comment, followed by
    - one "SYMBOL(name, namespace, header)" line for every symbol that has
      exactly one header.
  Symbols with no header or with several headers are not emitted; a
  diagnostic for each of them is written to stderr instead.

  Exits with an error message if the -symbols value is not supported or if
  the expected cppreference directory does not exist.
  """
  args = ParseArg()
  # Each parse_pages entry is (directory, index page file name, namespace
  # prefix); the C index uses None as it has no namespace.
  if args.symbols == 'cpp':
    page_root = os.path.join(args.cppreference, "en", "cpp")
    symbol_index_root = os.path.join(page_root, "symbol_index")
    parse_pages =  [
      (page_root, "symbol_index.html", "std::"),
      # std sub-namespace symbols have separated pages.
      # We don't index std literal operators (e.g.
      # std::literals::chrono_literals::operator""d), these symbols can't be
      # accessed by std::<symbol_name>.
      # FIXME: index std::placeholders symbols, placeholders.html page is
      # different (which contains one entry for _1, _2, ..., _N), we need special
      # handling.
      (symbol_index_root, "chrono.html", "std::chrono::"),
      (symbol_index_root, "filesystem.html", "std::filesystem::"),
      (symbol_index_root, "pmr.html", "std::pmr::"),
      (symbol_index_root, "regex_constants.html", "std::regex_constants::"),
      (symbol_index_root, "this_thread.html", "std::this_thread::"),
    ]
  elif args.symbols == 'cpp_removed':
    page_root = os.path.join(args.cppreference, "en", "cpp")
    symbol_index_root = os.path.join(page_root, "symbol_index")
    parse_pages = [(symbol_index_root, "zombie_names.html", "std::")]
  elif args.symbols == 'c':
    page_root = os.path.join(args.cppreference, "en", "c")
    symbol_index_root = page_root
    parse_pages = [(page_root, "index.html", None)]
  else:
    # Without this branch the variables above would be unbound and the code
    # below would fail with an UnboundLocalError.
    sys.exit("Unknown -symbols value '%s', expected one of: cpp, c, cpp_removed"
             % args.symbols)

  if not os.path.exists(symbol_index_root):
    sys.exit("Path %s doesn't exist!" % symbol_index_root)

  symbols = cppreference_parser.GetSymbols(parse_pages)

  # We don't have version information from the unzipped offline HTML files,
  # so we use the modification time of the index.html page under page_root
  # as the version.
  index_page_path = os.path.join(page_root, "index.html")
  cppreference_modified_date = datetime.datetime.fromtimestamp(
    os.stat(index_page_path).st_mtime).strftime('%Y-%m-%d')
  print(CODE_PREFIX % (args.symbols.upper(), cppreference_modified_date))
  for symbol in symbols:
    if len(symbol.headers) == 1:
      # SYMBOL(unqualified_name, namespace, header)
      print("SYMBOL(%s, %s, %s)" % (symbol.name, symbol.namespace,
                                    symbol.headers[0]))
    elif len(symbol.headers) == 0:
      sys.stderr.write("No header found for symbol %s\n" % symbol.name)
    else:
      # FIXME: support symbols with multiple headers (e.g. std::move).
      sys.stderr.write("Ambiguous header for symbol %s: %s\n" % (
          symbol.name, ', '.join(symbol.headers)))
123
124
# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
  main()
127