meta2deps.py revision 319884
1#!/usr/bin/env python
2
3from __future__ import print_function
4
5"""
6This script parses each "meta" file and extracts the
7information needed to deduce build and src dependencies.
8
9It works much the same as the original shell script, but is
10*much* more efficient.
11
12The parsing work is handled by the class MetaFile.
13We only pay attention to a subset of the information in the
14"meta" files.  Specifically:
15
16'CWD'	to initialize our notion.
17
18'C'	to track chdir(2) on a per process basis
19
20'R'	files read are what we really care about.
21	directories read, provide a clue to resolving
22	subsequent relative paths.  That is if we cannot find
23	them relative to 'cwd', we check relative to the last
24	dir read.
25
26'W'	files opened for write or read-write,
27	for filemon V3 and earlier.
28
29'E'	files executed.
30
31'L'	files linked
32
33'V'	the filemon version, this record is used as a clue
34	that we have reached the interesting bit.
35
36"""
37
38"""
39RCSid:
40	$Id: meta2deps.py,v 1.26 2017/05/09 04:04:16 sjg Exp $
41
42	Copyright (c) 2011-2013, Juniper Networks, Inc.
43	All rights reserved.
44
45	Redistribution and use in source and binary forms, with or without
46	modification, are permitted provided that the following conditions
47	are met:
48	1. Redistributions of source code must retain the above copyright
49	   notice, this list of conditions and the following disclaimer.
50	2. Redistributions in binary form must reproduce the above copyright
51	   notice, this list of conditions and the following disclaimer in the
52	   documentation and/or other materials provided with the distribution.
53
54	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
55	"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
56	LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
57	A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
58	OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
59	SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
60	LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
61	DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
62	THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
63	(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
64	OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65
66"""
67
68import os, re, sys
69
def getv(dict, key, d=None):
    """Return dict[key] when key is present, otherwise the fallback d."""
    return dict[key] if key in dict else d
75
def resolve(path, cwd, last_dir=None, debug=0, debug_out=sys.stderr):
    """
    Turn path into an absolute path, or return None if that fails.

    A trailing '/.' is dropped first.  Absolute paths are returned
    untouched, '.' maps to cwd and './x' to cwd + '/x'.  Any other
    relative path is probed on the filesystem, first below last_dir
    and then below cwd; the first candidate that exists wins.
    """
    verbose = debug > 2
    if path.endswith('/.'):
        path = path[:-2]
    if path[:1] == '/':
        return path
    if path == '.':
        return cwd
    if path.startswith('./'):
        return cwd + path[1:]
    # there is no point probing the same directory twice
    if last_dir == cwd:
        bases = [cwd]
    else:
        bases = [last_dir, cwd]
    for base in bases:
        if not base:
            continue
        candidate = base + '/' + path
        if verbose:
            print("looking for:", candidate, end=' ', file=debug_out)
        if os.path.exists(candidate):
            if verbose:
                print("found:", candidate, file=debug_out)
            return candidate
        if verbose:
            print("nope", file=debug_out)
    return None
105
def cleanpath(path):
    """Normalise path textually, without using realpath(3).

    Empty components and '.' are dropped, and each '..' removes the
    component before it.  A '..' with nothing left to remove used to
    raise IndexError; now it is kept for a relative path (so '../x'
    stays '../x') and ignored for an absolute one (so '/..' is '/').

    Returns the cleaned path; a leading '/' is preserved.
    """
    absolute = path.startswith('/')
    parts = []
    for comp in path.split('/'):
        if not comp or comp == '.':
            continue
        if comp == '..':
            if parts and parts[-1] != '..':
                parts.pop()
            elif not absolute:
                # cannot go higher textually, keep the '..'
                parts.append(comp)
            # an absolute path cannot climb above the root
            continue
        parts.append(comp)

    return ('/' if absolute else '') + '/'.join(parts)
123
def abspath(path, cwd, last_dir=None, debug=0, debug_out=sys.stderr):
    """
    Resolve path via cwd or last_dir and tidy the result.

    This is called very often, so realpath is deliberately avoided:
    the path is only passed through cleanpath() when it has no '/'
    at all, contains './' past its first character, or ends in '/..'.
    When resolve() cannot find the path, the original text is used.
    """
    resolved = resolve(path, cwd, last_dir, debug, debug_out)
    if resolved:
        path = resolved
    needs_cleanup = ('/' not in path
                     or path.find('./') > 0
                     or path.endswith('/..'))
    return cleanpath(path) if needs_cleanup else path
137
def sort_unique(list, cmp=None, key=None, reverse=False):
    """Sort list in place and return a new list without adjacent duplicates.

    list     the list to sort; it is sorted in place.
    cmp      optional old-style comparison function.  Python 3's
             list.sort() no longer accepts one (nor positional
             arguments), so it is adapted with functools.cmp_to_key.
    key      optional key function; when cmp is also given, cmp is
             applied to the key values, as Python 2 did.
    reverse  sort in descending order.
    """
    if cmp is not None:
        # imported here so the function does not depend on file-level imports
        from functools import cmp_to_key
        as_key = cmp_to_key(cmp)
        if key is None:
            key = as_key
        else:
            inner = key
            key = lambda e: as_key(inner(e))
    list.sort(key=key, reverse=reverse)

    unique = []
    for e in list:
        # compare with the last kept element rather than a None sentinel,
        # so that a leading None element is not dropped
        if unique and e == unique[-1]:
            continue
        unique.append(e)
    return unique
148
def add_trims(x):
    """Return x in its four slash-decorated forms: '/x/', '/x', 'x/', 'x'."""
    slashed = '/' + x
    return [slashed + '/', slashed, x + '/', x]
154
class MetaFile:
    """class to parse meta files generated by bmake."""

    # The attributes below are class-level defaults.  The containers are
    # modified in place through self (append / item assignment) by the
    # methods of this class, so their contents are shared by all
    # instances until reset() rebinds fresh ones on a given instance.
    # main() relies on this: it parses one meta file per instance and
    # reports the accumulated result from the last instance only.
    conf = None             # __init__ stores its conf argument on the instance
    dirdep_re = None        # compiled by __init__; used by find_obj
    host_target = None      # conf['HOST_TARGET'], set by __init__
    srctops = []            # source tree prefixes, each ending in '/'
    objroots = []           # object tree prefixes, each ending in '/'
    excludes = []           # path prefixes that parse_path ignores
    seen = {}               # paths already dealt with; see seenit()
    obj_deps = []           # entries reported by dirdeps()
    src_deps = []           # entries reported by src_dirdeps()
    file_deps = []          # entries written by file_depends()
168
169    def __init__(self, name, conf={}):
170        """if name is set we will parse it now.
171        conf can have the follwing keys:
172
173        SRCTOPS list of tops of the src tree(s).
174
175        CURDIR  the src directory 'bmake' was run from.
176
177        RELDIR  the relative path from SRCTOP to CURDIR
178
179        MACHINE the machine we built for.
180                set to 'none' if we are not cross-building.
181                More specifically if machine cannot be deduced from objdirs.
182
183        TARGET_SPEC
184                Sometimes MACHINE isn't enough.
185
186        HOST_TARGET
187                when we build for the pseudo machine 'host'
188                the object tree uses HOST_TARGET rather than MACHINE.
189
190        OBJROOTS a list of the common prefix for all obj dirs it might
191                end in '/' or '-'.
192
193        DPDEPS  names an optional file to which per file dependencies
194                will be appended.
195                For example if 'some/path/foo.h' is read from SRCTOP
196                then 'DPDEPS_some/path/foo.h +=' "RELDIR" is output.
197                This can allow 'bmake' to learn all the dirs within
198                the tree that depend on 'foo.h'
199
200        EXCLUDES
201                A list of paths to ignore.
202                ccache(1) can otherwise be trouble.
203
204        debug   desired debug level
205
206        debug_out open file to send debug output to (sys.stderr)
207
208        """
209
210        self.name = name
211        self.debug = getv(conf, 'debug', 0)
212        self.debug_out = getv(conf, 'debug_out', sys.stderr)
213
214        self.machine = getv(conf, 'MACHINE', '')
215        self.machine_arch = getv(conf, 'MACHINE_ARCH', '')
216        self.target_spec = getv(conf, 'TARGET_SPEC', '')
217        self.curdir = getv(conf, 'CURDIR')
218        self.reldir = getv(conf, 'RELDIR')
219        self.dpdeps = getv(conf, 'DPDEPS')
220        self.line = 0
221
222        if not self.conf:
223            # some of the steps below we want to do only once
224            self.conf = conf
225            self.host_target = getv(conf, 'HOST_TARGET')
226            for srctop in getv(conf, 'SRCTOPS', []):
227                if srctop[-1] != '/':
228                    srctop += '/'
229                if not srctop in self.srctops:
230                    self.srctops.append(srctop)
231                _srctop = os.path.realpath(srctop)
232                if _srctop[-1] != '/':
233                    _srctop += '/'
234                if not _srctop in self.srctops:
235                    self.srctops.append(_srctop)
236
237            trim_list = add_trims(self.machine)
238            if self.machine == 'host':
239                trim_list += add_trims(self.host_target)
240            if self.target_spec:
241                trim_list += add_trims(self.target_spec)
242
243            for objroot in getv(conf, 'OBJROOTS', []):
244                for e in trim_list:
245                    if objroot.endswith(e):
246                        # this is not what we want - fix it
247                        objroot = objroot[0:-len(e)]
248
249                if objroot[-1] != '/':
250                    objroot += '/'
251                if not objroot in self.objroots:
252                    self.objroots.append(objroot)
253                    _objroot = os.path.realpath(objroot)
254                    if objroot[-1] == '/':
255                        _objroot += '/'
256                    if not _objroot in self.objroots:
257                        self.objroots.append(_objroot)
258
259            # we want the longest match
260            self.srctops.sort(reverse=True)
261            self.objroots.sort(reverse=True)
262
263            self.excludes = getv(conf, 'EXCLUDES', [])
264
265            if self.debug:
266                print("host_target=", self.host_target, file=self.debug_out)
267                print("srctops=", self.srctops, file=self.debug_out)
268                print("objroots=", self.objroots, file=self.debug_out)
269                print("excludes=", self.excludes, file=self.debug_out)
270
271            self.dirdep_re = re.compile(r'([^/]+)/(.+)')
272
273        if self.dpdeps and not self.reldir:
274            if self.debug:
275                print("need reldir:", end=' ', file=self.debug_out)
276            if self.curdir:
277                srctop = self.find_top(self.curdir, self.srctops)
278                if srctop:
279                    self.reldir = self.curdir.replace(srctop,'')
280                    if self.debug:
281                        print(self.reldir, file=self.debug_out)
282            if not self.reldir:
283                self.dpdeps = None      # we cannot do it?
284
285        self.cwd = os.getcwd()          # make sure this is initialized
286        self.last_dir = self.cwd
287
288        if name:
289            self.try_parse()
290
291    def reset(self):
292        """reset state if we are being passed meta files from multiple directories."""
293        self.seen = {}
294        self.obj_deps = []
295        self.src_deps = []
296        self.file_deps = []
297
298    def dirdeps(self, sep='\n'):
299        """return DIRDEPS"""
300        return sep.strip() + sep.join(self.obj_deps)
301
302    def src_dirdeps(self, sep='\n'):
303        """return SRC_DIRDEPS"""
304        return sep.strip() + sep.join(self.src_deps)
305
306    def file_depends(self, out=None):
307        """Append DPDEPS_${file} += ${RELDIR}
308        for each file we saw, to the output file."""
309        if not self.reldir:
310            return None
311        for f in sort_unique(self.file_deps):
312            print('DPDEPS_%s += %s' % (f, self.reldir), file=out)
313        # these entries provide for reverse DIRDEPS lookup
314        for f in self.obj_deps:
315            print('DEPDIRS_%s += %s' % (f, self.reldir), file=out)
316
317    def seenit(self, dir):
318        """rememer that we have seen dir."""
319        self.seen[dir] = 1
320
321    def add(self, list, data, clue=''):
322        """add data to list if it isn't already there."""
323        if data not in list:
324            list.append(data)
325            if self.debug:
326                print("%s: %sAdd: %s" % (self.name, clue, data), file=self.debug_out)
327
328    def find_top(self, path, list):
329        """the logical tree may be split across multiple trees"""
330        for top in list:
331            if path.startswith(top):
332                if self.debug > 2:
333                    print("found in", top, file=self.debug_out)
334                return top
335        return None
336
337    def find_obj(self, objroot, dir, path, input):
338        """return path within objroot, taking care of .dirdep files"""
339        ddep = None
340        for ddepf in [path + '.dirdep', dir + '/.dirdep']:
341            if not ddep and os.path.exists(ddepf):
342                ddep = open(ddepf, 'r').readline().strip('# \n')
343                if self.debug > 1:
344                    print("found %s: %s\n" % (ddepf, ddep), file=self.debug_out)
345                if ddep.endswith(self.machine):
346                    ddep = ddep[0:-(1+len(self.machine))]
347                elif self.target_spec and ddep.endswith(self.target_spec):
348                    ddep = ddep[0:-(1+len(self.target_spec))]
349
350        if not ddep:
351            # no .dirdeps, so remember that we've seen the raw input
352            self.seenit(input)
353            self.seenit(dir)
354            if self.machine == 'none':
355                if dir.startswith(objroot):
356                    return dir.replace(objroot,'')
357                return None
358            m = self.dirdep_re.match(dir.replace(objroot,''))
359            if m:
360                ddep = m.group(2)
361                dmachine = m.group(1)
362                if dmachine != self.machine:
363                    if not (self.machine == 'host' and
364                            dmachine == self.host_target):
365                        if self.debug > 2:
366                            print("adding .%s to %s" % (dmachine, ddep), file=self.debug_out)
367                        ddep += '.' + dmachine
368
369        return ddep
370
371    def try_parse(self, name=None, file=None):
372        """give file and line number causing exception"""
373        try:
374            self.parse(name, file)
375        except:
376            # give a useful clue
377            print('{}:{}: '.format(self.name, self.line), end=' ', file=sys.stderr)
378            raise
379
380    def parse(self, name=None, file=None):
381        """A meta file looks like:
382
383        # Meta data file "path"
384        CMD "command-line"
385        CWD "cwd"
386        TARGET "target"
387        -- command output --
388        -- filemon acquired metadata --
389        # buildmon version 3
390        V 3
391        C "pid" "cwd"
392        E "pid" "path"
393        F "pid" "child"
394        R "pid" "path"
395        W "pid" "path"
396        X "pid" "status"
397        D "pid" "path"
398        L "pid" "src" "target"
399        M "pid" "old" "new"
400        S "pid" "path"
401        # Bye bye
402
403        We go to some effort to avoid processing a dependency more than once.
404        Of the above record types only C,E,F,L,R,V and W are of interest.
405        """
406
407        version = 0                     # unknown
408        if name:
409            self.name = name;
410        if file:
411            f = file
412            cwd = self.last_dir = self.cwd
413        else:
414            f = open(self.name, 'r')
415        skip = True
416        pid_cwd = {}
417        pid_last_dir = {}
418        last_pid = 0
419
420        self.line = 0
421        if self.curdir:
422            self.seenit(self.curdir)    # we ignore this
423
424        interesting = 'CEFLRV'
425        for line in f:
426            self.line += 1
427            # ignore anything we don't care about
428            if not line[0] in interesting:
429                continue
430            if self.debug > 2:
431                print("input:", line, end=' ', file=self.debug_out)
432            w = line.split()
433
434            if skip:
435                if w[0] == 'V':
436                    skip = False
437                    version = int(w[1])
438                    """
439                    if version < 4:
440                        # we cannot ignore 'W' records
441                        # as they may be 'rw'
442                        interesting += 'W'
443                    """
444                elif w[0] == 'CWD':
445                    self.cwd = cwd = self.last_dir = w[1]
446                    self.seenit(cwd)    # ignore this
447                    if self.debug:
448                        print("%s: CWD=%s" % (self.name, cwd), file=self.debug_out)
449                continue
450
451            pid = int(w[1])
452            if pid != last_pid:
453                if last_pid:
454                    pid_last_dir[last_pid] = self.last_dir
455                cwd = getv(pid_cwd, pid, self.cwd)
456                self.last_dir = getv(pid_last_dir, pid, self.cwd)
457                last_pid = pid
458
459            # process operations
460            if w[0] == 'F':
461                npid = int(w[2])
462                pid_cwd[npid] = cwd
463                pid_last_dir[npid] = cwd
464                last_pid = npid
465                continue
466            elif w[0] == 'C':
467                cwd = abspath(w[2], cwd, None, self.debug, self.debug_out)
468                if cwd.endswith('/.'):
469                    cwd = cwd[0:-2]
470                self.last_dir = pid_last_dir[pid] = cwd
471                pid_cwd[pid] = cwd
472                if self.debug > 1:
473                    print("cwd=", cwd, file=self.debug_out)
474                continue
475
476            if w[2] in self.seen:
477                if self.debug > 2:
478                    print("seen:", w[2], file=self.debug_out)
479                continue
480            # file operations
481            if w[0] in 'ML':
482                # these are special, tread src as read and
483                # target as write
484                self.parse_path(w[1].strip("'"), cwd, 'R', w)
485                self.parse_path(w[2].strip("'"), cwd, 'W', w)
486                continue
487            elif w[0] in 'ERWS':
488                path = w[2]
489                self.parse_path(path, cwd, w[0], w)
490
491        if not file:
492            f.close()
493
494    def is_src(self, base, dir, rdir):
495        """is base in srctop"""
496        for dir in [dir,rdir]:
497            if not dir:
498                continue
499            path = '/'.join([dir,base])
500            srctop = self.find_top(path, self.srctops)
501            if srctop:
502                if self.dpdeps:
503                    self.add(self.file_deps, path.replace(srctop,''), 'file')
504                self.add(self.src_deps, dir.replace(srctop,''), 'src')
505                self.seenit(dir)
506                return True
507        return False
508
509    def parse_path(self, path, cwd, op=None, w=[]):
510        """look at a path for the op specified"""
511
512        if not op:
513            op = w[0]
514
515        # we are never interested in .dirdep files as dependencies
516        if path.endswith('.dirdep'):
517            return
518        for p in self.excludes:
519            if p and path.startswith(p):
520                if self.debug > 2:
521                    print("exclude:", p, path, file=self.debug_out)
522                return
523        # we don't want to resolve the last component if it is
524        # a symlink
525        path = resolve(path, cwd, self.last_dir, self.debug, self.debug_out)
526        if not path:
527            return
528        dir,base = os.path.split(path)
529        if dir in self.seen:
530            if self.debug > 2:
531                print("seen:", dir, file=self.debug_out)
532            return
533        # we can have a path in an objdir which is a link
534        # to the src dir, we may need to add dependencies for each
535        rdir = dir
536        dir = abspath(dir, cwd, self.last_dir, self.debug, self.debug_out)
537        rdir = os.path.realpath(dir)
538        if rdir == dir:
539            rdir = None
540        # now put path back together
541        path = '/'.join([dir,base])
542        if self.debug > 1:
543            print("raw=%s rdir=%s dir=%s path=%s" % (w[2], rdir, dir, path), file=self.debug_out)
544        if op in 'RWS':
545            if path in [self.last_dir, cwd, self.cwd, self.curdir]:
546                if self.debug > 1:
547                    print("skipping:", path, file=self.debug_out)
548                return
549            if os.path.isdir(path):
550                if op in 'RW':
551                    self.last_dir = path;
552                if self.debug > 1:
553                    print("ldir=", self.last_dir, file=self.debug_out)
554                return
555
556        if op in 'ERW':
557            # finally, we get down to it
558            if dir == self.cwd or dir == self.curdir:
559                return
560            if self.is_src(base, dir, rdir):
561                self.seenit(w[2])
562                if not rdir:
563                    return
564
565            objroot = None
566            for dir in [dir,rdir]:
567                if not dir:
568                    continue
569                objroot = self.find_top(dir, self.objroots)
570                if objroot:
571                    break
572            if objroot:
573                ddep = self.find_obj(objroot, dir, path, w[2])
574                if ddep:
575                    self.add(self.obj_deps, ddep, 'obj')
576                    if self.dpdeps and objroot.endswith('/stage/'):
577                        sp = '/'.join(path.replace(objroot,'').split('/')[1:])
578                        self.add(self.file_deps, sp, 'file')
579            else:
580                # don't waste time looking again
581                self.seenit(w[2])
582                self.seenit(dir)
583
584
def main(argv, klass=MetaFile, xopts='', xoptf=None):
    """Simple driver for class MetaFile.

    Usage:
        script [options] [key=value ...] "meta" ...

    Options and key=value pairs contribute to the
    dictionary passed to MetaFile.

    -S "SRCTOP"
                add "SRCTOP" to the "SRCTOPS" list.

    -C "CURDIR"

    -O "OBJROOT"
                add "OBJROOT" to the "OBJROOTS" list.

    -R "RELDIR"

    -m "MACHINE"

    -a "MACHINE_ARCH"

    -H "HOST_TARGET"

    -T "TARGET_SPEC"

    -X "EXCLUDE"
                add "EXCLUDE" to the "EXCLUDES" list.

    -D "DPDEPS"

    -d  bumps debug level

    -q  do not print the results

    MACHINE, MACHINE_ARCH, SB_SRC and SB_OBJROOT are taken from the
    environment when set; options override or add to them.

    argv   the full argument vector, program name included.
    klass  the class to instantiate for each meta file.
    xopts  extra getopt option letters, handed to xoptf.
    xoptf  called as xoptf(option, argument, conf) for any option
           not handled here.

    Arguments ending in '.meta' are parsed directly; '@file' names a
    file listing meta files.  Files that do not exist are skipped.
    Returns the last klass instance created, or None.
    """
    import getopt

    # import Psyco if we can
    # it can speed things up quite a bit
    have_psyco = 0
    try:
        import psyco
        psyco.full()
        have_psyco = 1
    except Exception:
        # purely optional; carry on without it
        pass

    conf = {
        'SRCTOPS': [],
        'OBJROOTS': [],
        'EXCLUDES': [],
        }

    # Look at each variable on its own: one that is missing must not
    # stop the ones after it from being picked up.
    for env_name, key, is_list in (
            ('MACHINE', 'MACHINE', False),
            ('MACHINE_ARCH', 'MACHINE_ARCH', False),
            ('SB_SRC', 'SRCTOPS', True),
            ('SB_OBJROOT', 'OBJROOTS', True)):
        value = os.environ.get(env_name)
        if not value:
            continue
        if is_list:
            conf[key].append(value)
        else:
            conf[key] = value

    debug = 0
    output = True

    opts, args = getopt.getopt(argv[1:], 'a:dS:C:O:R:m:D:H:qT:X:' + xopts)
    for o, a in opts:
        if o == '-a':
            conf['MACHINE_ARCH'] = a
        elif o == '-d':
            debug += 1
        elif o == '-q':
            output = False
        elif o == '-H':
            conf['HOST_TARGET'] = a
        elif o == '-S':
            if a not in conf['SRCTOPS']:
                conf['SRCTOPS'].append(a)
        elif o == '-C':
            conf['CURDIR'] = a
        elif o == '-O':
            if a not in conf['OBJROOTS']:
                conf['OBJROOTS'].append(a)
        elif o == '-R':
            conf['RELDIR'] = a
        elif o == '-D':
            conf['DPDEPS'] = a
        elif o == '-m':
            conf['MACHINE'] = a
        elif o == '-T':
            conf['TARGET_SPEC'] = a
        elif o == '-X':
            if a not in conf['EXCLUDES']:
                conf['EXCLUDES'].append(a)
        elif xoptf:
            xoptf(o, a, conf)

    conf['debug'] = debug

    # get any var=val assignments; they must precede the meta files
    consumed = 0
    for a in args:
        if a.find('=') <= 0:
            break
        # split on the first '=' only, the value may contain more
        k, v = a.split('=', 1)
        if k in ['SRCTOP','OBJROOT','SRCTOPS','OBJROOTS']:
            if k == 'SRCTOP':
                k = 'SRCTOPS'
            elif k == 'OBJROOT':
                k = 'OBJROOTS'
            if v not in conf[k]:
                conf[k].append(v)
        else:
            conf[k] = v
        consumed += 1
    args = args[consumed:]

    debug_out = getv(conf, 'debug_out', sys.stderr)

    if debug:
        print("config:", file=debug_out)
        print("psyco=", have_psyco, file=debug_out)
        for k,v in list(conf.items()):
            print("%s=%s" % (k,v), file=debug_out)

    m = None
    for a in args:
        if a.endswith('.meta'):
            if not os.path.exists(a):
                continue
            m = klass(a, conf)
        elif a.startswith('@'):
            # there can actually be multiple files per line
            with open(a[1:]) as listing:
                for line in listing:
                    for f in line.strip().split():
                        if not os.path.exists(f):
                            continue
                        m = klass(f, conf)

    if output and m:
        print(m.dirdeps())

        print(m.src_dirdeps('\nsrc:'))

        dpdeps = getv(conf, 'DPDEPS')
        if dpdeps:
            # text mode: file_depends() print()s str, which a file
            # opened 'wb' rejects on Python 3
            with open(dpdeps, 'w') as out:
                m.file_depends(out)

    return m
737
if __name__ == '__main__':
    try:
        main(sys.argv)
    except:
        # the summary deliberately goes to stdout; the exception
        # itself is then propagated unchanged
        failure = sys.exc_info()[1]
        print("ERROR: ", failure)
        raise
745
746