feat(third_party/bazel): Check in rules_haskell from Tweag

Vincent Ambo 2019-07-04 11:18:12 +01:00
parent 2eb1dc26e4
commit f723b8b878
479 changed files with 51484 additions and 0 deletions

debug/linking_utils/BUILD

@@ -0,0 +1,50 @@
load(
    ":ldd_test.bzl",
    "ldd_test",
)

py_library(
    name = "linking_utils",
    srcs = ["ldd.py"],
    visibility = ["//visibility:public"],
)

# test the ldd debug library on the output of `//tests/binary-indirect-cbits`
ldd_test(
    name = "test-ldd",
    current_workspace = None,
    elf_binary = "//tests/binary-indirect-cbits",
    script = r'''
import sys

def contains_error(error):
    """Check whether any of the dependencies contains `error`,
    where error is something from `LDD_ERRORS`.
    Returns {} if there's no error.
    """
    def f(d):
        return { k: v for k, v in d['needed'].items()
                 if (v == error
                     or (v not in LDD_ERRORS
                         and dict_remove_empty(v['item']) != {})) }
    return f

# the output should have some runpaths
assert \
    len(ldd(identity, sys.argv[1])['runpath_dirs']) \
    > 0

# some of the dependencies are implicit and not in NEEDED flags
assert ldd(contains_error(LDD_UNKNOWN), sys.argv[1])

import pprint

# none of the dependencies may be missing
res = ldd(contains_error(LDD_MISSING), sys.argv[1])
if res != {}:
    print("These dependencies are missing:")
    pprint.pprint(res)
    sys.exit(1)
''',
    # it only works on Linux
    tags = ["dont_test_on_darwin"],
)

debug/linking_utils/README.md

@@ -0,0 +1,265 @@
# Debugging linking errors

The usual utilities, like `nm`, `objdump`, and of course `ldd` (see
[here](https://linux-audit.com/elf-binaries-on-linux-understanding-and-analysis/#tools-for-binary-analysis)
for a good overview of existing tools) go a long way. Yet, when
debugging non-trivial runtime linker failures, one would often like
to filter the output programmatically, with more advanced query logic
than simple `grep` and `sed` expressions can provide.
This library provides a small set of utility subroutines that can
help debug complicated linker errors.

The main function is `ldd(f, elf_path)`. It is in the same spirit
as `ldd(1)`, but instead of a flat list of resolved libraries, it
returns a tree of structured information.
When we use the term `ldd` in this document, it refers to the `ldd`
function exported from [./ldd.py](./ldd.py).

To query that tree, you pass `ldd` a function `f`, which is applied
to each dependency recursively, transforming the tree from the
bottom up.
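
For example, a minimal query that just pretty-prints the whole,
unfiltered tree could look like this sketch (assuming the names
exported from [./ldd.py](./ldd.py) are in scope, as they are inside
an `ldd_test` script, see the setup below; the binary path is a
placeholder):

```python
import pprint

# `identity` leaves every layer unchanged, so this prints
# the full dependency tree of the given binary
pprint.pprint(ldd(identity, "path/to/some/elf-binary"))
```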
The following functions are exported alongside the `ldd` function.
They can be passed to `ldd` and used as building blocks for insightful
queries:

- `identity`: don't transform, output everything
- `remove_matching_needed`: remove needed entries that match a regex
- `remove_matching_runpaths`: remove runpaths that match a regex
- `non_existing_runpaths`: return a list of runpaths that don't exist
  in the filesystem
- `unused_runpaths`: return a list of runpaths that are listed in the
  elf binary header, but in which no dependency was actually found
- `collect_unused_runpaths`: give an overview of all unused runpaths

Helpers:

- `dict_remove_empty`: remove fields with empty lists/dicts from an output
- `items`: a `dict.iteritems()` that works for both Python 2 and 3

See the introductory tutorial below on how to use these functions.
## Example usage

### Setup

If you have a Bazel target that outputs a binary you want to debug,
the easiest way to get started is to use `ldd_test`:
```python
load(
    "//:debug/linking_utils/ldd_test.bzl",
    "ldd_test",
)

ldd_test(
    name = "test-ldd",
    elf_binary = "//tests/binary-indirect-cbits",
    current_workspace = None,
    script = r'''
YOUR SCRIPT HERE
''',
)
```
All exported functions from `ldd.py` are already in scope.
See the [`BUILD`](./BUILD) file in this directory for an example.
### Writing queries
`ldd` takes a function that is applied to each layer of elf
dependencies. This function is passed structured data describing one
layer, gathered by querying the elf binary with `objdump` and parsing
the header fields of its dynamic section:
```
DependencyInfo :
{ needed : dict(string, union(
    LDD_MISSING, LDD_UNKNOWN,
    {
      # the needed dependency
      item : a,
      # where the dependency was found in
      found_in : RunpathDir
    }))
# all runpath directories that were searched
, runpath_dirs : [ RunpathDir ] }
```
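
Concretely, one layer of that tree for a binary needing a single
library might look like the following sketch (all paths are made up
for illustration; with `f = identity`, each `item` holds the nested
`DependencyInfo` of that dependency):

```python
{ 'needed':
    { 'libfoo.so.1':
        { 'item': { 'needed': {}, 'runpath_dirs': [] },  # nested DependencyInfo
          'found_in': { 'path': '$ORIGIN/../lib',
                        'absolute_path': '/opt/app/lib' } } },
  'runpath_dirs': [ { 'path': '$ORIGIN/../lib',
                      'absolute_path': '/opt/app/lib' } ] }
```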
The amount of data can get quite extensive for larger projects, so
you need a way to filter it down to get to the bottom of your problem.

If a transitive dependency cannot be found by the runtime linker, the
binary cannot be started. `ldd` surfaces such a problem by setting
the corresponding value in the `needed` dict to `LDD_MISSING`.
To remove everything from the output except the missing dependencies
and the paths leading to them, you can write a filter like this:
```python
# `d` is the DependencyInfo dict from above
def filter_down_to_missing(d):
    res = {}
    # items is a .iteritems() that works for py 2 and 3
    for name, dep in items(d['needed']):
        if dep == LDD_MISSING:
            res[name] = LDD_MISSING
        elif dep in LDD_ERRORS:
            pass
        else:
            # dep['item'] contains the already converted info
            # from the previous layer
            res[name] = dep['item']
    # dict_remove_empty removes all empty fields from the dict,
    # otherwise the result contains a lot of {} in the values.
    return dict_remove_empty(res)

# To get human-readable output, we re-use Python's pretty-printing
# library. It's only simple Python values, after all!
import pprint

pprint.pprint(
    # actually parse the elf binary and apply filter_down_to_missing
    # on each layer
    ldd(
        filter_down_to_missing,
        # the path to the elf binary you want to inspect
        elf_binary_path
    )
)
```
Note that in the filter you only need to handle the data for the
current executable and splice in the info from previous layers, which
is available in the `['item']` field of each `needed` entry.
The result might look something like:
```python
{'libfoo.so.5': {'libbar.so.1': {'libbaz.so.6': 'MISSING'}}}
```
or
```python
{}
```
if nothing is missing.
Now, that is similar to the output a tool like `lddtree(1)` could give
you. But we don't need to stop there, because it's trivial to augment
the output with more information:
```python
def missing_with_runpath(d):
    # our previous function can be re-used
    missing = filter_down_to_missing(d)
    # only display runpaths if there are missing deps
    runpaths = [] if missing == {} else d['runpath_dirs']
    # dict_remove_empty keeps the output clean
    return dict_remove_empty({
        'rpth': runpaths,
        'miss': missing,
    })

# same invocation, different function
pprint.pprint(
    ldd(
        missing_with_runpath,
        elf_binary_path
    )
)
```
which displays something like this for my example binary:
```python
{ 'miss': { 'libfoo.so.5': { 'miss': { 'libbar.so.1': { 'miss': { 'libbaz.so.6': 'MISSING'},
'rpth': [ { 'absolute_path': '/home/philip/.cache/bazel/_bazel_philip/fd9fea5ad581ea59473dc1f9d6bce826/execroot/myproject/bazel-out/k8-fastbuild/bin/something/and/bazel-out/k8-fastbuild/bin/other/integrate',
'path': '$ORIGIN/../../../../../../bazel-out/k8-fastbuild/bin/other/integrate'}]}},
'rpth': [ { 'absolute_path': '/nix/store/xdsjx0gba4id3yyqxv66bxnm2sqixkjj-glibc-2.27/lib',
'path': '/nix/store/xdsjx0gba4id3yyqxv66bxnm2sqixkjj-glibc-2.27/lib'},
{ 'absolute_path': '/nix/store/x6inizi5ahlyhqxxwv1rvn05a25icarq-gcc-7.3.0-lib/lib',
'path': '/nix/store/x6inizi5ahlyhqxxwv1rvn05a25icarq-gcc-7.3.0-lib/lib'}]}},
'rpth': [ … lots more nix rpaths … ]}
```
That's still a bit cluttered for my taste, so let's filter out
the `/nix/store` paths (which are mostly noise):
```python
import re

nix_matcher = re.compile("/nix/store.*")

def missing_with_runpath(d):
    missing = filter_down_to_missing(d)
    # this is one of the example functions provided by ldd.py
    remove_matching_runpaths(d, nix_matcher)
    # ^^^
    runpaths = [] if missing == {} else d['runpath_dirs']
    # dict_remove_empty keeps the output clean
    return dict_remove_empty({
        'rpth': runpaths,
        'miss': missing,
    })
```
and we are down to:
```python
{ 'miss': { 'libfoo.so.5': { 'miss': { 'libbar.so.1': { 'miss': { 'libbaz.so.6': 'MISSING'},
'rpth': [ { 'absolute_path': '/home/philip/.cache/bazel/_bazel_philip/fd9fea5ad581ea59473dc1f9d6bce826/execroot/myproject/bazel-out/k8-fastbuild/bin/something/and/bazel-out/k8-fastbuild/bin/other/integrate',
'path': '$ORIGIN/../../../../../../bazel-out/k8-fastbuild/bin/other/integrate'}]}}}
```
… which shows exactly the runpath that should provide the missing
dependency. But what has gone wrong? Does that path even exist? We
can find out!
```python
import re

nix_matcher = re.compile("/nix/store.*")

def missing_with_runpath(d):
    missing = filter_down_to_missing(d)
    remove_matching_runpaths(d, nix_matcher)
    runpaths = [] if missing == {} else d['runpath_dirs']
    # returns a list of runpaths that don't exist in the filesystem
    doesnt_exist = non_existing_runpaths(d)
    # ^^^
    return dict_remove_empty({
        'rpth': runpaths,
        'miss': missing,
        'doesnt_exist': doesnt_exist,
    })
```
This extends the output with a list of runpaths that point to
non-existing directories:
```python
{ 'miss': { 'libfoo.so.5': { 'miss': { 'libbar.so.1': { 'miss': { 'libbaz.so.6': 'MISSING'},
'rpth': [ { 'absolute_path': '/home/philip/.cache/bazel/_bazel_philip/fd9fea5ad581ea59473dc1f9d6bce826/execroot/myproject/bazel-out/k8-fastbuild/bin/something/and/bazel-out/k8-fastbuild/bin/other/integrate',
'path': '$ORIGIN/../../../../../../bazel-out/k8-fastbuild/bin/other/integrate'}],
'doesnt_exist': [ { 'absolute_path': '/home/philip/.cache/bazel/_bazel_philip/fd9fea5ad581ea59473dc1f9d6bce826/execroot/myproject/bazel-out/k8-fastbuild/bin/something/and/bazel-out/k8-fastbuild/bin/other/integrate',
'path': '$ORIGIN/../../../../../../bazel-out/k8-fastbuild/bin/other/integrate'}]}}}
```
Suddenly it's perfectly clear where the problem lies:
`$ORIGIN/../../../../../../bazel-out/k8-fastbuild/bin/other/integrate`
points to a path that does not exist.
Any data query you'd like to perform is possible, as long as it works
on the data provided by the `ldd` function. See the lower part of
`ldd.py` for more examples.
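
The same pattern extends to the other exported helpers. For instance,
here is a sketch that reports the unused runpaths of every layer
(`collect_unused_runpaths` provides a more polished version of this
aggregation):

```python
def report_unused(d):
    # runpath entries in which no NEEDED dependency was found
    unused = unused_runpaths(d)
    # splice in the reports of previous layers, dropping empty ones
    subs = {name: dep['item']
            for name, dep in items(d['needed'])
            if dep not in LDD_ERRORS and dep['item'] != {}}
    return dict_remove_empty({'unused': unused, 'deps': subs})

pprint.pprint(ldd(report_unused, elf_binary_path))
```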

debug/linking_utils/ldd.py

@@ -0,0 +1,288 @@
import subprocess
import os
import sys
import re
### helper functions

def list_to_dict(f, l):
    """dict with elements of list as keys & as values transformed by f"""
    d = {}
    for el in l:
        d[el] = f(el)
    return d

def dict_remove_empty(d):
    """remove keys that have [] or {} as values"""
    new = {}
    for k, v in d.items():
        if not (v == [] or v == {}):
            new[k] = v
    return new

def identity(x):
    """identity function"""
    return x

def const(x):
    """(curried) constant function"""
    def f(y):
        return x
    return f

def memoized(cache, f, arg):
    """Memoizes a call to `f` with `arg` in the dict `cache`.
    Modifies the cache dict in place."""
    if arg in cache:
        return cache[arg]
    res = f(arg)
    cache[arg] = res
    return res
### IO functions that find elf dependencies

_field_matcher = re.compile(b" ([A-Z0-9_]+) +(.*)$")

def read_dynamic_fields(elf_path):
    """Read the dynamic header fields from an elf binary

    Args:
      elf_path: path to the elf binary (either absolute or relative to pwd)

    Returns:
      a list [(field_key, field_value)] where field_keys can appear multiple
      times (for example there's usually more than one NEEDED field).
    """
    res = subprocess.check_output([
        # force locale to C for stable output
        "env", "LC_ALL=C",
        "objdump",
        # specifying the section brings execution time down from 150ms to 10ms
        "--section=.dynamic",
        "--all-headers",
        elf_path
    ])
    to_end = res.split(b"Dynamic Section:\n")[1]
    # up to the first empty line
    dyn_section = to_end[: 1 + to_end.find(b"\n\n")]

    def read_dynamic_field(s):
        """return (field_key, field_value)"""
        return _field_matcher.match(s).groups()

    return list(map(read_dynamic_field, dyn_section.splitlines(True)))

def __query_dynamic_fields(df, key):
    """Takes a list of dynamic field tuples (key and value),
    where keys can appear multiple times, and returns a list of all
    values with the given key (in stable order)."""
    return [v for k, v in df if k == key]

def parse_runpath_dirs(elf_path, elf_dynamic_fields):
    """Parse the DT_RUNPATH entry from a list of elf dynamic fields.

    Returns:
      a list of
      { path: unmodified string from DT_RUNPATH
      , absolute_path: fully normalized, absolute path to dir }
    """
    fields = __query_dynamic_fields(elf_dynamic_fields, b"RUNPATH")
    if fields == []:
        return []
    assert len(fields) == 1
    val = fields[0]
    origin = os.path.dirname(elf_path)
    return [{ 'path': path,
              'absolute_path': os.path.abspath(path.replace("$ORIGIN", origin)) }
            for path in val.decode().strip(":").split(":")
            if path != ""]

def parse_needed(elf_dynamic_fields):
    """Return the list of DT_NEEDED entries from a list of elf dynamic fields."""
    return [n.decode() for n in __query_dynamic_fields(elf_dynamic_fields, b"NEEDED")]
### Main utility

# cannot find dependency
LDD_MISSING = "MISSING"
# don't know how to search for dependency
LDD_UNKNOWN = "DUNNO"
# list of all errors for easy branching
LDD_ERRORS = [ LDD_MISSING, LDD_UNKNOWN ]

def _ldd(elf_cache, f, elf_path):
    """Same as `ldd` (below), except for an additional `elf_cache` argument,
    which is a dict needed for memoizing elf files that were already read.
    This is done because the elf reading operation is quite expensive
    and many files are referenced multiple times (e.g. glib.so)."""

    def search(rdirs, elf_libname):
        """Search for elf_libname in the given runpath dirs and return
        either a search result dict or LDD_MISSING."""
        res = LDD_MISSING
        for rdir in rdirs:
            potential_path = os.path.join(rdir['absolute_path'], elf_libname)
            if os.path.exists(potential_path):
                res = {
                    'item': potential_path,
                    'found_in': rdir,
                }
                break
        return res

    def recurse(search_res):
        """Unfold the subtree of ELF dependencies for a `search` result"""
        if search_res == LDD_MISSING:
            return LDD_MISSING
        else:
            # we keep all other fields in search_res the same;
            # 'item' is the one field that does the recursion.
            # This is the part that would normally be done by fmap.
            search_res['item'] = _ldd(elf_cache, f, search_res['item'])
            return search_res

    # (GNU) ld.so resolves any symlinks before searching for dependencies
    elf_realpath = os.path.realpath(elf_path)
    # memoized uses the cache to not repeat the I/O action
    # for the same elf files (same path)
    dyn_fields = memoized(
        elf_cache, read_dynamic_fields, elf_realpath
    )
    rdirs = parse_runpath_dirs(elf_realpath, dyn_fields)
    all_needed = parse_needed(dyn_fields)
    # if there are no runpath dirs we don't know where to search
    if rdirs == []:
        needed = list_to_dict(const(LDD_UNKNOWN), all_needed)
    else:
        needed = list_to_dict(
            lambda name: recurse(search(rdirs, name)),
            all_needed
        )
    result = {
        'runpath_dirs': rdirs,
        'needed': needed
    }
    # Here, f is applied to the result of the previous level of recursion
    return f(result)
def ldd(f, elf_path):
    """Follow the DT_NEEDED entries of an ELF binary recursively,
    searching for each dependency through the binary's DT_RUNPATH.

    DependencyInfo :
      { needed : dict(string, union(
          LDD_MISSING, LDD_UNKNOWN,
          {
            # the needed dependency
            item : a,
            # where the dependency was found in
            found_in : RunpathDir
          }))
      # all runpath directories that were searched
      , runpath_dirs : [ RunpathDir ] }

    Args:
      f: DependencyInfo -> a
         modifies the results of each level
      elf_path: path to ELF file, either absolute or relative to current working dir

    Returns: a
    """
    elf_cache = {}
    return _ldd(elf_cache, f, elf_path)
### Functions to pass to ldd

# Only use the current layer

def remove_matching_needed(d, re_matcher_absolute_path=None, re_matcher_path=None):
    """Destructively removes needed values from d['needed']
    if they match the given regex matchers.
    Doesn't remove LDD_ERRORS."""
    def pred(v):
        """return True if v matches one of the given matchers"""
        if v in LDD_ERRORS:
            return False
        found_in = v['found_in']
        abs_match = re_matcher_absolute_path.match(found_in['absolute_path']) \
            if re_matcher_absolute_path else False
        match = re_matcher_path.match(found_in['path']) \
            if re_matcher_path else False
        return bool(abs_match or match)

    d['needed'] = {
        k: v for k, v in d['needed'].items()
        if not pred(v)
    }

def remove_matching_runpaths(d, re_matcher):
    """Destructively removes runpaths from d['runpath_dirs']
    if they match the given regex matcher."""
    d['runpath_dirs'] = [
        runp for runp in d['runpath_dirs']
        if not re_matcher.match(runp['absolute_path'])
    ]
    return d

def non_existing_runpaths(d):
    """Return a list of runpath_dirs that do not exist in the file system."""
    return [
        runp for runp in d['runpath_dirs']
        if not os.path.exists(runp['absolute_path'])
    ]

def unused_runpaths(d):
    """Return a list of runpath_dirs that were not used to find NEEDED dependencies."""
    used = set()
    for k, v in d['needed'].items():
        if v not in LDD_ERRORS:
            used.add(v['found_in']['absolute_path'])
    return [
        u for u in d['runpath_dirs']
        if u['absolute_path'] not in used
    ]

# Also use the results of sub-layers

def collect_unused_runpaths(d):
    """Like `unused_runpaths`, but creates a deduplicated overview of all
    unused runpaths of the dependencies instead of just returning them
    for the current layer.

    Returns:
      a dict of two fields:
      `mine` contains the unused runpaths of the binary under scrutiny,
      `others` contains a flat dict of all .sos with unused runpath
      entries, mapping each .so to a list of them
    """
    used = set()
    given = set(r['absolute_path'] for r in d['runpath_dirs'])
    prev = {}
    # TODO: use `unused_runpaths` here
    for k, v in d['needed'].items():
        if v not in LDD_ERRORS:
            used.add(v['found_in']['absolute_path'])
            prev[k] = v['item']
    unused = [
        u for u in given.difference(used)
        # leave out nix store paths
        if not u.startswith("/nix/store")
    ]

    # Each layer doesn't know its own name,
    # so we return a list of unused runpaths for this layer ('mine')
    # and a dict of all previous layers combined (name to list)
    def combine_unused(deps):
        res = {}
        for name, dep in deps.items():
            res.update(dep['others'])
            res[name] = dep['mine']
        return res

    return {
        'mine': unused,
        'others': combine_unused(prev),
    }
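
# A minimal sketch of direct usage (illustrative; assumes this module
# is executed as a script): pretty-print the full dependency tree of
# the ELF binary passed as the first command-line argument.
if __name__ == "__main__":
    import pprint
    pprint.pprint(ldd(identity, sys.argv[1]))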

debug/linking_utils/ldd_test.bzl

@@ -0,0 +1,26 @@
load(
    "//:tests/inline_tests.bzl",
    "py_inline_test",
)

def ldd_test(name, elf_binary, script, current_workspace = None, tags = []):
    """Test a script with the linking_utils.ldd library in scope.

    The path to the `elf_binary` is passed to the script in `sys.argv[1]`.
    """
    py_inline_test(
        name,
        deps = ["@io_tweag_rules_haskell//debug/linking_utils"],
        data = [elf_binary],
        args = ["{}/$(rootpath {})".format(current_workspace, elf_binary)] if current_workspace else ["$(rootpath {})".format(elf_binary)],
        script = """
from io_tweag_rules_haskell.debug.linking_utils.ldd import \\
    dict_remove_empty, identity, const, \\
    LDD_MISSING, LDD_UNKNOWN, LDD_ERRORS, \\
    ldd, \\
    remove_matching_needed, remove_matching_runpaths, \\
    non_existing_runpaths, unused_runpaths, \\
    collect_unused_runpaths
""" + script,
        tags = tags,
    )