Merge pull request #2457 from signalwire/deadlock
[support-d] Add deadlock.py to tree.
This commit is contained in:
commit
da93927c01
|
@ -0,0 +1,76 @@
|
|||
`gdb` scripts
|
||||
-----------
|
||||
|
||||
Originally from: https://github.com/facebook/folly/tree/593b6e76881042031b7f21d898c8e0874ea79fe0/folly/experimental/gdb
|
||||
|
||||
This directory contains a collection of `gdb` scripts that we have found helpful.
|
||||
These scripts use the [gdb extension Python API](https://sourceware.org/gdb/current/onlinedocs/gdb/Python.html#Python).
|
||||
|
||||
### How to run the scripts
|
||||
|
||||
To run the scripts, fire up `gdb` and load a script with `source -v`. Example:
|
||||
|
||||
```lang=bash
|
||||
$ gdb -p 123456
|
||||
(gdb) source -v ./folly/experimental/gdb/deadlock.py
|
||||
Type "deadlock" to detect deadlocks.
|
||||
# At this point, any new commands defined in `deadlock.py` are available.
|
||||
(gdb) deadlock
|
||||
Found deadlock!
|
||||
...
|
||||
```
|
||||
|
||||
### What does each script do?
|
||||
|
||||
#### `deadlock.py` - Detect deadlocks
|
||||
|
||||
Consider the following program that always deadlocks:
|
||||
|
||||
```lang=cpp
|
||||
void deadlock3() {
|
||||
std::mutex m1, m2, m3;
|
||||
folly::Baton<> b1, b2, b3;
|
||||
|
||||
auto t1 = std::thread([&m1, &m2, &b1, &b2] {
|
||||
std::lock_guard<std::mutex> g1(m1);
|
||||
b1.post();
|
||||
b2.wait();
|
||||
std::lock_guard<std::mutex> g2(m2);
|
||||
});
|
||||
|
||||
auto t2 = std::thread([&m3, &m2, &b3, &b2] {
|
||||
std::lock_guard<std::mutex> g2(m2);
|
||||
b2.post();
|
||||
b3.wait();
|
||||
std::lock_guard<std::mutex> g3(m3);
|
||||
});
|
||||
|
||||
auto t3 = std::thread([&m3, &m1, &b3, &b1] {
|
||||
std::lock_guard<std::mutex> g3(m3);
|
||||
b3.post();
|
||||
b1.wait();
|
||||
std::lock_guard<std::mutex> g1(m1);
|
||||
});
|
||||
|
||||
t1.join();
|
||||
t2.join();
|
||||
t3.join();
|
||||
}
|
||||
```
|
||||
|
||||
The `deadlock.py` script introduces a new `deadlock` command that can help
|
||||
us identify the threads and mutexes involved in the deadlock.
|
||||
|
||||
```lang=bash
|
||||
$ gdb -p 2174496
|
||||
(gdb) source -v ./folly/experimental/gdb/deadlock.py
|
||||
Type "deadlock" to detect deadlocks.
|
||||
(gdb) deadlock
|
||||
Found deadlock!
|
||||
Thread 2 (LWP 2174497) is waiting on mutex (0x00007ffcff42a4c0) held by Thread 3 (LWP 2174498)
|
||||
Thread 3 (LWP 2174498) is waiting on mutex (0x00007ffcff42a4f0) held by Thread 4 (LWP 2174499)
|
||||
Thread 4 (LWP 2174499) is waiting on mutex (0x00007ffcff42a490) held by Thread 2 (LWP 2174497)
|
||||
```
|
||||
|
||||
NOTE: This script only works on Linux and requires debug symbols to be installed
|
||||
for the `pthread` library.
|
|
@ -0,0 +1,474 @@
|
|||
#!/usr/bin/env python3
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from enum import Enum
|
||||
|
||||
import gdb
|
||||
|
||||
|
||||
class DiGraph:
    """
    Adapted from networkx: http://networkx.github.io/
    Represents a directed graph. Edges can store (key, value) attributes.

    Nodes are used as dictionary keys and set members, so they must be
    hashable.
    """

    def __init__(self):
        # Map of node -> set of nodes
        self.adjacency_map = {}
        # Map of (node1, node2) -> map string -> arbitrary attribute
        # This will not be copied in subgraph()
        self.attributes_map = {}

    def neighbors(self, node):
        """
        Returns the set of nodes that `node` has an edge to.

        For a known node this is the graph's own set (not a copy); for an
        unknown node a fresh empty set is returned.
        """
        return self.adjacency_map.get(node, set())

    def edges(self):
        """Returns all edges as a list of (node, neighbor) pairs."""
        return [
            (node, neighbor)
            for node, neighbors in self.adjacency_map.items()
            for neighbor in neighbors
        ]

    def nodes(self):
        """Returns a view of all nodes in the graph."""
        return self.adjacency_map.keys()

    def attributes(self, node1, node2):
        """
        Returns the attribute map stored for the edge (node1, node2).
        Raises KeyError if no such edge was added.
        """
        return self.attributes_map[(node1, node2)]

    def add_edge(self, node1, node2, **kwargs):
        """
        Adds the edge node1 -> node2, creating either node if needed.
        The keyword arguments become the attributes of the edge and
        replace any attributes stored for it before.
        """
        # node1 is registered before node2 so that node iteration order
        # stays the same as the order in which nodes were first seen.
        self.adjacency_map.setdefault(node1, set()).add(node2)
        self.adjacency_map.setdefault(node2, set())
        self.attributes_map[(node1, node2)] = kwargs

    def remove_node(self, node):
        """
        Removes `node` together with every edge that starts or ends at it.
        Removing a node that is not in the graph is a no-op.
        """
        self.adjacency_map.pop(node, None)
        for neighbors in self.adjacency_map.values():
            neighbors.discard(node)
        # Also forget the attributes of the removed edges; otherwise
        # attributes() would keep answering for edges that no longer exist.
        stale_edges = [
            edge
            for edge in self.attributes_map
            if edge[0] == node or edge[1] == node
        ]
        for edge in stale_edges:
            del self.attributes_map[edge]

    def subgraph(self, nodes):
        """
        Returns a new DiGraph holding the edges of this graph whose two
        endpoints are both in `nodes`. Edge attributes are not copied, and
        nodes without such an edge do not appear in the result.

        `nodes` may be any iterable of nodes, including a one-shot iterator.
        """
        # Materialize once: the input is both iterated and used for
        # membership tests, which a one-shot iterator cannot support.
        nodes = list(nodes)
        members = set(nodes)
        graph = DiGraph()
        for node in nodes:
            for neighbor in self.neighbors(node):
                if neighbor in members:
                    graph.add_edge(node, neighbor)
        return graph

    def node_link_data(self):
        """
        Returns the graph as a dictionary in a format that can be
        serialized.

        Each entry of "links" is a copy of the edge attributes plus
        "source" and "target", which are positions in the "nodes" list.
        """
        # Do one pass to build a map of node -> position in nodes
        node_to_number = {
            node: position for position, node in enumerate(self.adjacency_map)
        }
        data = {
            "directed": True,
            "multigraph": False,
            "graph": {},
            "links": [],
            "nodes": [{"id": node} for node in self.adjacency_map],
        }

        # Do another pass to build the link information
        for node, neighbors in self.adjacency_map.items():
            for neighbor in neighbors:
                link = self.attributes_map[(node, neighbor)].copy()
                link["source"] = node_to_number[node]
                link["target"] = node_to_number[neighbor]
                data["links"].append(link)
        return data
|
||||
|
||||
|
||||
def strongly_connected_components(G):  # noqa: C901
    """
    Yields the strongly connected components of a directed graph.

    Adapted from networkx: http://networkx.github.io/

    The traversal is an iterative depth-first search (no recursion), so
    G only needs to provide nodes() and neighbors(node).

    Parameters
    ----------
    G : DiGraph

    Returns
    -------
    comp : generator of sets
        A generator of sets of nodes, one for each strongly connected
        component of G.
    """
    visit_order = {}  # node -> preorder number (1-based)
    low = {}  # node -> smallest preorder number reachable from it
    assigned = set()  # nodes already emitted as part of a component
    pending = []  # finished nodes still waiting for their component root
    counter = 0  # Preorder counter
    for root in G.nodes():
        if root in assigned:
            continue
        dfs_stack = [root]
        while dfs_stack:
            v = dfs_stack[-1]
            if v not in visit_order:
                counter += 1
                visit_order[v] = counter
            successors = G.neighbors(v)
            for w in successors:
                if w not in visit_order:
                    # Descend into the first unvisited successor; v stays
                    # on the stack and is looked at again afterwards.
                    dfs_stack.append(w)
                    break
            else:
                # Every successor has been visited: v can be finished.
                v_order = visit_order[v]
                v_low = v_order
                for w in successors:
                    if w in assigned:
                        continue
                    if visit_order[w] > v_order:
                        v_low = min(v_low, low[w])
                    else:
                        v_low = min(v_low, visit_order[w])
                low[v] = v_low
                dfs_stack.pop()
                if v_low == v_order:
                    # v is the root of a component: collect every pending
                    # node that was visited after it.
                    assigned.add(v)
                    component = {v}
                    while pending and visit_order[pending[-1]] > v_order:
                        member = pending.pop()
                        assigned.add(member)
                        component.add(member)
                    yield component
                else:
                    pending.append(v)
|
||||
|
||||
|
||||
def simple_cycles(G):  # noqa: C901
    """
    Finds the elementary cycles of a directed graph.

    Adapted from networkx: http://networkx.github.io/

    G itself is not modified: the search runs on a copy built with
    G.subgraph(G.nodes()), and nodes are removed from that copy only.

    Parameters
    ----------
    G : DiGraph

    Returns
    -------
    cycle_generator: generator
       A generator that produces elementary cycles of the graph.
       Each cycle is represented by a list of nodes along the cycle.
       The start node is not repeated at the end of the list.
    """

    def _unblock(thisnode, blocked, B):
        # Remove thisnode from `blocked` and then, transitively, every node
        # that B records against a node unblocked here.
        stack = {thisnode}
        while stack:
            node = stack.pop()
            if node in blocked:
                blocked.remove(node)
                stack.update(B[node])
                B[node].clear()

    # Johnson's algorithm requires some ordering of the nodes.
    # We assign the arbitrary ordering given by the strongly connected comps.
    # There is no need to track the ordering as each node is removed once it
    # has been processed.
    # Work on a copy of the actual graph so we can mutate it here.
    # We only take the edges because we do not want to
    # copy edge and node attributes here.
    subG = G.subgraph(G.nodes())
    sccs = list(strongly_connected_components(subG))
    while sccs:
        scc = sccs.pop()
        # order of scc determines ordering of nodes
        startnode = scc.pop()
        # Processing node runs 'circuit' routine from recursive version
        path = [startnode]
        blocked = set()  # vertex: blocked from search?
        closed = set()  # nodes involved in a cycle
        blocked.add(startnode)
        B = defaultdict(set)  # graph portions that yield no elementary circuit
        # Each stack entry is (node, list of its neighbors not yet explored).
        stack = [(startnode, list(subG.neighbors(startnode)))]
        while stack:
            thisnode, nbrs = stack[-1]
            if nbrs:
                nextnode = nbrs.pop()
                if nextnode == startnode:
                    # The path has returned to its start: report a copy so
                    # that later changes to `path` do not affect the caller.
                    yield path[:]
                    closed.update(path)
                elif nextnode not in blocked:
                    path.append(nextnode)
                    stack.append((nextnode, list(subG.neighbors(nextnode))))
                    closed.discard(nextnode)
                    blocked.add(nextnode)
                    continue
            # done with nextnode... look for more neighbors
            if not nbrs:  # no more nbrs
                if thisnode in closed:
                    _unblock(thisnode, blocked, B)
                else:
                    # thisnode stays blocked; record it against each of its
                    # neighbors so that _unblock on one of them releases it.
                    for nbr in subG.neighbors(thisnode):
                        if thisnode not in B[nbr]:
                            B[nbr].add(thisnode)
                stack.pop()
                path.pop()
        # done processing this node
        subG.remove_node(startnode)
        H = subG.subgraph(scc)  # make smaller to avoid work in SCC routine
        sccs.extend(list(strongly_connected_components(H)))
|
||||
|
||||
|
||||
def find_cycle(graph):
    """
    Looks for a cycle in the graph. If found, returns the first cycle.
    If nodes a1, a2, ..., an are in a cycle, then this returns:
        [(a1,a2), (a2,a3), ... (an-1,an), (an, a1)]
    Otherwise returns an empty list.
    """
    # simple_cycles() is a generator, so only the first cycle is computed.
    # Listing every elementary cycle can be very expensive on a dense graph
    # and only the first one is ever reported.
    nodes = next(iter(simple_cycles(graph)), None)
    if nodes is None:
        return []
    # Pair each node with its successor, wrapping the last node around to
    # the first to close the cycle.
    return list(zip(nodes, nodes[1:] + nodes[:1]))
|
||||
|
||||
|
||||
def get_stacktrace(thread_id):
    """
    Returns the stack trace for the thread id as a list of strings.

    Note that this makes `thread_id` the selected thread in gdb and
    leaves it selected.
    """
    select_thread = "thread %d" % thread_id
    gdb.execute(select_thread, from_tty=False, to_string=True)
    backtrace = gdb.execute("bt", from_tty=False, to_string=True)
    return backtrace.strip().split("\n")
|
||||
|
||||
|
||||
def is_thread_blocked_with_frame(
    thread_id, top_line, expected_top_lines, expected_frame
):
    """
    Returns True if at least one entry of expected_top_lines occurs in
    top_line, and expected_frame occurs in some line of the thread's
    stack trace.

    The stack trace is only fetched when top_line matches.
    """
    top_line_matches = any(marker in top_line for marker in expected_top_lines)
    if not top_line_matches:
        return False
    for frame_line in get_stacktrace(thread_id):
        if expected_frame in frame_line:
            return True
    return False
|
||||
|
||||
|
||||
class MutexType(Enum):
    """Types of mutexes for which deadlocks can be detected."""

    PTHREAD_MUTEX_T = "pthread_mutex_t"
    PTHREAD_RWLOCK_T = "pthread_rwlock_t"

    @staticmethod
    def get_mutex_type(thread_id, top_line):
        """
        Guesses which kind of mutex the thread is blocked on.

        top_line must contain one of the known wait functions, and the
        thread's stack trace must contain a frame mentioning the mutex
        kind. Returns the matching MutexType, or None if nothing matches.
        """
        # Kept local to the method: a class-level attribute here would be
        # turned into an enum member.
        wait_functions = (
            "__lll_lock_wait",
            "futex_abstimed_wait",
            "futex_abstimed_wait_cancelable",
            "futex_reltimed_wait",
            "futex_reltimed_wait_cancelable",
            "futex_wait",
            "futex_wait_cancelable",
        )
        # Checked in this order; the first match wins.
        candidates = (
            ("pthread_mutex", MutexType.PTHREAD_MUTEX_T),
            ("pthread_rwlock", MutexType.PTHREAD_RWLOCK_T),
        )
        for frame_marker, kind in candidates:
            if is_thread_blocked_with_frame(
                thread_id, top_line, wait_functions, frame_marker
            ):
                return kind
        return None

    @staticmethod
    def get_mutex_owner_and_address_func_for_type(mutex_type):
        """
        Returns the function that resolves the mutex owner and address
        for the given type. The returned function f has the following
        signature:

            f: args: (map of thread lwp -> thread id), blocked thread lwp
               returns: (lwp of thread owning mutex, mutex address)
                        or (None, None) if not found.

        Returns None if there is no function for this mutex_type.
        """
        resolver = None
        if mutex_type == MutexType.PTHREAD_MUTEX_T:
            resolver = get_pthread_mutex_t_owner_and_address
        elif mutex_type == MutexType.PTHREAD_RWLOCK_T:
            resolver = get_pthread_rwlock_t_owner_and_address
        return resolver
|
||||
|
||||
|
||||
def print_cycle(graph, lwp_to_thread_id, cycle):
    """
    Prints the threads and mutexes involved in the deadlock.

    Prints one line per edge (waiting LWP, owning LWP) of `cycle`, using
    the "mutex_type" and "mutex" attributes stored on that edge in `graph`.
    """
    template = (
        "Thread %d (LWP %d) is waiting on %s (0x%016x) held by "
        "Thread %d (LWP %d)"
    )
    for waiter_lwp, owner_lwp in cycle:
        edge = graph.attributes(waiter_lwp, owner_lwp)
        fields = (
            lwp_to_thread_id[waiter_lwp],
            waiter_lwp,
            edge["mutex_type"].value,
            edge["mutex"],
            lwp_to_thread_id[owner_lwp],
            owner_lwp,
        )
        print(template % fields)
|
||||
|
||||
|
||||
def get_thread_info():
    """
    Parses the output of gdb's `info threads`.

    Returns a pair of:
    - map of LWP -> thread ID
    - map of blocked threads LWP -> potential mutex type
    """
    thread_line = re.compile(r"[\s\*]*(\d+).*Thread.*\(LWP (\d+)\).*")
    listing = gdb.execute("info threads", from_tty=False, to_string=True)

    # LWP -> thread ID
    lwp_to_thread_id = {}
    # LWP -> potential mutex type it is blocked on
    blocked_threads = {}

    # The first line is skipped (presumably the column header).
    for row in listing.strip().split("\n")[1:]:
        found = thread_line.match(row)
        if found is None:
            # Not a thread line; ignore it.
            continue
        thread_id, thread_lwp = (int(group) for group in found.groups())
        lwp_to_thread_id[thread_lwp] = thread_id
        try:
            mutex_type = MutexType.get_mutex_type(thread_id, row)
        except Exception:
            # Best effort: a thread that cannot be inspected is treated as
            # not blocked, but it stays in lwp_to_thread_id.
            continue
        if mutex_type:
            blocked_threads[thread_lwp] = mutex_type

    return (lwp_to_thread_id, blocked_threads)
|
||||
|
||||
|
||||
def get_pthread_mutex_t_owner_and_address(lwp_to_thread_id, thread_lwp):
    """
    Finds the thread holding the mutex that this thread is blocked on.

    Returns a pair of (lwp of thread owning mutex, mutex address),
    or (None, None) if gdb cannot evaluate the mutex.

    This switches gdb's selected thread and frame as a side effect.
    """
    blocked_thread_id = lwp_to_thread_id[thread_lwp]
    gdb.execute("thread %d" % blocked_thread_id, from_tty=False, to_string=True)
    # Go up the stack to the pthread_mutex_lock frame, which is assumed to
    # be frame 1 and to have a variable named `mutex`.
    gdb.execute("frame 1", from_tty=False, to_string=True)

    # The owner is read from the internal fields of the mutex.
    try:
        mutex = gdb.parse_and_eval("mutex").dereference()
        owner_lwp = int(mutex["__data"]["__owner"])
        address = int(mutex.address)
    except gdb.error:
        return (None, None)
    return (owner_lwp, address)
|
||||
|
||||
|
||||
def get_pthread_rwlock_t_owner_and_address(lwp_to_thread_id, thread_lwp):
    """
    If the thread is waiting on a write-locked pthread_rwlock_t, this will
    return the pair of:
        (lwp of thread that is write-owning the mutex, mutex address)
    or (None, None) if not found, or if the mutex is read-locked.

    (None, None) is also returned when the rwlock structure has neither
    of the known writer fields.

    This switches gdb's selected thread and frame as a side effect.
    """
    # Go up the stack to the pthread_rwlock_{rd|wr}lock frame, which is
    # assumed to be frame 2 and to have a variable named `rwlock`.
    gdb.execute(
        "thread %d" % lwp_to_thread_id[thread_lwp], from_tty=False, to_string=True
    )
    gdb.execute("frame 2", from_tty=False, to_string=True)

    # Get the owner of the mutex by inspecting the internal
    # fields of the mutex.
    try:
        rwlock_info = gdb.parse_and_eval("rwlock").dereference()
        rwlock_data = rwlock_info["__data"]
        # The writer field is looked up under either name (presumably it
        # differs between pthread library versions).
        writer_field_names = ("__cur_writer", "__writer")
        writer_field = next(
            (f for f in rwlock_data.type.fields() if f.name in writer_field_names),
            None,
        )
        if writer_field is None:
            # Unknown structure layout: report "not found" instead of
            # failing the whole command with an IndexError.
            return (None, None)
        rwlock_owner_lwp = int(rwlock_data[writer_field])
        # We can only track the owner if it is currently write-locked.
        # If it is not write-locked or if it is currently read-locked,
        # possibly by multiple threads, we cannot find the owner.
        if rwlock_owner_lwp == 0:
            return (None, None)
        return (rwlock_owner_lwp, int(rwlock_info.address))
    except gdb.error:
        return (None, None)
|
||||
|
||||
|
||||
class Deadlock(gdb.Command):
    """Detects deadlocks"""

    def __init__(self):
        # Registers this object as the gdb command named "deadlock".
        super(Deadlock, self).__init__("deadlock", gdb.COMMAND_NONE)

    def invoke(self, arg, from_tty):
        """
        Prints the threads and mutexes in a deadlock, if it exists.

        `arg` and `from_tty` are supplied by gdb and are not used.
        The thread and frame that were selected before the command ran
        are selected again afterwards.
        """
        # Inspecting the blocked threads switches gdb's selected thread and
        # frame; remember the user's selection so it can be put back.
        previous_thread = gdb.selected_thread()
        try:
            previous_frame = gdb.selected_frame()
        except gdb.error:
            # No frame is selected (e.g. nothing is being debugged).
            previous_frame = None
        try:
            self._report_deadlock()
        finally:
            self._restore_selection(previous_thread, previous_frame)

    @staticmethod
    def _report_deadlock():
        """Builds the thread wait graph and prints a cycle if one exists."""
        lwp_to_thread_id, blocked_threads = get_thread_info()

        # Nodes represent threads. Edge (A,B) exists if thread A
        # is waiting on a mutex held by thread B.
        graph = DiGraph()

        # Go through all the blocked threads and see which threads
        # they are blocked on, and build the thread wait graph.
        for thread_lwp, mutex_type in blocked_threads.items():
            get_owner_and_address_func = (
                MutexType.get_mutex_owner_and_address_func_for_type(mutex_type)
            )
            if not get_owner_and_address_func:
                continue
            mutex_owner_lwp, mutex_address = get_owner_and_address_func(
                lwp_to_thread_id, thread_lwp
            )
            # Skips both "not found" (None) and a zero owner or address.
            if mutex_owner_lwp and mutex_address:
                graph.add_edge(
                    thread_lwp,
                    mutex_owner_lwp,
                    mutex=mutex_address,
                    mutex_type=mutex_type,
                )

        # A deadlock exists if there is a cycle in the graph.
        cycle = find_cycle(graph)
        if cycle:
            print("Found deadlock!")
            print_cycle(graph, lwp_to_thread_id, cycle)
        else:
            print("No deadlock detected. " "Do you have debug symbols installed?")

    @staticmethod
    def _restore_selection(thread, frame):
        """Re-selects the given thread and frame if they are still valid."""
        if thread is None or not thread.is_valid():
            return
        try:
            thread.switch()
            if frame is not None and frame.is_valid():
                frame.select()
        except gdb.error:
            # Best effort only: failing to restore the selection must not
            # hide the result (or the error) of the detection itself.
            pass
|
||||
|
||||
|
||||
def load():
    """Creates the `deadlock` gdb command and tells the user how to run it."""
    # Instantiating the command is enough; the instance is not kept here.
    Deadlock()
    announcement = 'Type "deadlock" to detect deadlocks.'
    print(announcement)
|
||||
|
||||
|
||||
def info():
    """Returns a one-line description of what this script does."""
    description = "Detect deadlocks"
    return description
|
||||
|
||||
|
||||
# When this file is run as a script, create the `deadlock` command
# right away and print the usage hint.
if __name__ == "__main__":
    load()
|
Loading…
Reference in New Issue