Core Development > Code coverage > Doc/tools/extensions/suspicious.py

Python code coverage for Doc/tools/extensions/suspicious.py

#countcontent
1n/a"""
2n/aTry to detect suspicious constructs, resembling markup
3n/athat has leaked into the final output.
4n/a
5n/aSuspicious lines are reported in a comma-separated-file,
6n/a``suspicious.csv``, located in the output directory.
7n/a
8n/aThe file is utf-8 encoded, and each line contains four fields:
9n/a
10n/a * document name (normalized)
11n/a * line number in the source document
12n/a * problematic text
13n/a * complete line showing the problematic text in context
14n/a
15n/aIt is common to find many false positives. To avoid reporting them
16n/aagain and again, they may be added to the ``ignored.csv`` file
17n/a(located in the configuration directory). The file has the same
18n/aformat as ``suspicious.csv`` with a few differences:
19n/a
20n/a - each line defines a rule; if the rule matches, the issue
21n/a is ignored.
22n/a - line number may be empty (that is, nothing between the
23n/a commas: ",,"). In this case, line numbers are ignored (the
24n/a rule matches anywhere in the file).
25n/a - the last field does not have to be a complete line; some
26n/a surrounding text (never more than a line) is enough for
27n/a context.
28n/a
29n/aRules are processed sequentially. A rule matches when:
30n/a
31n/a * document names are the same
32n/a * problematic texts are the same
33n/a * line numbers are close to each other (5 lines up or down)
34n/a * the rule text is completely contained into the source line
35n/a
36n/aThe simplest way to create the ignored.csv file is by copying
37n/aundesired entries from suspicious.csv (possibly trimming the last
38n/afield.)
39n/a
40n/aCopyright 2009 Gabriel A. Genellina
41n/a
42n/a"""
43n/a
44n/aimport os
45n/aimport re
46n/aimport csv
47n/aimport sys
48n/a
49n/afrom docutils import nodes
50n/afrom sphinx.builders import Builder
51n/a
# Matcher for markup fragments that look like reST which leaked into the
# rendered output.  Bound to ``finditer`` so callers iterate match objects.
detect_all = re.compile(r'''
    ::(?=[^=])|            # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+| # :foo
    `|                     # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:  # .. foo: (but NOT ... else:)
    ''', re.UNICODE | re.VERBOSE).finditer

# True when running under Python 3; selects the str vs. bytes code paths below.
py3 = sys.version_info >= (3, 0)
60n/a
61n/a
class Rule:
    """One ignore rule, as loaded from ``susp-ignored.csv``."""

    def __init__(self, docname, lineno, issue, line):
        """A rule for ignoring issues"""
        # Document to which this rule applies.
        self.docname = docname
        # Line number in the original source; the rule matches only near
        # that line.  None means "don't care" (match anywhere in the file).
        self.lineno = lineno
        # The markup fragment that triggered this rule.
        self.issue = issue
        # Text of the container element (single line only).
        self.line = line
        # Flipped to True the first time the rule suppresses an issue.
        self.used = False

    def __repr__(self):
        # Same comma-separated shape as an ignored.csv row, lineno omitted.
        return '%s,,%s,%s' % (self.docname, self.issue, self.line)
75n/a
76n/a
77n/a
class dialect(csv.excel):
    """CSV dialect for our log files: identical to ``excel`` except that
    rows end with a bare linefeed instead of CRLF."""
    lineterminator = "\n"
81n/a
82n/a
class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.

    Produces no documents; instead it appends findings to
    ``suspicious.csv`` in the output directory and warns about each one.
    """
    name = 'suspicious'

    def init(self):
        """Create an empty output file and load the ignore-rule database."""
        # create output file (truncating any previous run's results)
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        open(self.log_file_name, 'w').close()
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__), '..',
                        'susp-ignored.csv'))

    def get_outdated_docs(self):
        # Re-check every document on every run; there is no cached output.
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        # No documents are produced, so target URIs are meaningless.
        return ''

    def prepare_writing(self, docnames):
        pass

    def write_doc(self, docname, doctree):
        """Walk *doctree* and check every text node for suspicious markup."""
        # set when any issue is encountered in this document
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        """Report ignore rules that never matched anything."""
        unused_rules = [rule for rule in self.rules if not rule.used]
        if unused_rules:
            self.warn('Found %s/%s unused rules:' %
                      (len(unused_rules), len(self.rules)))
            for rule in unused_rules:
                self.info(repr(rule))
        return

    def check_issue(self, line, lineno, issue):
        """Report *issue* unless an ignore rule suppresses it."""
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored."""
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname: continue
            if rule.issue != issue: continue
            # Both lines must match *exactly*. This is rather strict,
            # and probably should be improved.
            # Doing fuzzy matches with levenshtein distance could work,
            # but that means bringing other libraries...
            # Ok, relax that requirement: just check if the rule fragment
            # is contained in the document line
            if rule.line not in line: continue
            # Check both line numbers. If they're "near"
            # this rule matches. (lineno=None means "don't care")
            if (rule.lineno is not None) and \
                abs(rule.lineno - lineno) > 5: continue
            # if it came this far, the rule matched
            rule.used = True
            return True
        return False

    def report_issue(self, text, lineno, issue):
        """Log one finding, warn about it, and mark the build as failed."""
        if not self.any_issue: self.info()
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        if py3:
            self.warn('[%s:%d] "%s" found in "%-.120s"' %
                      (self.docname, lineno, issue, text))
        else:
            # Python 2: the console may not accept unicode directly.
            self.warn('[%s:%d] "%s" found in "%-.120s"' % (
                self.docname.encode(sys.getdefaultencoding(),'replace'),
                lineno,
                issue.encode(sys.getdefaultencoding(),'replace'),
                text.strip().encode(sys.getdefaultencoding(),'replace')))
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        """Append one CSV row to ``suspicious.csv``.

        Uses a context manager so the handle is closed even if the csv
        writer raises (the previous version leaked it on error).
        """
        if py3:
            with open(self.log_file_name, 'a') as f:
                writer = csv.writer(f, dialect)
                writer.writerow([self.docname, lineno, issue, text.strip()])
        else:
            with open(self.log_file_name, 'ab') as f:
                writer = csv.writer(f, dialect)
                writer.writerow([self.docname.encode('utf-8'),
                                 lineno,
                                 issue.encode('utf-8'),
                                 text.strip().encode('utf-8')])

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A csv file, with exactly the same format as suspicious.csv
        Fields: document name (normalized), line number, issue, surrounding text

        A missing file is not an error (no rules are loaded); a malformed
        row raises ValueError.
        """
        self.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        try:
            f = open(filename, 'r' if py3 else 'rb')
        except IOError:
            return
        # Context manager guarantees the file is closed even when a
        # malformed row raises (the previous version left it open).
        with f:
            for i, row in enumerate(csv.reader(f)):
                if len(row) != 4:
                    raise ValueError(
                        "wrong format in %s, line %d: %s" % (filename, i+1, row))
                docname, lineno, issue, text = row
                # Empty lineno field means "match anywhere in the file".
                lineno = int(lineno) if lineno else None
                if not py3:
                    docname = docname.decode('utf-8')
                    issue = issue.decode('utf-8')
                    text = text.decode('utf-8')
                rules.append(Rule(docname, lineno, issue, text))
        self.info('done, %d rules loaded' % len(self.rules))
210n/a
211n/a
def get_lineno(node):
    """Obtain line number information for a node.

    Walks up the parent chain until an ancestor reports a line number.
    Returns None when no ancestor knows one.
    """
    lineno = None
    while lineno is None and node:
        node = node.parent
        if node is None:
            # Reached the tree root without a line number; the previous
            # version crashed here with AttributeError on ``None.line``.
            break
        lineno = node.line
    return lineno
219n/a
220n/a
def extract_line(text, index):
    """Return the single line of *text* (a possibly multiline string)
    that contains the character at position *index*.

    For ``"abc\\ndefgh\\ni"``, indexes 0-3 yield ``'abc'``, 4-9 yield
    ``'defgh'``, and 10 yields ``'i'``.
    """
    # Start just past the previous newline (or at 0 if there is none:
    # rfind returns -1, and -1 + 1 == 0).
    begin = text.rfind('\n', 0, index) + 1
    stop = text.find('\n', index)
    return text[begin:] if stop < 0 else text[begin:stop]
241n/a
242n/a
class SuspiciousVisitor(nodes.GenericNodeVisitor):
    """Doctree visitor that funnels every text-bearing node through the
    builder's issue checker."""

    # Highest source line seen so far in the current document.
    lastlineno = 0

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder

    def default_visit(self, node):
        # Only Text and image nodes are direct text containers.
        if not isinstance(node, (nodes.Text, nodes.image)):
            return
        text = node.astext()
        # lineno seems to go backwards sometimes (?) -- clamp it so it
        # never moves before a line we have already passed.
        lineno = max(get_lineno(node) or 0, self.lastlineno)
        self.lastlineno = lineno
        # Don't report the same issue more than once per line.
        reported = set()
        for match in detect_all(text):
            fragment = match.group()
            context = extract_line(text, match.start())
            key = (fragment, context)
            if key not in reported:
                self.builder.check_issue(context, lineno, fragment)
                reported.add(key)

    unknown_visit = default_visit

    def visit_document(self, node):
        # Reset the line tracker at the start of each document.
        self.lastlineno = 0

    def visit_comment(self, node):
        # ignore comments -- too much false positives.
        # (although doing this could miss some errors;
        # there were two sections "commented-out" by mistake
        # in the Python docs that would not be caught)
        raise nodes.SkipNode