Core Development > Code coverage > Tools/i18n/pygettext.py

Python code coverage for Tools/i18n/pygettext.py

#! /usr/bin/env python3
# -*- coding: iso-8859-1 -*-
# Originally written by Barry Warsaw <barry@python.org>
#
# Minimally patched to make it even more xgettext compatible
# by Peter Funk <pf@artcom-gmbh.de>
#
# 2002-11-22 Jürgen Hermann <jh@web.de>
# Added checks that _() only contains string literals, and
# command line args are resolved to module lists, i.e. you
# can now pass a filename, a module or package name, or a
# directory (including globbing chars, important for Win32).
# Made docstring fit in 80 chars wide displays using pydoc.
#

# for selftesting
try:
    import fintl
    _ = fintl.gettext
except ImportError:
    # fintl is unavailable: fall back to the identity function so this
    # module's own _() calls still work.
    _ = lambda s: s
# Module help text; usage() prints this after %-interpolating globals()
# (which supplies DEFAULTKEYWORDS below), so keep %-escapes valid.
__doc__ = _("""pygettext -- Python equivalent of xgettext(1)

Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
internationalization of C programs. Most of these tools are independent of
the programming language and can be used from within Python programs.
Martin von Loewis' work[1] helps considerably in this regard.

There's one problem though; xgettext is the program that scans source code
looking for message strings, but it groks only C (or C++). Python
introduces a few wrinkles, such as dual quoting characters, triple quoted
strings, and raw strings. xgettext understands none of this.

Enter pygettext, which uses Python's standard tokenize module to scan
Python source code, generating .pot files identical to what GNU xgettext[2]
generates for C and C++ code. From there, the standard GNU tools can be
used.

A word about marking Python strings as candidates for translation. GNU
xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
and gettext_noop. But those can be a lot of text to include all over your
code. C and C++ have a trick: they use the C preprocessor. Most
internationalized C source includes a #define for gettext() to _() so that
what has to be written in the source is much less. Thus these are both
translatable strings:

    gettext("Translatable String")
    _("Translatable String")

Python of course has no preprocessor so this doesn't work so well. Thus,
pygettext searches only for _() by default, but see the -k/--keyword flag
below for how to augment this.

 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
 [2] http://www.gnu.org/software/gettext/gettext.html

NOTE: pygettext attempts to be option and feature compatible with GNU
xgettext where ever possible. However some options are still missing or are
not fully implemented. Also, xgettext's use of command line switches with
option arguments is broken, and in these cases, pygettext just defines
additional switches.

Usage: pygettext [options] inputfile ...

Options:

    -a
    --extract-all
        Extract all strings.

    -d name
    --default-domain=name
        Rename the default output file from messages.pot to name.pot.

    -E
    --escape
        Replace non-ASCII characters with octal escape sequences.

    -D
    --docstrings
        Extract module, class, method, and function docstrings. These do
        not need to be wrapped in _() markers, and in fact cannot be for
        Python to consider them docstrings. (See also the -X option).

    -h
    --help
        Print this help message and exit.

    -k word
    --keyword=word
        Keywords to look for in addition to the default set, which are:
        %(DEFAULTKEYWORDS)s

        You can have multiple -k flags on the command line.

    -K
    --no-default-keywords
        Disable the default set of keywords (see above). Any keywords
        explicitly added with the -k/--keyword option are still recognized.

    --no-location
        Do not write filename/lineno location comments.

    -n
    --add-location
        Write filename/lineno location comments indicating where each
        extracted string is found in the source. These lines appear before
        each msgid. The style of comments is controlled by the -S/--style
        option. This is the default.

    -o filename
    --output=filename
        Rename the default output file from messages.pot to filename. If
        filename is `-' then the output is sent to standard out.

    -p dir
    --output-dir=dir
        Output files will be placed in directory dir.

    -S stylename
    --style stylename
        Specify which style to use for location comments. Two styles are
        supported:

        Solaris  # File: filename, line: line-number
        GNU      #: filename:line

        The style name is case insensitive. GNU style is the default.

    -v
    --verbose
        Print the names of the files being processed.

    -V
    --version
        Print the version of pygettext and exit.

    -w columns
    --width=columns
        Set width of output to columns.

    -x filename
    --exclude-file=filename
        Specify a file that contains a list of strings that are not be
        extracted from the input files. Each string to be excluded must
        appear on a line by itself in the file.

    -X filename
    --no-docstrings=filename
        Specify a file that contains a list of files (one per line) that
        should not have their docstrings extracted. This is only useful in
        conjunction with the -D option above.

If `inputfile' is -, standard input is read.
""")
158n/aimport os
159n/aimport importlib.machinery
160n/aimport importlib.util
161n/aimport sys
162n/aimport glob
163n/aimport time
164n/aimport getopt
165n/aimport token
166n/aimport tokenize
__version__ = '1.5'

# Default keyword(s) whose call arguments are extracted; -k appends to this
# list and -K empties it (see main()).
default_keywords = ['_']
# Interpolated into __doc__ by usage() via %(DEFAULTKEYWORDS)s.
DEFAULTKEYWORDS = ', '.join(default_keywords)

# Joiner for adjacent string fragments collected inside one _() call
# (see TokenEater.__openseen).
EMPTYSTRING = ''
# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there.
# %(time)s, %(version)s, %(charset)s and %(encoding)s are filled in by
# TokenEater.write().
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=%(charset)s\\n"
"Content-Transfer-Encoding: %(encoding)s\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')
def usage(code, msg=''):
    """Print the interpolated module help text (plus an optional extra
    message) to stderr, then exit with status `code'."""
    err = sys.stderr
    print(__doc__ % globals(), file=err)
    if msg:
        print(msg, file=err)
    sys.exit(code)
def make_escapes(pass_nonascii):
    """(Re)build the global `escapes' table and select the global `escape'
    function used by normalize().

    With pass_nonascii true, only characters below 128 are table-escaped so
    that e.g. 'msgid "Höhe"' does not become 'msgid "H\\366he"'; otherwise
    every character outside the printable 32..126 range is octal-escaped.
    """
    global escapes, escape
    limit = 128 if pass_nonascii else 256
    escape = escape_ascii if pass_nonascii else escape_nonascii
    # Start with octal escapes for every code point, then carve out the
    # printable ASCII range and the conventional backslash escapes.
    table = [r"\%03o" % code for code in range(limit)]
    for code in range(32, 127):
        table[code] = chr(code)
    for ch, seq in (('\\', r'\\'), ('\t', r'\t'), ('\r', r'\r'),
                    ('\n', r'\n'), ('"', r'\"')):
        table[ord(ch)] = seq
    escapes = table
def escape_ascii(s, encoding):
    """Escape ASCII characters through the global table; characters at or
    above 128 pass through unchanged (encoding is unused here)."""
    pieces = []
    for ch in s:
        code = ord(ch)
        pieces.append(escapes[code] if code < 128 else ch)
    return ''.join(pieces)
def escape_nonascii(s, encoding):
    """Encode s and map every resulting byte through the global escapes
    table (so non-ASCII bytes come out as octal escapes)."""
    raw = s.encode(encoding)
    return ''.join(map(escapes.__getitem__, raw))
def safe_eval(s):
    """Evaluate a string-literal token and return the resulting string.

    NOTE(review): this is still eval() on source-file text; builtins are
    stripped, which limits but does not eliminate what a hostile input
    file could do.  Flagged for review rather than changed here.
    """
    # unwrap quotes, safely
    namespace = {'__builtins__': {}}
    return eval(s, namespace, {})
def normalize(s, encoding):
    """Render a Python string in .po-file form (much closer to C style).

    Single-line strings become one quoted line; multi-line strings become
    an empty msgid line followed by one quoted line per source line, with
    explicit \\n escapes at each break.
    """
    parts = s.split('\n')
    if len(parts) == 1:
        return '"' + escape(s, encoding) + '"'
    if not parts[-1]:
        # The string ended with a newline: drop the empty tail and put the
        # newline back onto the (new) last line so it round-trips.
        del parts[-1]
        parts[-1] += '\n'
    quoted = [escape(chunk, encoding) for chunk in parts]
    return '""\n"' + '\\n"\n"'.join(quoted) + '"'
def containsAny(str, set):
    """Check whether 'str' contains ANY of the chars in 'set'"""
    # any() short-circuits and avoids materializing the whole membership
    # list that `1 in [c in str for c in set]' built; result is the same
    # bool.  (Parameter names shadow builtins but are kept for interface
    # compatibility.)
    return any(c in str for c in set)
def _visit_pyfiles(list, dirname, names):
    """Helper for getFilesForName()."""
    # NOTE(review): the (list, dirname, names) signature matches the old
    # Python 2 os.path.walk() callback protocol; getFilesForName() passes
    # this to os.walk(), which takes no callback, so confirm this helper is
    # actually reachable.
    # get extension for python source files
    if '_py_ext' not in globals():
        global _py_ext
        # cache the canonical Python source suffix (normally '.py')
        _py_ext = importlib.machinery.SOURCE_SUFFIXES[0]

    # don't recurse into CVS directories
    if 'CVS' in names:
        names.remove('CVS')

    # add all *.py files to list
    list.extend(
        [os.path.join(dirname, file) for file in names
         if os.path.splitext(file)[1] == _py_ext]
        )
def getFilesForName(name):
    """Get a list of module files for a filename, a module or package name,
    or a directory.

    Non-existent names are first expanded as globs (recursing into the
    matches), then tried as importable module/package names.  Directories
    are walked recursively for Python source files; plain files are
    returned as a one-element list.  Returns [] when nothing matches.
    """
    if not os.path.exists(name):
        # check for glob chars
        if containsAny(name, "*?[]"):
            files = glob.glob(name)
            list = []
            for file in files:
                list.extend(getFilesForName(file))
            return list

        # try to find module or package
        try:
            spec = importlib.util.find_spec(name)
            # find_spec() can return None instead of raising for some
            # missing modules; treat that the same as an ImportError.
            name = spec.origin if spec is not None else None
        except ImportError:
            name = None
        if not name:
            return []

    if os.path.isdir(name):
        # find all python files in directory.  os.walk() is a generator
        # taking no visitor callback -- the old os.walk(name,
        # _visit_pyfiles, list) call fed the extra args into topdown/onerror
        # and never consumed the generator, so directories always produced
        # an empty result.  Walk explicitly instead.
        list = []
        py_ext = importlib.machinery.SOURCE_SUFFIXES[0]
        for dirname, subdirs, files in os.walk(name):
            # don't recurse into CVS directories
            if 'CVS' in subdirs:
                subdirs.remove('CVS')
            list.extend(os.path.join(dirname, file) for file in files
                        if os.path.splitext(file)[1] == py_ext)
        return list
    elif os.path.exists(name):
        # a single file
        return [name]

    return []
class TokenEater:
    """State machine fed one tokenize token at a time (via __call__).

    It watches for calls to the configured keywords (e.g. _()) and,
    optionally, for module/class/function docstrings, accumulating each
    extracted message in self.__messages keyed by message text; write()
    then emits everything as a .pot file.
    """
    def __init__(self, options):
        self.__options = options
        # msg -> {(filename, lineno): isdocstring}
        self.__messages = {}
        # current state handler; each handler may switch to the next state
        self.__state = self.__waiting
        # string fragments seen inside the current keyword call
        self.__data = []
        # line where the current keyword call started
        self.__lineno = -1
        # true until the first significant token of the current file
        self.__freshmodule = 1
        self.__curfile = None

    def __call__(self, ttype, tstring, stup, etup, line):
        # dispatch to the current state handler with the token's start line
##        import token
##        print >> sys.stderr, 'ttype:', token.tok_name[ttype], \
##              'tstring:', tstring
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        # Default state: look for keyword names, class/def (for docstring
        # extraction), or the module docstring of a fresh file.
        opts = self.__options
        # Do docstring extractions, if enabled
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            # module docstring?
            if self.__freshmodule:
                if ttype == tokenize.STRING:
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                    self.__freshmodule = 0
                elif ttype not in (tokenize.COMMENT, tokenize.NL):
                    # any other significant token means the module has no
                    # docstring
                    self.__freshmodule = 0
                return
            # class docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__state = self.__suiteseen
                return
        if ttype == tokenize.NAME and tstring in opts.keywords:
            self.__state = self.__keywordseen

    def __suiteseen(self, ttype, tstring, lineno):
        # ignore anything until we see the colon
        if ttype == tokenize.OP and tstring == ':':
            self.__state = self.__suitedocstring

    def __suitedocstring(self, ttype, tstring, lineno):
        # ignore any intervening noise
        if ttype == tokenize.STRING:
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
                           tokenize.COMMENT):
            # there was no class docstring
            self.__state = self.__waiting

    def __keywordseen(self, ttype, tstring, lineno):
        # a keyword only counts as a call when immediately followed by '('
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings. Record the
            # line number of the first line of the strings and update the list
            # of messages seen. Reset state for the next batch. If there
            # were no strings inside _(), then just ignore this entry.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            # adjacent literals are concatenated at __addentry time
            self.__data.append(safe_eval(tstring))
        elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT,
                           token.NEWLINE, tokenize.NL]:
            # warn if we see anything else than STRING or whitespace
            print(_(
                '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
                ) % {
                'token': tstring,
                'file': self.__curfile,
                'lineno': self.__lineno
                }, file=sys.stderr)
            self.__state = self.__waiting

    def __addentry(self, msg, lineno=None, isdocstring=0):
        # record one occurrence of msg unless it was explicitly excluded
        if lineno is None:
            lineno = self.__lineno
        if not msg in self.__options.toexclude:
            entry = (self.__curfile, lineno)
            self.__messages.setdefault(msg, {})[entry] = isdocstring

    def set_filename(self, filename):
        # start a new input file and re-arm the module-docstring detector
        self.__curfile = filename
        self.__freshmodule = 1

    def write(self, fp):
        # Emit the pot header, then all collected messages, to fp.
        options = self.__options
        timestamp = time.strftime('%Y-%m-%d %H:%M%z')
        encoding = fp.encoding if fp.encoding else 'UTF-8'
        print(pot_header % {'time': timestamp, 'version': __version__,
                            'charset': encoding,
                            'encoding': '8bit'}, file=fp)
        # Sort the entries. First sort each particular entry's keys, then
        # sort all the entries by their first item.
        reverse = {}
        for k, v in self.__messages.items():
            keys = sorted(v.keys())
            reverse.setdefault(tuple(keys), []).append((k, v))
        rkeys = sorted(reverse.keys())
        for rkey in rkeys:
            rentries = reverse[rkey]
            rentries.sort()
            for k, v in rentries:
                # If the entry was gleaned out of a docstring, then add a
                # comment stating so. This is to aid translators who may wish
                # to skip translating some unimportant docstrings.
                isdocstring = any(v.values())
                # k is the message string, v is a dictionary-set of (filename,
                # lineno) tuples. We want to sort the entries in v first by
                # file name and then by line number.
                v = sorted(v.keys())
                if not options.writelocations:
                    pass
                # location comments are different b/w Solaris and GNU:
                elif options.locationstyle == options.SOLARIS:
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        print(_(
                            '# File: %(filename)s, line: %(lineno)d') % d, file=fp)
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceed 'options.width'
                    locline = '#:'
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)d') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            print(locline, file=fp)
                            locline = "#:" + s
                    if len(locline) > 2:
                        print(locline, file=fp)
                if isdocstring:
                    print('#, docstring', file=fp)
                print('msgid', normalize(k, encoding), file=fp)
                print('msgstr ""\n', file=fp)
def main():
    """Command-line entry point: parse options, expand the input file
    arguments, feed every file through a TokenEater, and write the .pot."""
    global default_keywords
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'ad:DEhk:Kno:p:S:Vvw:x:X:',
            ['extract-all', 'default-domain=', 'escape', 'help',
             'keyword=', 'no-default-keywords',
             'add-location', 'no-location', 'output=', 'output-dir=',
             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
             'docstrings', 'no-docstrings',
             ])
    except getopt.error as msg:
        usage(1, msg)

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0 # FIXME: currently this option has no effect at all.
        escape = 0
        keywords = []
        outpath = ''
        outfile = 'messages.pot'
        writelocations = 1
        locationstyle = GNU
        verbose = 0
        width = 78
        excludefilename = ''
        docstrings = 0
        nodocstrings = {}

    options = Options()
    locations = {'gnu' : options.GNU,
                 'solaris' : options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
            options.escape = 1
        elif opt in ('-D', '--docstrings'):
            options.docstrings = 1
        elif opt in ('-k', '--keyword'):
            options.keywords.append(arg)
        elif opt in ('-K', '--no-default-keywords'):
            default_keywords = []
        elif opt in ('-n', '--add-location'):
            options.writelocations = 1
        elif opt in ('--no-location',):
            options.writelocations = 0
        elif opt in ('-S', '--style'):
            options.locationstyle = locations.get(arg.lower())
            if options.locationstyle is None:
                usage(1, _('Invalid value for --style: %s') % arg)
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--output-dir'):
            options.outpath = arg
        elif opt in ('-v', '--verbose'):
            options.verbose = 1
        elif opt in ('-V', '--version'):
            print(_('pygettext.py (xgettext for Python) %s') % __version__)
            sys.exit(0)
        elif opt in ('-w', '--width'):
            try:
                options.width = int(arg)
            except ValueError:
                usage(1, _('--width argument must be an integer: %s') % arg)
        elif opt in ('-x', '--exclude-file'):
            options.excludefilename = arg
        elif opt in ('-X', '--no-docstrings'):
            # Each line names an input file whose docstrings should not be
            # extracted.  Strip only the trailing newline: the old
            # `line[:-1]' chopped the last real character off a final line
            # that lacked a newline.
            with open(arg) as fp:
                for line in fp:
                    options.nodocstrings[line.rstrip('\n')] = 1

    # calculate escapes
    make_escapes(not options.escape)

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # initialize list of strings to exclude
    if options.excludefilename:
        try:
            with open(options.excludefilename) as fp:
                options.toexclude = fp.readlines()
        except IOError:
            print(_(
                "Can't read --exclude-file: %s") % options.excludefilename, file=sys.stderr)
            sys.exit(1)
    else:
        options.toexclude = []

    # resolve args to module lists
    expanded = []
    for arg in args:
        if arg == '-':
            expanded.append(arg)
        else:
            expanded.extend(getFilesForName(arg))
    args = expanded

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if filename == '-':
            if options.verbose:
                print(_('Reading standard input'))
            fp = sys.stdin.buffer
            closep = 0
        else:
            if options.verbose:
                print(_('Working on %s') % filename)
            fp = open(filename, 'rb')
            closep = 1
        try:
            eater.set_filename(filename)
            try:
                tokens = tokenize.tokenize(fp.readline)
                for _token in tokens:
                    eater(*_token)
            except tokenize.TokenError as e:
                # report the error but keep going with the remaining files
                print('%s: %s, line %d, column %d' % (
                    e.args[0], filename, e.args[1][0], e.args[1][1]),
                    file=sys.stderr)
        finally:
            if closep:
                fp.close()

    # write the output
    if options.outfile == '-':
        fp = sys.stdout
        closep = 0
    else:
        if options.outpath:
            options.outfile = os.path.join(options.outpath, options.outfile)
        fp = open(options.outfile, 'w')
        closep = 1
    try:
        eater.write(fp)
    finally:
        if closep:
            fp.close()
if __name__ == '__main__':
    main()
    # some more test strings, used when running pygettext on itself
    # this one creates a warning
    _('*** Seen unexpected token "%(token)s"') % {'token': 'test'}
    _('more' 'than' 'one' 'string')