Python code coverage for Lib/gzip.py

#	count	content
1	n/a	"""Functions that read and write gzipped files.
2	n/a
3	n/a	The user of the file doesn't have to worry about the compression,
4	n/a	but random access is not allowed."""
5	n/a
6	n/a	# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7	n/a
8	n/a	import struct, sys, time, os
9	n/a	import zlib
10	n/a	import builtins
11	n/a	import io
12	n/a	import _compression
13	n/a
# Names exported by ``from gzip import *``.
__all__ = [
    "GzipFile",
    "open",
    "compress",
    "decompress",
]
15	n/a
# Bit masks tested against the flag byte of a gzip member header.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record how the file was opened.
READ = 1
WRITE = 2
19	n/a
def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a gzip-compressed file in binary or text mode.

    *filename* is either a path (str, bytes or os.PathLike object) or an
    already open file object to read from or write to.

    *mode* is one of "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for binary
    mode, or "rt", "wt", "xt" or "at" for text mode.  It defaults to "rb".
    *compresslevel* defaults to 9.

    In binary mode the result is the same as calling
    GzipFile(filename, mode, compresslevel), and *encoding*, *errors* and
    *newline* must be left as None (ValueError otherwise).

    In text mode the GzipFile is wrapped in an io.TextIOWrapper built with
    the given *encoding*, *errors* and *newline*.

    Raises ValueError for a mode containing both "t" and "b", and TypeError
    when *filename* is neither a path nor an object with a read or write
    attribute.
    """
    text_mode = "t" in mode

    if text_mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        # The text-layer options are meaningless without a TextIOWrapper.
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # GzipFile itself only understands binary modes.
    gz_mode = mode.replace("t", "")

    if isinstance(filename, (str, bytes, os.PathLike)):
        raw = GzipFile(filename, gz_mode, compresslevel)
    elif hasattr(filename, "read") or hasattr(filename, "write"):
        raw = GzipFile(None, gz_mode, compresslevel, filename)
    else:
        raise TypeError("filename must be a str or bytes object, or a file")

    if not text_mode:
        return raw
    return io.TextIOWrapper(raw, encoding, errors, newline)
63	n/a
def write32u(output, value):
    """Write *value* to *output* packed with the struct format "<L".

    *output* only needs a write() method accepting bytes.
    """
    packed = struct.pack("<L", value)
    output.write(packed)
68	n/a
69	n/a	class _PaddedFile:
70	n/a	"""Minimal read-only file object that prepends a string to the contents
71	n/a	of an actual file. Shouldn't be used outside of gzip.py, as it lacks
72	n/a	essential functionality."""
73	n/a
74	n/a	def __init__(self, f, prepend=b''):
75	n/a	self._buffer = prepend
76	n/a	self._length = len(prepend)
77	n/a	self.file = f
78	n/a	self._read = 0
79	n/a
80	n/a	def read(self, size):
81	n/a	if self._read is None:
82	n/a	return self.file.read(size)
83	n/a	if self._read + size <= self._length:
84	n/a	read = self._read
85	n/a	self._read += size
86	n/a	return self._buffer[read:self._read]
87	n/a	else:
88	n/a	read = self._read
89	n/a	self._read = None
90	n/a	return self._buffer[read:] + \
91	n/a	self.file.read(size-self._length+read)
92	n/a
93	n/a	def prepend(self, prepend=b''):
94	n/a	if self._read is None:
95	n/a	self._buffer = prepend
96	n/a	else: # Assume data was read since the last prepend() call
97	n/a	self._read -= len(prepend)
98	n/a	return
99	n/a	self._length = len(self._buffer)
100	n/a	self._read = 0
101	n/a
102	n/a	def seek(self, off):
103	n/a	self._read = None
104	n/a	self._buffer = None
105	n/a	return self.file.seek(off)
106	n/a
107	n/a	def seekable(self):
108	n/a	return True # Allows fast-forwarding even in unseekable streams
109	n/a
class GzipFile(_compression.BaseStream):
    """File-like object that reads or writes gzip-compressed data.

    It simulates most of the methods of a file object, with the exception
    of the truncate() method.

    Only binary modes are supported here.  To work with a compressed file
    in text mode, use the gzip.open() function.
    """

    # Overridden with internal file object to be closed, if only a filename
    # is passed in
    myfileobj = None

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of *fileobj* and *filename* must be given a
        non-trivial value.

        The instance is based on *fileobj*, which can be a regular file, an
        io.BytesIO object, or any other object which simulates a file.  When
        it is None, *filename* is opened to provide the file object.

        When *fileobj* is given, *filename* is only used for the gzip
        header.  If *filename* is None it is taken from fileobj.name when
        that is a str or bytes object, and is otherwise the empty string,
        in which case no name is written to the header.

        *mode* can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x' or 'xb'.
        A missing 'b' is added automatically.  The default is the mode of
        *fileobj* if discernible; otherwise it is 'rb'.  Modes containing
        't' or 'U' raise ValueError.

        *compresslevel* is an integer from 0 to 9 controlling the level of
        compression; 1 is fastest and produces the least compression, and 9
        is slowest and produces the most compression.  0 is no compression
        at all.  The default is 9.

        *mtime* is an optional numeric timestamp written to the last
        modification time field of the stream when compressing.  If omitted
        or None, the current time is used.
        """
        if mode:
            if 't' in mode or 'U' in mode:
                raise ValueError("Invalid mode: {!r}".format(mode))
            if 'b' not in mode:
                mode += 'b'

        if fileobj is None:
            # We opened it, so close() has to close it as well.
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')

        if filename is not None:
            filename = os.fspath(filename)
        else:
            filename = getattr(fileobj, 'name', '')
            if not isinstance(filename, (str, bytes)):
                filename = ''

        if mode is None:
            mode = getattr(fileobj, 'mode', 'rb')

        if mode.startswith('r'):
            self.mode = READ
            reader = _GzipReader(fileobj)
            self._buffer = io.BufferedReader(reader)
            self.name = filename
        elif mode.startswith(('w', 'a', 'x')):
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: the header and trailer are written by this
            # class rather than by zlib.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
            self._write_mtime = mtime
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        self.fileobj = fileobj

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias for the name attribute.

        In write mode ".gz" is appended when the name does not already end
        with it.
        """
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    @property
    def mtime(self):
        """Last modification time read from stream, or None"""
        return self._buffer.raw._last_mtime

    def __repr__(self):
        inner = repr(self.fileobj)
        # Strip the first and last character of the file object's repr
        # before embedding it.
        return '<gzip {} {}>'.format(inner[1:-1], hex(id(self)))

    def _init_write(self, filename):
        """Reset the bookkeeping used while writing."""
        self.name = filename
        self.crc = zlib.crc32(b"")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0
        # Current file offset for seek(), tell(), etc
        self.offset = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file object."""
        out = self.fileobj
        out.write(b'\037\213')             # magic header
        out.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = FNAME if fname else 0
        out.write(chr(flags).encode('latin-1'))
        stamp = self._write_mtime
        if stamp is None:
            stamp = time.time()
        write32u(out, int(stamp))
        out.write(b'\002')
        out.write(b'\377')
        if fname:
            out.write(fname + b'\000')

    def write(self, data):
        """Compress *data*, write it out and return the number of
        uncompressed bytes consumed.

        *data* may be bytes or any object supporting the buffer protocol.
        """
        self._check_not_closed()
        if self.mode != WRITE:
            import errno
            raise OSError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        if isinstance(data, bytes):
            length = len(data)
        else:
            # accept any data that supports the buffer protocol
            data = memoryview(data)
            length = data.nbytes

        if length > 0:
            self.fileobj.write(self.compress.compress(data))
            self.size += length
            self.crc = zlib.crc32(data, self.crc)
            self.offset += length

        return length

    def read(self, size=-1):
        """Read and return up to *size* uncompressed bytes."""
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read() on write-only GzipFile object")
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Implements BufferedIOBase.read1()

        Reads up to a buffer's worth of data if size is negative."""
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read1() on write-only GzipFile object")

        if size < 0:
            size = io.DEFAULT_BUFFER_SIZE
        return self._buffer.read1(size)

    def peek(self, n):
        """Delegate to the internal buffered reader's peek()."""
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "peek() on write-only GzipFile object")
        return self._buffer.peek(n)

    @property
    def closed(self):
        """True once close() has been called."""
        return self.fileobj is None

    def close(self):
        """Finish the stream and release the file object.

        In write mode the remaining compressed data, the CRC and the size
        are written first.  A file that was opened by the constructor is
        closed; a caller-supplied fileobj is left open.
        """
        target = self.fileobj
        if target is None:
            return
        self.fileobj = None
        try:
            if self.mode == WRITE:
                target.write(self.compress.flush())
                write32u(target, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(target, self.size & 0xffffffff)
            elif self.mode == READ:
                self._buffer.close()
        finally:
            owned = self.myfileobj
            if owned:
                self.myfileobj = None
                owned.close()

    def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
        """In write mode, flush the compressor and then the file object."""
        self._check_not_closed()
        if self.mode != WRITE:
            return
        # Ensure the compressor's buffer is flushed
        self.fileobj.write(self.compress.flush(zlib_mode))
        self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise OSError("Can't rewind in write mode")
        self._buffer.seek(0)

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=io.SEEK_SET):
        """Move to a new position in the uncompressed stream.

        In write mode only forward seeks are possible; the gap is filled
        by writing zero bytes, and seeking from the end raises ValueError.
        In read mode the call is passed on to the internal buffered reader.
        """
        if self.mode == WRITE:
            if whence != io.SEEK_SET:
                if whence != io.SEEK_CUR:
                    raise ValueError('Seek from end not supported')
                offset = self.offset + offset
            if offset < self.offset:
                raise OSError('Negative seek in write mode')
            gap = offset - self.offset
            zeros = b'\0' * 1024
            for _ in range(gap // 1024):
                self.write(zeros)
            self.write(b'\0' * (gap % 1024))
        elif self.mode == READ:
            self._check_not_closed()
            return self._buffer.seek(offset, whence)

        return self.offset

    def readline(self, size=-1):
        """Read one line via the internal buffered reader."""
        self._check_not_closed()
        return self._buffer.readline(size)
375	n/a
376	n/a
377	n/a	class _GzipReader(_compression.DecompressReader):
378	n/a	def __init__(self, fp):
379	n/a	super().__init__(_PaddedFile(fp), zlib.decompressobj,
380	n/a	wbits=-zlib.MAX_WBITS)
381	n/a	# Set flag indicating start of a new member
382	n/a	self._new_member = True
383	n/a	self._last_mtime = None
384	n/a
385	n/a	def _init_read(self):
386	n/a	self._crc = zlib.crc32(b"")
387	n/a	self._stream_size = 0 # Decompressed size of unconcatenated stream
388	n/a
389	n/a	def _read_exact(self, n):
390	n/a	'''Read exactly n bytes from `self._fp`
391	n/a
392	n/a	This method is required because self._fp may be unbuffered,
393	n/a	i.e. return short reads.
394	n/a	'''
395	n/a
396	n/a	data = self._fp.read(n)
397	n/a	while len(data) < n:
398	n/a	b = self._fp.read(n - len(data))
399	n/a	if not b:
400	n/a	raise EOFError("Compressed file ended before the "
401	n/a	"end-of-stream marker was reached")
402	n/a	data += b
403	n/a	return data
404	n/a
405	n/a	def _read_gzip_header(self):
406	n/a	magic = self._fp.read(2)
407	n/a	if magic == b'':
408	n/a	return False
409	n/a
410	n/a	if magic != b'\037\213':
411	n/a	raise OSError('Not a gzipped file (%r)' % magic)
412	n/a
413	n/a	(method, flag,
414	n/a	self._last_mtime) = struct.unpack("<BBIxx", self._read_exact(8))
415	n/a	if method != 8:
416	n/a	raise OSError('Unknown compression method')
417	n/a
418	n/a	if flag & FEXTRA:
419	n/a	# Read & discard the extra field, if present
420	n/a	extra_len, = struct.unpack("<H", self._read_exact(2))
421	n/a	self._read_exact(extra_len)
422	n/a	if flag & FNAME:
423	n/a	# Read and discard a null-terminated string containing the filename
424	n/a	while True:
425	n/a	s = self._fp.read(1)
426	n/a	if not s or s==b'\000':
427	n/a	break
428	n/a	if flag & FCOMMENT:
429	n/a	# Read and discard a null-terminated string containing a comment
430	n/a	while True:
431	n/a	s = self._fp.read(1)
432	n/a	if not s or s==b'\000':
433	n/a	break
434	n/a	if flag & FHCRC:
435	n/a	self._read_exact(2) # Read & discard the 16-bit header CRC
436	n/a	return True
437	n/a
438	n/a	def read(self, size=-1):
439	n/a	if size < 0:
440	n/a	return self.readall()
441	n/a	# size=0 is special because decompress(max_length=0) is not supported
442	n/a	if not size:
443	n/a	return b""
444	n/a
445	n/a	# For certain input data, a single
446	n/a	# call to decompress() may not return
447	n/a	# any data. In this case, retry until we get some data or reach EOF.
448	n/a	while True:
449	n/a	if self._decompressor.eof:
450	n/a	# Ending case: we've come to the end of a member in the file,
451	n/a	# so finish up this member, and read a new gzip header.
452	n/a	# Check the CRC and file size, and set the flag so we read
453	n/a	# a new member
454	n/a	self._read_eof()
455	n/a	self._new_member = True
456	n/a	self._decompressor = self._decomp_factory(
457	n/a	**self._decomp_args)
458	n/a
459	n/a	if self._new_member:
460	n/a	# If the _new_member flag is set, we have to
461	n/a	# jump to the next member, if there is one.
462	n/a	self._init_read()
463	n/a	if not self._read_gzip_header():
464	n/a	self._size = self._pos
465	n/a	return b""
466	n/a	self._new_member = False
467	n/a
468	n/a	# Read a chunk of data from the file
469	n/a	buf = self._fp.read(io.DEFAULT_BUFFER_SIZE)
470	n/a
471	n/a	uncompress = self._decompressor.decompress(buf, size)
472	n/a	if self._decompressor.unconsumed_tail != b"":
473	n/a	self._fp.prepend(self._decompressor.unconsumed_tail)
474	n/a	elif self._decompressor.unused_data != b"":
475	n/a	# Prepend the already read bytes to the fileobj so they can
476	n/a	# be seen by _read_eof() and _read_gzip_header()
477	n/a	self._fp.prepend(self._decompressor.unused_data)
478	n/a
479	n/a	if uncompress != b"":
480	n/a	break
481	n/a	if buf == b"":
482	n/a	raise EOFError("Compressed file ended before the "
483	n/a	"end-of-stream marker was reached")
484	n/a
485	n/a	self._add_read_data( uncompress )
486	n/a	self._pos += len(uncompress)
487	n/a	return uncompress
488	n/a
489	n/a	def _add_read_data(self, data):
490	n/a	self._crc = zlib.crc32(data, self._crc)
491	n/a	self._stream_size = self._stream_size + len(data)
492	n/a
493	n/a	def _read_eof(self):
494	n/a	# We've read to the end of the file
495	n/a	# We check the that the computed CRC and size of the
496	n/a	# uncompressed data matches the stored values. Note that the size
497	n/a	# stored is the true file size mod 2**32.
498	n/a	crc32, isize = struct.unpack("<II", self._read_exact(8))
499	n/a	if crc32 != self._crc:
500	n/a	raise OSError("CRC check failed %s != %s" % (hex(crc32),
501	n/a	hex(self._crc)))
502	n/a	elif isize != (self._stream_size & 0xffffffff):
503	n/a	raise OSError("Incorrect length of data produced")
504	n/a
505	n/a	# Gzip files can be padded with zeroes and still have archives.
506	n/a	# Consume all zero bytes and set the file position to the first
507	n/a	# non-zero byte. See http://www.gzip.org/#faq8
508	n/a	c = b"\x00"
509	n/a	while c == b"\x00":
510	n/a	c = self._fp.read(1)
511	n/a	if c:
512	n/a	self._fp.prepend(c)
513	n/a
514	n/a	def _rewind(self):
515	n/a	super()._rewind()
516	n/a	self._new_member = True
517	n/a
def compress(data, compresslevel=9):
    """Compress *data* in one shot and return the gzip-compressed bytes.

    The optional *compresslevel* is the compression level, in the range 0-9.
    """
    sink = io.BytesIO()
    gz = GzipFile(fileobj=sink, mode='wb', compresslevel=compresslevel)
    with gz:
        gz.write(data)
    # The trailer is only written when the GzipFile is closed, so the
    # buffer is read after the with-block has exited.
    return sink.getvalue()
526	n/a
def decompress(data):
    """Decompress gzip-compressed *data* in one shot.

    Return the decompressed bytes.
    """
    source = io.BytesIO(data)
    with GzipFile(fileobj=source) as gz:
        plain = gz.read()
    return plain
533	n/a
534	n/a
def _test():
    """Minimal command-line driver: act like gzip; with -d, act like gunzip.

    Each argument names a file to process; "-" (also the default when no
    file is given) means standard input to standard output.  The input file
    is not deleted, nor are any other gzip options or features supported.
    When decompressing, a file whose name does not end in ".gz" is reported
    and skipped.
    """
    # Local import: only this driver needs it.
    from contextlib import ExitStack

    args = sys.argv[1:]
    # Named so that it does not shadow the module-level decompress().
    decompressing = bool(args) and args[0] == "-d"
    if decompressing:
        args = args[1:]
    if not args:
        args = ["-"]

    for arg in args:
        # Everything registered on the stack is closed when the block is
        # left, even if opening the second file or the copy loop raises.
        # Exit order is last-in first-out, so the output is closed before
        # the input.  sys.stdin.buffer and sys.stdout.buffer are never
        # registered and therefore stay open.
        with ExitStack() as stack:
            if decompressing:
                if arg == "-":
                    f = stack.enter_context(
                        GzipFile(filename="", mode="rb",
                                 fileobj=sys.stdin.buffer))
                    g = sys.stdout.buffer
                else:
                    if arg[-3:] != ".gz":
                        print("filename doesn't end in .gz:", repr(arg))
                        continue
                    f = stack.enter_context(open(arg, "rb"))
                    g = stack.enter_context(builtins.open(arg[:-3], "wb"))
            else:
                if arg == "-":
                    f = sys.stdin.buffer
                    g = stack.enter_context(
                        GzipFile(filename="", mode="wb",
                                 fileobj=sys.stdout.buffer))
                else:
                    f = stack.enter_context(builtins.open(arg, "rb"))
                    g = stack.enter_context(open(arg + ".gz", "wb"))

            while True:
                chunk = f.read(1024)
                if not chunk:
                    break
                g.write(chunk)
572	n/a
# Run the command-line driver when the module is executed as a script.
if __name__ == "__main__":
    _test()