Skip to content

Commit 4b44b34

Browse files
authored
gh-134938: Add set_pledged_input_size() to ZstdCompressor (GH-135010)
1 parent 3d396ab commit 4b44b34

File tree

6 files changed

+285
-2
lines changed

6 files changed

+285
-2
lines changed

Doc/library/compression.zstd.rst

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,27 @@ Compressing and decompressing data in memory
247247
The *mode* argument is a :class:`ZstdCompressor` attribute, either
248248
:attr:`~.FLUSH_BLOCK`, or :attr:`~.FLUSH_FRAME`.
249249

250+
.. method:: set_pledged_input_size(size)
251+
252+
Specify the amount of uncompressed data *size* that will be provided for
253+
the next frame. *size* will be written into the frame header of the next
254+
frame unless :attr:`CompressionParameter.content_size_flag` is ``False``
255+
or ``0``. A size of ``0`` means that the frame is empty. If *size* is
256+
``None``, the frame header will omit the frame size. Frames that include
257+
the uncompressed data size require less memory to decompress, especially
258+
at higher compression levels.
259+
260+
If :attr:`last_mode` is not :attr:`FLUSH_FRAME`, a
261+
:exc:`ValueError` is raised as the compressor is not at the start of
262+
a frame. If the pledged size does not match the actual size of data
263+
provided to :meth:`.compress`, future calls to :meth:`!compress` or
264+
:meth:`flush` may raise :exc:`ZstdError` and the last chunk of data may
265+
be lost.
266+
267+
After :meth:`flush` or :meth:`.compress` are called with mode
268+
:attr:`FLUSH_FRAME`, the next frame will not include the frame size in
269+
the header unless :meth:`!set_pledged_input_size` is called again.
270+
250271
.. attribute:: CONTINUE
251272

252273
Collect more data for compression, which may or may not generate output
@@ -266,6 +287,13 @@ Compressing and decompressing data in memory
266287
:meth:`~.compress` will be written into a new frame and
267288
*cannot* reference past data.
268289

290+
.. attribute:: last_mode
291+
292+
The last mode passed to either :meth:`~.compress` or :meth:`~.flush`.
293+
The value can be one of :attr:`~.CONTINUE`, :attr:`~.FLUSH_BLOCK`, or
294+
:attr:`~.FLUSH_FRAME`. The initial value is :attr:`~.FLUSH_FRAME`,
295+
signifying that the compressor is at the start of a new frame.
296+
269297

270298
.. class:: ZstdDecompressor(zstd_dict=None, options=None)
271299

@@ -620,12 +648,17 @@ Advanced parameter control
620648
Write the size of the data to be compressed into the Zstandard frame
621649
header when known prior to compressing.
622650

623-
This flag only takes effect under the following two scenarios:
651+
This flag only takes effect under the following scenarios:
624652

625653
* Calling :func:`compress` for one-shot compression
626654
* Providing all of the data to be compressed in the frame in a single
627655
:meth:`ZstdCompressor.compress` call, with the
628656
:attr:`ZstdCompressor.FLUSH_FRAME` mode.
657+
* Calling :meth:`ZstdCompressor.set_pledged_input_size` with the exact
658+
amount of data that will be provided to the compressor prior to any
659+
calls to :meth:`ZstdCompressor.compress` for the current frame.
660+
:meth:`!ZstdCompressor.set_pledged_input_size` must be called for each
661+
new frame.
629662

630663
All other compression calls may not write the size information into the
631664
frame header.

Lib/test/test_zstd.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,115 @@ def test_compress_empty(self):
395395
c = ZstdCompressor()
396396
self.assertNotEqual(c.compress(b'', c.FLUSH_FRAME), b'')
397397

398+
def test_set_pledged_input_size(self):
    # Covers ZstdCompressor.set_pledged_input_size(): argument validation,
    # the "start of frame" precondition, the None / 0 special cases, size
    # mismatches, and the interaction with content_size_flag.
    payload = DECOMPRESSED_100_PLUS_32KB
    step = len(payload) // 3
    bad_size_msg = r'should be a positive int less than \d+'

    # Rejected sizes, in order: a negative value, a value that does not
    # fit in 64 bits, 2**64-2 (ZSTD_CONTENTSIZE_ERROR) and 2**64-1
    # (ZSTD_CONTENTSIZE_UNKNOWN, which callers must spell as None).
    comp = ZstdCompressor()
    for bad_size in (-300, 2**64, 2**64-2, 2**64-1):
        with self.assertRaisesRegex(ValueError, bad_size_msg):
            comp.set_pledged_input_size(bad_size)

    # Large but valid sizes are accepted.
    for good_size in (2**63, 2**64-3):
        comp.set_pledged_input_size(good_size)

    # A pledged size of zero describes an empty frame.
    comp = ZstdCompressor(level=1)
    comp.set_pledged_input_size(0)
    comp.compress(b'')
    empty_frame = comp.flush()
    info = get_frame_info(empty_frame)
    self.assertEqual(info.decompressed_size, 0)

    # Pledging in the middle of a frame is refused.
    comp = ZstdCompressor(level=1)
    comp.compress(b'123456')
    self.assertEqual(comp.last_mode, comp.CONTINUE)
    with self.assertRaisesRegex(ValueError,
                                r'last_mode == FLUSH_FRAME'):
        comp.set_pledged_input_size(300)

    # None leaves the size out of the frame header.
    comp = ZstdCompressor(level=1)
    comp.set_pledged_input_size(None)
    frame = comp.compress(payload) + comp.flush()
    info = get_frame_info(frame)
    self.assertEqual(info.decompressed_size, None)

    # Matching size, data supplied in several chunks.
    comp = ZstdCompressor(level=1)
    comp.set_pledged_input_size(len(payload))
    pieces = []
    for offset in range(0, len(payload), step):
        pieces.append(comp.compress(payload[offset:offset+step]))
    pieces.append(comp.flush())
    frame = b''.join(pieces)

    info = get_frame_info(frame)
    self.assertEqual(info.decompressed_size, len(payload))
    self.assertEqual(decompress(frame), payload)

    # The same compressor can pledge again for a second frame.
    comp.set_pledged_input_size(len(payload))
    frame = comp.compress(payload) + comp.flush()

    info = get_frame_info(frame)
    self.assertEqual(info.decompressed_size, len(payload))
    self.assertEqual(decompress(frame), payload)

    # Fewer bytes than pledged: closing the frame fails.
    comp = ZstdCompressor(level=1)
    comp.set_pledged_input_size(len(payload)+1)
    for offset in range(0, len(payload), step):
        comp.compress(payload[offset:offset+step])
    with self.assertRaises(ZstdError):
        comp.flush()

    # More bytes than pledged: the surplus write fails.
    comp = ZstdCompressor(level=1)
    comp.set_pledged_input_size(len(payload))
    for offset in range(0, len(payload), step):
        comp.compress(payload[offset:offset+step])
    with self.assertRaises(ZstdError):
        comp.compress(b'extra', ZstdCompressor.FLUSH_FRAME)

    # With content_size_flag == 0 the pledged size is not written.
    comp = ZstdCompressor(options={CompressionParameter.content_size_flag: 0})
    comp.set_pledged_input_size(10)
    part1 = comp.compress(b"hello")
    part2 = comp.compress(b"world")
    part3 = comp.flush()
    info = get_frame_info(part1 + part2 + part3)
    self.assertIsNone(info.decompressed_size)
505+
506+
398507
class DecompressorTestCase(unittest.TestCase):
399508

400509
def test_simple_decompress_bad_args(self):

Modules/_zstd/_zstdmodule.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,9 @@ set_zstd_error(const _zstd_state *state, error_type type, size_t zstd_ret)
7272
case ERR_COMPRESS:
7373
msg = "Unable to compress Zstandard data: %s";
7474
break;
75+
case ERR_SET_PLEDGED_INPUT_SIZE:
76+
msg = "Unable to set pledged uncompressed content size: %s";
77+
break;
7578

7679
case ERR_LOAD_D_DICT:
7780
msg = "Unable to load Zstandard dictionary or prefix for "

Modules/_zstd/_zstdmodule.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ typedef struct {
2727
typedef enum {
2828
ERR_DECOMPRESS,
2929
ERR_COMPRESS,
30+
ERR_SET_PLEDGED_INPUT_SIZE,
3031

3132
ERR_LOAD_D_DICT,
3233
ERR_LOAD_C_DICT,

Modules/_zstd/clinic/compressor.c.h

Lines changed: 40 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Modules/_zstd/compressor.c

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,52 @@ typedef struct {
4545

4646
#define ZstdCompressor_CAST(op) ((ZstdCompressor *)op)
4747

48+
/*[python input]
49+
50+
class zstd_contentsize_converter(CConverter):
51+
type = 'unsigned long long'
52+
converter = 'zstd_contentsize_converter'
53+
54+
[python start generated code]*/
55+
/*[python end generated code: output=da39a3ee5e6b4b0d input=0932c350d633c7de]*/
56+
57+
58+
static int
59+
zstd_contentsize_converter(PyObject *size, unsigned long long *p)
60+
{
61+
// None means the user indicates the size is unknown.
62+
if (size == Py_None) {
63+
*p = ZSTD_CONTENTSIZE_UNKNOWN;
64+
}
65+
else {
66+
/* ZSTD_CONTENTSIZE_UNKNOWN is 0ULL - 1
67+
ZSTD_CONTENTSIZE_ERROR is 0ULL - 2
68+
Users should only pass values < ZSTD_CONTENTSIZE_ERROR */
69+
unsigned long long pledged_size = PyLong_AsUnsignedLongLong(size);
70+
/* Here we check for (unsigned long long)-1 as a sign of an error in
71+
PyLong_AsUnsignedLongLong */
72+
if (pledged_size == (unsigned long long)-1 && PyErr_Occurred()) {
73+
*p = ZSTD_CONTENTSIZE_ERROR;
74+
if (PyErr_ExceptionMatches(PyExc_OverflowError)) {
75+
PyErr_Format(PyExc_ValueError,
76+
"size argument should be a positive int less "
77+
"than %ull", ZSTD_CONTENTSIZE_ERROR);
78+
return 0;
79+
}
80+
return 0;
81+
}
82+
if (pledged_size >= ZSTD_CONTENTSIZE_ERROR) {
83+
*p = ZSTD_CONTENTSIZE_ERROR;
84+
PyErr_Format(PyExc_ValueError,
85+
"size argument should be a positive int less "
86+
"than %ull", ZSTD_CONTENTSIZE_ERROR);
87+
return 0;
88+
}
89+
*p = pledged_size;
90+
}
91+
return 1;
92+
}
93+
4894
#include "clinic/compressor.c.h"
4995

5096
static int
@@ -643,9 +689,61 @@ _zstd_ZstdCompressor_flush_impl(ZstdCompressor *self, int mode)
643689
return ret;
644690
}
645691

692+
693+
/*[clinic input]
694+
_zstd.ZstdCompressor.set_pledged_input_size
695+
696+
size: zstd_contentsize
697+
The size of the uncompressed data to be provided to the compressor.
698+
/
699+
700+
Set the uncompressed content size to be written into the frame header.
701+
702+
This method can be used to ensure the header of the frame about to be written
703+
includes the size of the data, unless the CompressionParameter.content_size_flag
704+
is set to False. If last_mode != FLUSH_FRAME, then a ValueError is raised.
705+
706+
It is important to ensure that the pledged data size matches the actual data
707+
size. If they do not match the compressed output data may be corrupted and the
708+
final chunk written may be lost.
709+
[clinic start generated code]*/
710+
711+
static PyObject *
712+
_zstd_ZstdCompressor_set_pledged_input_size_impl(ZstdCompressor *self,
713+
unsigned long long size)
714+
/*[clinic end generated code: output=3a09e55cc0e3b4f9 input=afd8a7d78cff2eb5]*/
715+
{
716+
// Error occured while converting argument, should be unreachable
717+
assert(size != ZSTD_CONTENTSIZE_ERROR);
718+
719+
/* Thread-safe code */
720+
PyMutex_Lock(&self->lock);
721+
722+
/* Check the current mode */
723+
if (self->last_mode != ZSTD_e_end) {
724+
PyErr_SetString(PyExc_ValueError,
725+
"set_pledged_input_size() method must be called "
726+
"when last_mode == FLUSH_FRAME");
727+
PyMutex_Unlock(&self->lock);
728+
return NULL;
729+
}
730+
731+
/* Set pledged content size */
732+
size_t zstd_ret = ZSTD_CCtx_setPledgedSrcSize(self->cctx, size);
733+
PyMutex_Unlock(&self->lock);
734+
if (ZSTD_isError(zstd_ret)) {
735+
_zstd_state* mod_state = PyType_GetModuleState(Py_TYPE(self));
736+
set_zstd_error(mod_state, ERR_SET_PLEDGED_INPUT_SIZE, zstd_ret);
737+
return NULL;
738+
}
739+
740+
Py_RETURN_NONE;
741+
}
742+
646743
/* Method table for ZstdCompressor: compress, flush and
   set_pledged_input_size, ending with a {NULL, NULL} entry.
   NOTE(review): the *_METHODDEF macros presumably come from the generated
   clinic/compressor.c.h included above -- confirm. */
static PyMethodDef ZstdCompressor_methods[] = {
647744
_ZSTD_ZSTDCOMPRESSOR_COMPRESS_METHODDEF
648745
_ZSTD_ZSTDCOMPRESSOR_FLUSH_METHODDEF
746+
_ZSTD_ZSTDCOMPRESSOR_SET_PLEDGED_INPUT_SIZE_METHODDEF
649747
{NULL, NULL}
650748
};
651749

0 commit comments

Comments
 (0)