python · serhiy-storchaka · Jun 5, 2025 · Jun 1, 2025 · Jun 1, 2025 · Jun 1, 2025
diff --git a/Doc/library/compression.zstd.rst b/Doc/library/compression.zstd.rst
@@ -247,6 +247,25 @@ Compressing and decompressing data in memory
       The *mode* argument is a :class:`ZstdCompressor` attribute, either
       :attr:`~.FLUSH_BLOCK`, or :attr:`~.FLUSH_FRAME`.
 
+   .. method:: set_pledged_input_size(size)
+
+      Specify the amount of uncompressed data *size* that will be provided for
+      the next frame. *size* will be written into the frame header of the next
+      frame unless :attr:`CompressionParameter.content_size_flag` is ``False``
+      or ``0``. A size of ``0`` means that the frame is empty. If *size* is
+      ``None``, the frame header will omit the frame size.
+
+      If :attr:`last_mode` is not :attr:`FLUSH_FRAME`, a
+      :exc:`RuntimeError` is raised as the compressor is not at the start of
+      a frame. If the pledged size does not match the actual size of data
+      provided to :meth:`~.compress`, future calls to :meth:`~.compress` or
+      :meth:`flush` may raise :exc:`ZstdError` and the last chunk of data may
+      be lost.
+
+      After :meth:`flush` or :meth:`~.compress` are called with mode
+      :attr:`FLUSH_FRAME`, the next frame will not include the frame size into
+      the header unless :meth:`!set_pledged_input_size` is called again.
+
    .. attribute:: CONTINUE
 
       Collect more data for compression, which may or may not generate output
@@ -266,6 +285,13 @@ Compressing and decompressing data in memory
       :meth:`~.compress` will be written into a new frame and
       *cannot* reference past data.
 
+   .. attribute:: last_mode
+
+      The last mode passed to either :meth:`~.compress` or :meth:`~.flush`.
+      The value can be one of :attr:`~.CONTINUE`, :attr:`~.FLUSH_BLOCK`, or
+      :attr:`~.FLUSH_FRAME`. The initial value is :attr:`~.FLUSH_FRAME`,
+      signifying that the compressor is at the start of a new frame.
+
 
 .. class:: ZstdDecompressor(zstd_dict=None, options=None)
 
@@ -620,12 +646,17 @@ Advanced parameter control
       Write the size of the data to be compressed into the Zstandard frame
       header when known prior to compressing.
 
-      This flag only takes effect under the following two scenarios:
+      This flag only takes effect under the following three scenarios:
 
       * Calling :func:`compress` for one-shot compression
       * Providing all of the data to be compressed in the frame in a single
         :meth:`ZstdCompressor.compress` call, with the
         :attr:`ZstdCompressor.FLUSH_FRAME` mode.
+      * Calling :meth:`ZstdCompressor.set_pledged_input_size` with the exact
+        amount of data that will be provided to the compressor prior to any
+        calls to :meth:`ZstdCompressor.compress` for the current frame.
+        :meth:`!ZstdCompressor.set_pledged_input_size` must be called for each
+        new frame.
 
       All other compression calls may not write the size information into the
       frame header.

diff --git a/Lib/test/test_zstd.py b/Lib/test/test_zstd.py
@@ -395,6 +395,115 @@ def test_compress_empty(self):
         c = ZstdCompressor()
         self.assertNotEqual(c.compress(b'', c.FLUSH_FRAME), b'')
 
+    def test_set_pledged_input_size(self):
+        DAT = DECOMPRESSED_100_PLUS_32KB
+        CHUNK_SIZE = len(DAT) // 3
+
+        # wrong value
+        c = ZstdCompressor()
+        with self.assertRaisesRegex(ValueError,
+                                    r'should be a positive int less than \d+'):
+            c.set_pledged_input_size(-300)
+        # overflow
+        with self.assertRaisesRegex(ValueError,
+                                    r'should be a positive int less than \d+'):
+            c.set_pledged_input_size(2**64)
+        # ZSTD_CONTENTSIZE_ERROR is invalid
+        with self.assertRaisesRegex(ValueError,
+                                    r'should be a positive int less than \d+'):
+            c.set_pledged_input_size(2**64-2)
+        # ZSTD_CONTENTSIZE_UNKNOWN should use None
+        with self.assertRaisesRegex(ValueError,
+                                    r'should be a positive int less than \d+'):
+            c.set_pledged_input_size(2**64-1)
+
+        # check valid values are settable
+        c.set_pledged_input_size(2**63)
+        c.set_pledged_input_size(2**64-3)
+
+        # check that zero means empty frame
+        c = ZstdCompressor(level=1)
+        c.set_pledged_input_size(0)
+        c.compress(b'')
+        dat = c.flush()
+        ret = get_frame_info(dat)
+        self.assertEqual(ret.decompressed_size, 0)
+
+
+        # wrong mode
+        c = ZstdCompressor(level=1)
+        c.compress(b'123456')
+        self.assertEqual(c.last_mode, c.CONTINUE)
+        with self.assertRaisesRegex(RuntimeError,
+                                    r'last_mode == FLUSH_FRAME'):
+            c.set_pledged_input_size(300)
+
+        # None value
+        c = ZstdCompressor(level=1)
+        c.set_pledged_input_size(None)
+        dat = c.compress(DAT) + c.flush()
+
+        ret = get_frame_info(dat)
+        self.assertEqual(ret.decompressed_size, None)
+
+        # correct value
+        c = ZstdCompressor(level=1)
+        c.set_pledged_input_size(len(DAT))
+
+        chunks = []
+        posi = 0
+        while posi < len(DAT):
+            dat = c.compress(DAT[posi:posi+CHUNK_SIZE])
+            posi += CHUNK_SIZE
+            chunks.append(dat)
+
+        dat = c.flush()
+        chunks.append(dat)
+        chunks = b''.join(chunks)
+
+        ret = get_frame_info(chunks)
+        self.assertEqual(ret.decompressed_size, len(DAT))
+        self.assertEqual(decompress(chunks), DAT)
+
+        c.set_pledged_input_size(len(DAT)) # the second frame
+        dat = c.compress(DAT) + c.flush()
+
+        ret = get_frame_info(dat)
+        self.assertEqual(ret.decompressed_size, len(DAT))
+        self.assertEqual(decompress(dat), DAT)
+
+        # not enough data
+        c = ZstdCompressor(level=1)
+        c.set_pledged_input_size(len(DAT)+1)
+
+        for start in range(0, len(DAT), CHUNK_SIZE):
+            end = min(start+CHUNK_SIZE, len(DAT))
+            _dat = c.compress(DAT[start:end])
+
+        with self.assertRaises(ZstdError):
+            c.flush()
+
+        # too much data
+        c = ZstdCompressor(level=1)
+        c.set_pledged_input_size(len(DAT))
+
+        for start in range(0, len(DAT), CHUNK_SIZE):
+            end = min(start+CHUNK_SIZE, len(DAT))
+            _dat = c.compress(DAT[start:end])
+
+        with self.assertRaises(ZstdError):
+            c.compress(b'extra', ZstdCompressor.FLUSH_FRAME)
+
+        # content size not set if content_size_flag == 0
+        c = ZstdCompressor(options={CompressionParameter.content_size_flag: 0})
+        c.set_pledged_input_size(10)
+        dat1 = c.compress(b"hello")
+        dat2 = c.compress(b"world")
+        dat3 = c.flush()
+        frame_data = get_frame_info(dat1 + dat2 + dat3)
+        self.assertIsNone(frame_data.decompressed_size)
+
+
 class DecompressorTestCase(unittest.TestCase):
 
     def test_simple_decompress_bad_args(self):

diff --git a/Modules/_zstd/_zstdmodule.c b/Modules/_zstd/_zstdmodule.c
@@ -35,6 +35,9 @@ set_zstd_error(const _zstd_state* const state,
         case ERR_COMPRESS:
             msg = "Unable to compress Zstandard data: %s";
             break;
+        case ERR_SET_PLEDGED_INPUT_SIZE:
+            msg = "Unable to set pledged uncompressed content size: %s";
+            break;
 
         case ERR_LOAD_D_DICT:
             msg = "Unable to load Zstandard dictionary or prefix for "

diff --git a/Modules/_zstd/_zstdmodule.h b/Modules/_zstd/_zstdmodule.h
@@ -25,6 +25,7 @@ typedef struct {
 typedef enum {
     ERR_DECOMPRESS,
     ERR_COMPRESS,
+    ERR_SET_PLEDGED_INPUT_SIZE,
 
     ERR_LOAD_D_DICT,
     ERR_LOAD_C_DICT,

diff --git a/Modules/_zstd/clinic/compressor.c.h b/Modules/_zstd/clinic/compressor.c.h
diff --git a/Modules/_zstd/compressor.c b/Modules/_zstd/compressor.c
@@ -46,6 +46,54 @@ typedef struct {
 
 #define ZstdCompressor_CAST(op) ((ZstdCompressor *)op)
 
+/*[python input]
+
+class zstd_contentsize_converter(CConverter):
+    type = 'unsigned long long'
+    converter = 'zstd_contentsize_converter'
+
+[python start generated code]*/
+/*[python end generated code: output=da39a3ee5e6b4b0d input=0932c350d633c7de]*/
+
+static inline int
+raise_wrong_contentsize(void)
+{
+    PyErr_Format(PyExc_ValueError,
+                 "size argument should be a positive int less "
+                 "than %ull", ZSTD_CONTENTSIZE_ERROR);
+    return 0;
+}
+
+static int
+zstd_contentsize_converter(PyObject *size, unsigned long long *p)
+{
+    // None means the user indicates the size is unknown.
+    if (size == Py_None) {
+        *p = ZSTD_CONTENTSIZE_UNKNOWN;
+    }
+    else {
+        /* ZSTD_CONTENTSIZE_UNKNOWN is 0ULL - 1
+           ZSTD_CONTENTSIZE_ERROR   is 0ULL - 2
+           Users should only pass values < ZSTD_CONTENTSIZE_ERROR */
+        unsigned long long pledged_size = PyLong_AsUnsignedLongLong(size);
+        /* Here we check for (unsigned long long)-1 as a sign of an error in
+           PyLong_AsUnsignedLongLong */
+        if (pledged_size == (unsigned long long)-1 && PyErr_Occurred()) {
+            *p = ZSTD_CONTENTSIZE_ERROR;
+            if (PyErr_ExceptionMatches(PyExc_OverflowError)) {
+                return raise_wrong_contentsize();
+            }
+            return 0;
+        }
+        if (pledged_size >= ZSTD_CONTENTSIZE_ERROR) {
+            *p = ZSTD_CONTENTSIZE_ERROR;
+            return raise_wrong_contentsize();
+        }
+        *p = pledged_size;
+    }
+    return 1;
+}
+
 #include "clinic/compressor.c.h"
 
 static int
@@ -690,9 +738,63 @@ _zstd_ZstdCompressor_flush_impl(ZstdCompressor *self, int mode)
     return ret;
 }
 
+
+/*[clinic input]
+_zstd.ZstdCompressor.set_pledged_input_size
+
+    size: zstd_contentsize
+        The size of the uncompressed data to be provided to the compressor.
+    /
+
+Set the uncompressed content size to be written into the frame header.
+
+This method can be used to ensure the header of the frame about to be written
+includes the size of the data, unless CompressionParameter.content_size_flag is
+set to False. If last_mode != FLUSH_FRAME, then a RuntimeError is raised.
+
+It is important to ensure that the pledged data size matches the actual data
+size. If they do not match the compressed output data may be corrupted and the
+final chunk written may be lost.
+[clinic start generated code]*/
+
+static PyObject *
+_zstd_ZstdCompressor_set_pledged_input_size_impl(ZstdCompressor *self,
+                                                 unsigned long long size)
+/*[clinic end generated code: output=3a09e55cc0e3b4f9 input=563b9a1ddd4facc3]*/
+{
+    size_t zstd_ret;
+
+    // Error occured while converting argument, should be unreachable
+    assert(size != ZSTD_CONTENTSIZE_ERROR);
+
+    /* Thread-safe code */
+    PyMutex_Lock(&self->lock);
+
+    /* Check the current mode */
+    if (self->last_mode != ZSTD_e_end) {
+        PyErr_SetString(PyExc_RuntimeError,
+                        "set_pledged_input_size() method must be called "
+                        "when last_mode == FLUSH_FRAME");
+        PyMutex_Unlock(&self->lock);
+        return NULL;
+    }
+
+    /* Set pledged content size */
+    zstd_ret = ZSTD_CCtx_setPledgedSrcSize(self->cctx, size);
+    PyMutex_Unlock(&self->lock);
+    if (ZSTD_isError(zstd_ret)) {
+        _zstd_state* mod_state = PyType_GetModuleState(Py_TYPE(self));
+        set_zstd_error(mod_state, ERR_SET_PLEDGED_INPUT_SIZE, zstd_ret);
+        return NULL;
+    }
+
+    Py_RETURN_NONE;
+}
+
 static PyMethodDef ZstdCompressor_methods[] = {
     _ZSTD_ZSTDCOMPRESSOR_COMPRESS_METHODDEF
     _ZSTD_ZSTDCOMPRESSOR_FLUSH_METHODDEF
+    _ZSTD_ZSTDCOMPRESSOR_SET_PLEDGED_INPUT_SIZE_METHODDEF
     {NULL, NULL}
 };