From 6969658543a73ec7f81a37d5ae9f27952d9c531b Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Tue, 21 May 2024 20:40:25 +0300
Subject: [PATCH 1/3] gh-119342: Fix OOM vulnerability in plistlib

Reading a specially prepared small Plist file could cause OOM because file's
read(n) preallocates a bytes object for reading the specified amount of
data. Now plistlib reads large data by chunks, therefore the upper limit of
consumed memory is proportional to the size of the input file.
---
 Lib/plistlib.py                               | 35 ++++++++++++------
 Lib/test/test_plistlib.py                     | 37 +++++++++++++++++--
 ...-05-21-22-11-31.gh-issue-119342.BTFj4Z.rst |  2 +
 3 files changed, 60 insertions(+), 14 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Security/2024-05-21-22-11-31.gh-issue-119342.BTFj4Z.rst

diff --git a/Lib/plistlib.py b/Lib/plistlib.py
index 67e832db217319..0365dc71090334 100644
--- a/Lib/plistlib.py
+++ b/Lib/plistlib.py
@@ -508,12 +508,31 @@ def _get_size(self, tokenL):
 
         return tokenL
 
+    def _read(self, size):
+        if 0:
+            data = self._fp.read(size)
+            if len(data) != size:
+                raise InvalidFileException
+            return data
+        cursize = min(size, 1 << 20)
+        data = self._fp.read(cursize)
+        while True:
+            if len(data) != cursize:
+                raise InvalidFileException
+            if cursize == size:
+                return data
+            delta = min(cursize, size - cursize)
+            data += self._fp.read(delta)
+            cursize += delta
+
     def _read_ints(self, n, size):
-        data = self._fp.read(size * n)
+        data = self._read(size * n)
+        #print('XXX', n, size)
+        #assert n < 100
         if size in _BINARY_FORMAT:
             return struct.unpack(f'>{n}{_BINARY_FORMAT[size]}', data)
         else:
-            if not size or len(data) != size * n:
+            if not size:
                 raise InvalidFileException()
             return tuple(int.from_bytes(data[i: i + size], 'big')
                          for i in range(0, size * n, size))
@@ -573,22 +592,16 @@ def _read_object(self, ref):
 
         elif tokenH == 0x40:  # data
             s = self._get_size(tokenL)
-            result = self._fp.read(s)
-            if len(result) != s:
-                raise InvalidFileException()
+            result = self._read(s)
 
         elif tokenH == 0x50:  # ascii string
             s = self._get_size(tokenL)
-            data = self._fp.read(s)
-            if len(data) != s:
-                raise InvalidFileException()
+            data = self._read(s)
             result = data.decode('ascii')
 
         elif tokenH == 0x60:  # unicode string
             s = self._get_size(tokenL) * 2
-            data = self._fp.read(s)
-            if len(data) != s:
-                raise InvalidFileException()
+            data = self._read(s)
             result = data.decode('utf-16be')
 
         elif tokenH == 0x80:  # UID
diff --git a/Lib/test/test_plistlib.py b/Lib/test/test_plistlib.py
index 001f86f2893f2f..3813227bd93964 100644
--- a/Lib/test/test_plistlib.py
+++ b/Lib/test/test_plistlib.py
@@ -904,8 +904,7 @@ def test_dump_naive_datetime_with_aware_datetime_option(self):
 
 class TestBinaryPlistlib(unittest.TestCase):
 
-    @staticmethod
-    def decode(*objects, offset_size=1, ref_size=1):
+    def build(self, *objects, offset_size=1, ref_size=1):
         data = [b'bplist00']
         offset = 8
         offsets = []
@@ -917,7 +916,11 @@ def decode(*objects, offset_size=1, ref_size=1):
                            len(objects), 0, offset)
         data.extend(offsets)
         data.append(tail)
-        return plistlib.loads(b''.join(data), fmt=plistlib.FMT_BINARY)
+        return b''.join(data)
+
+    def decode(self, *objects, offset_size=1, ref_size=1):
+        data = self.build(*objects, offset_size=offset_size, ref_size=ref_size)
+        return plistlib.loads(data, fmt=plistlib.FMT_BINARY)
 
     def test_nonstandard_refs_size(self):
         # Issue #21538: Refs and offsets are 24-bit integers
@@ -1025,6 +1028,34 @@ def test_invalid_binary(self):
                 with self.assertRaises(plistlib.InvalidFileException):
                     plistlib.loads(b'bplist00' + data, fmt=plistlib.FMT_BINARY)
 
+    def test_truncated_large_data(self):
+        self.addCleanup(os_helper.unlink, os_helper.TESTFN)
+        def check(data):
+            with open(os_helper.TESTFN, 'wb') as f:
+                f.write(data)
+            # buffered file
+            with open(os_helper.TESTFN, 'rb') as f:
+                with self.assertRaises(plistlib.InvalidFileException):
+                    plistlib.load(f, fmt=plistlib.FMT_BINARY)
+            # unbuffered file
+            with open(os_helper.TESTFN, 'rb', buffering=0) as f:
+                with self.assertRaises(plistlib.InvalidFileException):
+                    plistlib.load(f, fmt=plistlib.FMT_BINARY)
+        for w in range(20, 64):
+            s = 1 << w
+            # data
+            check(self.build(b'\x4f\x13' + s.to_bytes(8, 'big')))
+            # ascii string
+            check(self.build(b'\x5f\x13' + s.to_bytes(8, 'big')))
+            # unicode string
+            check(self.build(b'\x6f\x13' + s.to_bytes(8, 'big')))
+            # array
+            check(self.build(b'\xaf\x13' + s.to_bytes(8, 'big')))
+            # dict
+            check(self.build(b'\xdf\x13' + s.to_bytes(8, 'big')))
+            # number of objects
+            check(b'bplist00' + struct.pack('>6xBBQQQ', 1, 1, s, 0, 8))
+
     def test_load_aware_datetime(self):
         data = (b'bplist003B\x04>\xd0d\x00\x00\x00\x08\x00\x00\x00\x00\x00\x00'
                 b'\x01\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00'
diff --git a/Misc/NEWS.d/next/Security/2024-05-21-22-11-31.gh-issue-119342.BTFj4Z.rst b/Misc/NEWS.d/next/Security/2024-05-21-22-11-31.gh-issue-119342.BTFj4Z.rst
new file mode 100644
index 00000000000000..937a963f8581d8
--- /dev/null
+++ b/Misc/NEWS.d/next/Security/2024-05-21-22-11-31.gh-issue-119342.BTFj4Z.rst
@@ -0,0 +1,2 @@
+Fix OOM vulnerability in :mod:`plistlib`, when reading a specially prepared
+small Plist file could cause consuming an arbitrary amount of memory.

From efcb8c5cf71a6604e4b5378feaa3c70f65abfedc Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Wed, 22 May 2024 16:00:50 +0300
Subject: [PATCH 2/3] Apply suggestions from code review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com>
---
 Lib/plistlib.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/Lib/plistlib.py b/Lib/plistlib.py
index 0365dc71090334..e81889c1ff845c 100644
--- a/Lib/plistlib.py
+++ b/Lib/plistlib.py
@@ -509,11 +509,6 @@ def _get_size(self, tokenL):
         return tokenL
 
     def _read(self, size):
-        if 0:
-            data = self._fp.read(size)
-            if len(data) != size:
-                raise InvalidFileException
-            return data
         cursize = min(size, 1 << 20)
         data = self._fp.read(cursize)
         while True:
@@ -527,8 +522,6 @@ def _read(self, size):
 
     def _read_ints(self, n, size):
         data = self._read(size * n)
-        #print('XXX', n, size)
-        #assert n < 100
         if size in _BINARY_FORMAT:
             return struct.unpack(f'>{n}{_BINARY_FORMAT[size]}', data)
         else:

From 412032bde93c6d42e46440fdda6e4c845109630a Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Wed, 22 May 2024 16:25:06 +0300
Subject: [PATCH 3/3] Use a named constant.

---
 Lib/plistlib.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/Lib/plistlib.py b/Lib/plistlib.py
index e81889c1ff845c..9d13a42c2a6c07 100644
--- a/Lib/plistlib.py
+++ b/Lib/plistlib.py
@@ -73,6 +73,9 @@
 PlistFormat = enum.Enum('PlistFormat', 'FMT_XML FMT_BINARY', module=__name__)
 globals().update(PlistFormat.__members__)
 
+# Data larger than this will be read in chunks, to prevent extreme
+# overallocation.
+_SAFE_BUF_SIZE = 1 << 20
 
 class UID:
     def __init__(self, data):
@@ -509,7 +512,7 @@ def _get_size(self, tokenL):
         return tokenL
 
     def _read(self, size):
-        cursize = min(size, 1 << 20)
+        cursize = min(size, _SAFE_BUF_SIZE)
         data = self._fp.read(cursize)
         while True:
             if len(data) != cursize: