bpo-45850: Implement deep-freeze on Windows (#29648)

Implement changes to build with deep-frozen modules on Windows.
Note that we now require Python 3.10 as the "bootstrap" or "host" Python.
This yields a modest startup speedup (around 7%) on Windows.
This commit is contained in:
Guido van Rossum 2021-11-22 10:09:48 -08:00 committed by GitHub
parent 4d6c0c0cce
commit 1037ca5a8e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 500 additions and 24 deletions

View file

@ -0,0 +1,2 @@
Implement changes to build with deep-frozen modules on Windows.
Note that we now require Python 3.10 as the "bootstrap" or "host" Python.

View file

@ -236,101 +236,141 @@
<ModName>importlib._bootstrap</ModName>
<IntFile>$(IntDir)importlib._bootstrap.g.h</IntFile>
<OutFile>$(PySourcePath)Python\frozen_modules\importlib._bootstrap.h</OutFile>
<DeepIntFile>$(IntDir)importlib._bootstrap.g.c</DeepIntFile>
<DeepOutFile>$(PySourcePath)Python\deepfreeze\df.importlib._bootstrap.c</DeepOutFile>
</None>
<None Include="..\Lib\importlib\_bootstrap_external.py">
<ModName>importlib._bootstrap_external</ModName>
<IntFile>$(IntDir)importlib._bootstrap_external.g.h</IntFile>
<OutFile>$(PySourcePath)Python\frozen_modules\importlib._bootstrap_external.h</OutFile>
<DeepIntFile>$(IntDir)importlib._bootstrap_external.g.c</DeepIntFile>
<DeepOutFile>$(PySourcePath)Python\deepfreeze\df.importlib._bootstrap_external.c</DeepOutFile>
</None>
<None Include="..\Lib\zipimport.py">
<ModName>zipimport</ModName>
<IntFile>$(IntDir)zipimport.g.h</IntFile>
<OutFile>$(PySourcePath)Python\frozen_modules\zipimport.h</OutFile>
<DeepIntFile>$(IntDir)zipimport.g.c</DeepIntFile>
<DeepOutFile>$(PySourcePath)Python\deepfreeze\df.zipimport.c</DeepOutFile>
</None>
<None Include="..\Lib\abc.py">
<ModName>abc</ModName>
<IntFile>$(IntDir)abc.g.h</IntFile>
<OutFile>$(PySourcePath)Python\frozen_modules\abc.h</OutFile>
<DeepIntFile>$(IntDir)abc.g.c</DeepIntFile>
<DeepOutFile>$(PySourcePath)Python\deepfreeze\df.abc.c</DeepOutFile>
</None>
<None Include="..\Lib\codecs.py">
<ModName>codecs</ModName>
<IntFile>$(IntDir)codecs.g.h</IntFile>
<OutFile>$(PySourcePath)Python\frozen_modules\codecs.h</OutFile>
<DeepIntFile>$(IntDir)codecs.g.c</DeepIntFile>
<DeepOutFile>$(PySourcePath)Python\deepfreeze\df.codecs.c</DeepOutFile>
</None>
<None Include="..\Lib\io.py">
<ModName>io</ModName>
<IntFile>$(IntDir)io.g.h</IntFile>
<OutFile>$(PySourcePath)Python\frozen_modules\io.h</OutFile>
<DeepIntFile>$(IntDir)io.g.c</DeepIntFile>
<DeepOutFile>$(PySourcePath)Python\deepfreeze\df.io.c</DeepOutFile>
</None>
<None Include="..\Lib\_collections_abc.py">
<ModName>_collections_abc</ModName>
<IntFile>$(IntDir)_collections_abc.g.h</IntFile>
<OutFile>$(PySourcePath)Python\frozen_modules\_collections_abc.h</OutFile>
<DeepIntFile>$(IntDir)_collections_abc.g.c</DeepIntFile>
<DeepOutFile>$(PySourcePath)Python\deepfreeze\df._collections_abc.c</DeepOutFile>
</None>
<None Include="..\Lib\_sitebuiltins.py">
<ModName>_sitebuiltins</ModName>
<IntFile>$(IntDir)_sitebuiltins.g.h</IntFile>
<OutFile>$(PySourcePath)Python\frozen_modules\_sitebuiltins.h</OutFile>
<DeepIntFile>$(IntDir)_sitebuiltins.g.c</DeepIntFile>
<DeepOutFile>$(PySourcePath)Python\deepfreeze\df._sitebuiltins.c</DeepOutFile>
</None>
<None Include="..\Lib\genericpath.py">
<ModName>genericpath</ModName>
<IntFile>$(IntDir)genericpath.g.h</IntFile>
<OutFile>$(PySourcePath)Python\frozen_modules\genericpath.h</OutFile>
<DeepIntFile>$(IntDir)genericpath.g.c</DeepIntFile>
<DeepOutFile>$(PySourcePath)Python\deepfreeze\df.genericpath.c</DeepOutFile>
</None>
<None Include="..\Lib\ntpath.py">
<ModName>ntpath</ModName>
<IntFile>$(IntDir)ntpath.g.h</IntFile>
<OutFile>$(PySourcePath)Python\frozen_modules\ntpath.h</OutFile>
<DeepIntFile>$(IntDir)ntpath.g.c</DeepIntFile>
<DeepOutFile>$(PySourcePath)Python\deepfreeze\df.ntpath.c</DeepOutFile>
</None>
<None Include="..\Lib\posixpath.py">
<ModName>posixpath</ModName>
<IntFile>$(IntDir)posixpath.g.h</IntFile>
<OutFile>$(PySourcePath)Python\frozen_modules\posixpath.h</OutFile>
<DeepIntFile>$(IntDir)posixpath.g.c</DeepIntFile>
<DeepOutFile>$(PySourcePath)Python\deepfreeze\df.posixpath.c</DeepOutFile>
</None>
<None Include="..\Lib\os.py">
<ModName>os</ModName>
<IntFile>$(IntDir)os.g.h</IntFile>
<OutFile>$(PySourcePath)Python\frozen_modules\os.h</OutFile>
<DeepIntFile>$(IntDir)os.g.c</DeepIntFile>
<DeepOutFile>$(PySourcePath)Python\deepfreeze\df.os.c</DeepOutFile>
</None>
<None Include="..\Lib\site.py">
<ModName>site</ModName>
<IntFile>$(IntDir)site.g.h</IntFile>
<OutFile>$(PySourcePath)Python\frozen_modules\site.h</OutFile>
<DeepIntFile>$(IntDir)site.g.c</DeepIntFile>
<DeepOutFile>$(PySourcePath)Python\deepfreeze\df.site.c</DeepOutFile>
</None>
<None Include="..\Lib\stat.py">
<ModName>stat</ModName>
<IntFile>$(IntDir)stat.g.h</IntFile>
<OutFile>$(PySourcePath)Python\frozen_modules\stat.h</OutFile>
<DeepIntFile>$(IntDir)stat.g.c</DeepIntFile>
<DeepOutFile>$(PySourcePath)Python\deepfreeze\df.stat.c</DeepOutFile>
</None>
<None Include="..\Lib\__hello__.py">
<ModName>__hello__</ModName>
<IntFile>$(IntDir)__hello__.g.h</IntFile>
<OutFile>$(PySourcePath)Python\frozen_modules\__hello__.h</OutFile>
<DeepIntFile>$(IntDir)__hello__.g.c</DeepIntFile>
<DeepOutFile>$(PySourcePath)Python\deepfreeze\df.__hello__.c</DeepOutFile>
</None>
<None Include="..\Lib\__phello__\__init__.py">
<ModName>__phello__</ModName>
<IntFile>$(IntDir)__phello__.g.h</IntFile>
<OutFile>$(PySourcePath)Python\frozen_modules\__phello__.h</OutFile>
<DeepIntFile>$(IntDir)__phello__.g.c</DeepIntFile>
<DeepOutFile>$(PySourcePath)Python\deepfreeze\df.__phello__.c</DeepOutFile>
</None>
<None Include="..\Lib\__phello__\ham\__init__.py">
<ModName>__phello__.ham</ModName>
<IntFile>$(IntDir)__phello__.ham.g.h</IntFile>
<OutFile>$(PySourcePath)Python\frozen_modules\__phello__.ham.h</OutFile>
<DeepIntFile>$(IntDir)__phello__.ham.g.c</DeepIntFile>
<DeepOutFile>$(PySourcePath)Python\deepfreeze\df.__phello__.ham.c</DeepOutFile>
</None>
<None Include="..\Lib\__phello__\ham\eggs.py">
<ModName>__phello__.ham.eggs</ModName>
<IntFile>$(IntDir)__phello__.ham.eggs.g.h</IntFile>
<OutFile>$(PySourcePath)Python\frozen_modules\__phello__.ham.eggs.h</OutFile>
<DeepIntFile>$(IntDir)__phello__.ham.eggs.g.c</DeepIntFile>
<DeepOutFile>$(PySourcePath)Python\deepfreeze\df.__phello__.ham.eggs.c</DeepOutFile>
</None>
<None Include="..\Lib\__phello__\spam.py">
<ModName>__phello__.spam</ModName>
<IntFile>$(IntDir)__phello__.spam.g.h</IntFile>
<OutFile>$(PySourcePath)Python\frozen_modules\__phello__.spam.h</OutFile>
<DeepIntFile>$(IntDir)__phello__.spam.g.c</DeepIntFile>
<DeepOutFile>$(PySourcePath)Python\deepfreeze\df.__phello__.spam.c</DeepOutFile>
</None>
<None Include="..\Tools\freeze\flag.py">
<ModName>frozen_only</ModName>
<IntFile>$(IntDir)frozen_only.g.h</IntFile>
<OutFile>$(PySourcePath)Python\frozen_modules\frozen_only.h</OutFile>
<DeepIntFile>$(IntDir)frozen_only.g.c</DeepIntFile>
<DeepOutFile>$(PySourcePath)Python\deepfreeze\df.frozen_only.c</DeepOutFile>
</None>
<!-- END frozen modules -->
</ItemGroup>
@ -338,17 +378,29 @@
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
<Target Name="_RebuildFrozen" AfterTargets="AfterBuild" Condition="$(Configuration) != 'PGUpdate'">
  <!-- Run the just-built target once per <None> item, writing the frozen
       header for each module to its intermediate file (IntFile, *.g.h).
       DeepIntFile (*.g.c) is deliberately not touched here. -->
  <Exec Command='"$(TargetPath)" "%(None.ModName)" "%(None.FullPath)" "%(None.IntFile)"' />
  <!-- Copy the intermediate header over the checked-in one only when the
       destination is missing or its text differs after normalising CRLF to
       LF, so line-ending-only differences do not count as a change. -->
  <Copy SourceFiles="%(None.IntFile)"
        DestinationFiles="%(None.OutFile)"
        Condition="!Exists(%(None.OutFile)) or (Exists(%(None.IntFile)) and '$([System.IO.File]::ReadAllText(%(None.OutFile)).Replace(`&#x0D;&#x0A;`, `&#x0A;`))' != '$([System.IO.File]::ReadAllText(%(None.IntFile)).Replace(`&#x0D;&#x0A;`, `&#x0A;`))')">
    <Output TaskParameter="CopiedFiles" ItemName="_Updated" />
  </Copy>
  <!-- Report which headers were actually refreshed. -->
  <Message Text="Updated files: @(_Updated->'%(Filename)%(Extension)',', ')"
           Condition="'@(_Updated)' != ''" Importance="high" />
</Target>
<Target Name="_RebuildDeepFrozen" AfterTargets="_RebuildFrozen" Condition="$(Configuration) != 'PGUpdate'">
  <!-- Run deepfreeze.py on each frozen header (OutFile) and write the
       generated C source to the dedicated intermediate DeepIntFile (*.g.c).
       Writing to IntFile (*.g.h) instead would overwrite the intermediate
       header produced by _RebuildFrozen. -->
  <Exec Command='$(PythonForBuild) "$(PySourcePath)Tools\scripts\deepfreeze.py" "%(None.OutFile)" "-m" "%(None.ModName)" -o "%(None.DeepIntFile)"' />
  <!-- Copy to the deepfreeze source tree only when the destination is
       missing or its text differs after normalising CRLF to LF. -->
  <Copy SourceFiles="%(None.DeepIntFile)"
        DestinationFiles="%(None.DeepOutFile)"
        Condition="!Exists(%(None.DeepOutFile)) or (Exists(%(None.DeepIntFile)) and '$([System.IO.File]::ReadAllText(%(None.DeepOutFile)).Replace(`&#x0D;&#x0A;`, `&#x0A;`))' != '$([System.IO.File]::ReadAllText(%(None.DeepIntFile)).Replace(`&#x0D;&#x0A;`, `&#x0A;`))')">
    <Output TaskParameter="CopiedFiles" ItemName="_DeepUpdated" />
  </Copy>
  <!-- Report which deep-frozen sources were actually refreshed. -->
  <Message Text="Updated files: @(_DeepUpdated->'%(Filename)%(Extension)',', ')"
           Condition="'@(_DeepUpdated)' != ''" Importance="high" />
</Target>
<Target Name="_CleanFrozen" BeforeTargets="CoreClean" Condition="$(Configuration) != 'PGUpdate'">
<ItemGroup>
<Clean Include="%(None.IntFile)" />

View file

@ -31,13 +31,13 @@
@if "%_Py_EXTERNALS_DIR%"=="" (set _Py_EXTERNALS_DIR=%~dp0\..\externals)
@rem If we have Python in externals, use that one
@if exist "%_Py_EXTERNALS_DIR%\pythonx86\tools\python.exe" (set PYTHON="%_Py_EXTERNALS_DIR%\pythonx86\tools\python.exe") & (set _Py_Python_Source=found in externals directory) & goto :found
@if exist "%_Py_EXTERNALS_DIR%\pythonx86\tools\python.exe" ("%_Py_EXTERNALS_DIR%\pythonx86\tools\python.exe" -Ec "import sys; assert sys.version_info[:2] >= (3, 10)" >nul 2>nul) && (set PYTHON="%_Py_EXTERNALS_DIR%\pythonx86\tools\python.exe") && (set _Py_Python_Source=found in externals directory) && goto :found || rmdir /Q /S "%_Py_EXTERNALS_DIR%\pythonx86"
@rem If HOST_PYTHON is recent enough, use that
@if NOT "%HOST_PYTHON%"=="" @%HOST_PYTHON% -Ec "import sys; assert sys.version_info[:2] >= (3, 8)" >nul 2>nul && (set PYTHON="%HOST_PYTHON%") && (set _Py_Python_Source=found as HOST_PYTHON) && goto :found
@if NOT "%HOST_PYTHON%"=="" @%HOST_PYTHON% -Ec "import sys; assert sys.version_info[:2] >= (3, 10)" >nul 2>nul && (set PYTHON="%HOST_PYTHON%") && (set _Py_Python_Source=found as HOST_PYTHON) && goto :found
@rem If py.exe finds a recent enough version, use that one
@for %%p in (3.9 3.8) do @py -%%p -EV >nul 2>&1 && (set PYTHON=py -%%p) && (set _Py_Python_Source=found %%p with py.exe) && goto :found
@for %%p in (3.10) do @py -%%p -EV >nul 2>&1 && (set PYTHON=py -%%p) && (set _Py_Python_Source=found %%p with py.exe) && goto :found
@if NOT exist "%_Py_EXTERNALS_DIR%" mkdir "%_Py_EXTERNALS_DIR%"
@set _Py_NUGET=%NUGET%

View file

@ -502,6 +502,30 @@
<ClCompile Include="..\Python\thread.c" />
<ClCompile Include="..\Python\traceback.c" />
</ItemGroup>
<ItemGroup>
<!-- BEGIN deepfreeze -->
<ClCompile Include="..\Python\deepfreeze\df.importlib._bootstrap.c" />
<ClCompile Include="..\Python\deepfreeze\df.importlib._bootstrap_external.c" />
<ClCompile Include="..\Python\deepfreeze\df.zipimport.c" />
<ClCompile Include="..\Python\deepfreeze\df.abc.c" />
<ClCompile Include="..\Python\deepfreeze\df.codecs.c" />
<ClCompile Include="..\Python\deepfreeze\df.io.c" />
<ClCompile Include="..\Python\deepfreeze\df._collections_abc.c" />
<ClCompile Include="..\Python\deepfreeze\df._sitebuiltins.c" />
<ClCompile Include="..\Python\deepfreeze\df.genericpath.c" />
<ClCompile Include="..\Python\deepfreeze\df.ntpath.c" />
<ClCompile Include="..\Python\deepfreeze\df.posixpath.c" />
<ClCompile Include="..\Python\deepfreeze\df.os.c" />
<ClCompile Include="..\Python\deepfreeze\df.site.c" />
<ClCompile Include="..\Python\deepfreeze\df.stat.c" />
<ClCompile Include="..\Python\deepfreeze\df.__hello__.c" />
<ClCompile Include="..\Python\deepfreeze\df.__phello__.c" />
<ClCompile Include="..\Python\deepfreeze\df.__phello__.ham.c" />
<ClCompile Include="..\Python\deepfreeze\df.__phello__.ham.eggs.c" />
<ClCompile Include="..\Python\deepfreeze\df.__phello__.spam.c" />
<ClCompile Include="..\Python\deepfreeze\df.frozen_only.c" />
<!-- END deepfreeze -->
</ItemGroup>
<ItemGroup Condition="$(IncludeExternals)">
<ClCompile Include="..\Modules\zlibmodule.c" />
<ClCompile Include="$(zlibDir)\adler32.c" />

View file

@ -61,12 +61,7 @@
#include "frozen_modules/frozen_only.h"
/* End includes */
#ifdef MS_WINDOWS
/* Deepfreeze isn't supported on Windows yet. */
#define GET_CODE(name) NULL
#else
#define GET_CODE(name) _Py_get_##name##_toplevel
#endif
/* Start extern declarations */
extern PyObject *_Py_get_importlib__bootstrap_toplevel(void);

View file

@ -1,13 +1,16 @@
import argparse
import ast
import builtins
import collections
import contextlib
import os
import sys
import re
import time
import types
import typing
import umarshal
verbose = False
@ -55,7 +58,8 @@ def get_localsplus_counts(code: types.CodeType,
nplaincellvars += 1
elif kind & CO_FAST_FREE:
nfreevars += 1
assert nlocals == len(code.co_varnames) == code.co_nlocals
assert nlocals == len(code.co_varnames) == code.co_nlocals, \
(nlocals, len(code.co_varnames), code.co_nlocals)
assert ncellvars == len(code.co_cellvars)
assert nfreevars == len(code.co_freevars)
assert len(names) == nlocals + nplaincellvars + nfreevars
@ -274,14 +278,7 @@ def generate_tuple(self, name: str, t: tuple[object, ...]) -> str:
self.write(item + ",")
return f"& {name}._object.ob_base.ob_base"
def generate_int(self, name: str, i: int) -> str:
maxint = sys.maxsize
if maxint == 2**31 - 1:
digit = 2**15
elif maxint == 2**63 - 1:
digit = 2**30
else:
assert False, f"What int size is this system?!? {maxint=}"
def _generate_int_for_bits(self, name: str, i: int, digit: int) -> None:
sign = -1 if i < 0 else 0 if i == 0 else +1
i = abs(i)
digits: list[int] = []
@ -298,6 +295,20 @@ def generate_int(self, name: str, i: int) -> str:
if digits:
ds = ", ".join(map(str, digits))
self.write(f".ob_digit = {{ {ds} }},")
def generate_int(self, name: str, i: int) -> str:
    """Emit the initialiser for the int constant *i* and return a C reference.

    Values whose magnitude is below 2**15 are emitted once, using a digit
    base of 2**15. Larger values are emitted twice, once per supported
    digit width, wrapped in preprocessor conditionals on
    PYLONG_BITS_IN_DIGIT so the C compiler selects the matching variant.

    Returns the C expression addressing the generated object's base.
    """
    if abs(i) >= 2**15:
        # One branch per supported digit width: "#if" first, then "#elif".
        for directive, bits_in_digit in zip(("if", "elif"), (15, 30)):
            self.write(f"#{directive} PYLONG_BITS_IN_DIGIT == {bits_in_digit}")
            self._generate_int_for_bits(name, i, 2**bits_in_digit)
        # If neither clause applies, it won't compile
        self.write("#else")
        self.write('#error "PYLONG_BITS_IN_DIGIT should be 15 or 30"')
        self.write("#endif")
    else:
        self._generate_int_for_bits(name, i, 2**15)
    return f"& {name}.ob_base.ob_base"
def generate_float(self, name: str, x: float) -> str:
@ -326,7 +337,7 @@ def generate(self, name: str, obj: object) -> str:
return self.cache[key]
self.misses += 1
match obj:
case types.CodeType() as code:
case types.CodeType() | umarshal.Code() as code:
val = self.generate_code(name, code)
case tuple(t):
val = self.generate_tuple(name, t)
@ -367,8 +378,31 @@ def generate(self, name: str, obj: object) -> str:
}
"""
FROZEN_COMMENT = "/* Auto-generated by Programs/_freeze_module.c */"
FROZEN_DATA_LINE = r"\s*(\d+,\s*)+\s*"
def is_frozen_header(source: str) -> bool:
    """Return True when *source* begins with the FROZEN_COMMENT marker."""
    marker = FROZEN_COMMENT
    return source[:len(marker)] == marker
def decode_frozen_data(source: str) -> types.CodeType:
    """Recover the marshalled object embedded in a frozen header.

    Lines before the first and after the last line matching
    FROZEN_DATA_LINE are discarded; the remaining text is evaluated as a
    literal sequence of integers, converted to bytes and handed to
    umarshal.loads().
    """
    is_data_line = re.compile(FROZEN_DATA_LINE).match
    rows = source.splitlines()

    # Skip the leading lines that are not data.
    first = 0
    while first < len(rows) and is_data_line(rows[first]) is None:
        first += 1

    # Skip the trailing lines that are not data.
    stop = len(rows)
    while stop > first and is_data_line(rows[stop - 1]) is None:
        stop -= 1

    values: tuple[int, ...] = ast.literal_eval("".join(rows[first:stop]))
    return umarshal.loads(bytes(values))
def generate(source: str, filename: str, modname: str, file: typing.TextIO) -> None:
code = compile(source, filename, "exec")
if is_frozen_header(source):
code = decode_frozen_data(source)
else:
code = compile(source, filename, "exec")
printer = Printer(file)
printer.generate("toplevel", code)
printer.write("")

View file

@ -11,7 +11,6 @@
import platform
import subprocess
import sys
import textwrap
import time
from update_file import updating_file_with_tmpfile, update_file_with_tmpfile
@ -55,6 +54,7 @@ def find_tool():
MAKEFILE = os.path.join(ROOT_DIR, 'Makefile.pre.in')
PCBUILD_PROJECT = os.path.join(ROOT_DIR, 'PCbuild', '_freeze_module.vcxproj')
PCBUILD_FILTERS = os.path.join(ROOT_DIR, 'PCbuild', '_freeze_module.vcxproj.filters')
PCBUILD_PYTHONCORE = os.path.join(ROOT_DIR, 'PCbuild', 'pythoncore.vcxproj')
OS_PATH = 'ntpath' if os.name == 'nt' else 'posixpath'
@ -717,20 +717,28 @@ def regen_makefile(modules):
def regen_pcbuild(modules):
projlines = []
filterlines = []
corelines = []
for src in _iter_sources(modules):
pyfile = relpath_for_windows_display(src.pyfile, ROOT_DIR)
header = relpath_for_windows_display(src.frozenfile, ROOT_DIR)
deepbase = "df." + src.id
deepoutfile = f"Python\\deepfreeze\\{deepbase}.c"
intfile = ntpath.splitext(ntpath.basename(header))[0] + '.g.h'
deepintfile = ntpath.splitext(ntpath.basename(header))[0] + '.g.c'
projlines.append(f' <None Include="..\\{pyfile}">')
projlines.append(f' <ModName>{src.frozenid}</ModName>')
projlines.append(f' <IntFile>$(IntDir){intfile}</IntFile>')
projlines.append(f' <OutFile>$(PySourcePath){header}</OutFile>')
projlines.append(f' <DeepIntFile>$(IntDir){deepintfile}</DeepIntFile>')
projlines.append(f' <DeepOutFile>$(PySourcePath){deepoutfile}</DeepOutFile>')
projlines.append(f' </None>')
filterlines.append(f' <None Include="..\\{pyfile}">')
filterlines.append(' <Filter>Python Files</Filter>')
filterlines.append(' </None>')
corelines.append(f' <ClCompile Include="..\\{deepoutfile}" />')
print(f'# Updating {os.path.relpath(PCBUILD_PROJECT)}')
with updating_file_with_tmpfile(PCBUILD_PROJECT) as (infile, outfile):
lines = infile.readlines()
@ -753,6 +761,17 @@ def regen_pcbuild(modules):
PCBUILD_FILTERS,
)
outfile.writelines(lines)
# Rewrite the generated <ClCompile> list between the deepfreeze markers
# in pythoncore.vcxproj.
print(f'# Updating {os.path.relpath(PCBUILD_PYTHONCORE)}')
with updating_file_with_tmpfile(PCBUILD_PYTHONCORE) as (infile, outfile):
    lines = infile.readlines()
    lines = replace_block(
        lines,
        '<!-- BEGIN deepfreeze -->',
        '<!-- END deepfreeze -->',
        corelines,
        # Identify the file actually being edited; the previous value
        # (PCBUILD_FILTERS) named a different file than the one opened above.
        # NOTE(review): presumably replace_block() only uses this argument
        # for diagnostics -- confirm against its definition.
        PCBUILD_PYTHONCORE,
    )
    outfile.writelines(lines)
#######################################

View file

@ -0,0 +1,22 @@
# Quick script to time startup for various binaries
import subprocess
import sys
import time
NREPS = 100
def main():
    """Time NREPS runs of ``<binary> -c pass`` for each binary on the command line.

    Prints the mean wall-clock seconds per run followed by the binary's
    path. A non-zero exit status from any run raises
    subprocess.CalledProcessError.
    """
    for binary in sys.argv[1:]:
        started = time.time()
        for _ in range(NREPS):
            subprocess.run([binary, "-c", "pass"]).check_returncode()
        finished = time.time()
        print(f"{(finished-started)/NREPS:6.3f} {binary}")
if __name__ == "__main__":
main()

328
Tools/scripts/umarshal.py Normal file
View file

@ -0,0 +1,328 @@
# Implement marshal.loads() in pure Python
import ast
from typing import Any
class Type:
    """One-byte type codes used by the marshal format (adapted from marshal.c)."""

    # Singletons and markers
    NULL = ord("0")
    NONE = ord("N")
    FALSE = ord("F")
    TRUE = ord("T")
    STOPITER = ord("S")
    ELLIPSIS = ord(".")

    # Numbers
    INT = ord("i")
    INT64 = ord("I")
    FLOAT = ord("f")
    BINARY_FLOAT = ord("g")
    COMPLEX = ord("x")
    BINARY_COMPLEX = ord("y")
    LONG = ord("l")

    # Byte strings, back-references, containers and code
    STRING = ord("s")
    INTERNED = ord("t")
    REF = ord("r")
    TUPLE = ord("(")
    LIST = ord("[")
    DICT = ord("{")
    CODE = ord("c")
    UNICODE = ord("u")
    UNKNOWN = ord("?")
    SET = ord("<")
    FROZENSET = ord(">")

    # Compact string and tuple forms
    ASCII = ord("a")
    ASCII_INTERNED = ord("A")
    SMALL_TUPLE = ord(")")
    SHORT_ASCII = ord("z")
    SHORT_ASCII_INTERNED = ord("Z")
FLAG_REF = 0x80 # with a type, add obj to index
NULL = object() # marker
# Cell kinds
CO_FAST_LOCAL = 0x20
CO_FAST_CELL = 0x40
CO_FAST_FREE = 0x80
class Code:
    """Plain attribute bag standing in for a code object.

    Every keyword passed to the constructor becomes an instance attribute.
    The co_varnames / co_cellvars / co_freevars / co_nlocals properties are
    derived from co_localsplusnames and co_localspluskinds.
    """

    co_localsplusnames: tuple[str]
    co_localspluskinds: tuple[int]

    def __init__(self, **kwds: Any):
        vars(self).update(kwds)

    def __repr__(self) -> str:
        return f"Code(**{self.__dict__})"

    def get_localsplus_names(self, select_kind: int) -> tuple[str, ...]:
        """Return the names whose kind byte has any bit of *select_kind* set."""
        return tuple(
            name
            for name, kind in zip(self.co_localsplusnames,
                                  self.co_localspluskinds)
            if kind & select_kind
        )

    @property
    def co_varnames(self) -> tuple[str, ...]:
        return self.get_localsplus_names(CO_FAST_LOCAL)

    @property
    def co_cellvars(self) -> tuple[str, ...]:
        return self.get_localsplus_names(CO_FAST_CELL)

    @property
    def co_freevars(self) -> tuple[str, ...]:
        return self.get_localsplus_names(CO_FAST_FREE)

    @property
    def co_nlocals(self) -> int:
        return len(self.co_varnames)
class Reader:
# A fairly literal translation of the marshal reader.
def __init__(self, data: bytes):
self.data: bytes = data
self.end: int = len(self.data)
self.pos: int = 0
self.refs: list[Any] = []
self.level: int = 0
def r_string(self, n: int) -> bytes:
assert 0 <= n <= self.end - self.pos
buf = self.data[self.pos : self.pos + n]
self.pos += n
return buf
def r_byte(self) -> int:
buf = self.r_string(1)
return buf[0]
def r_short(self) -> int:
buf = self.r_string(2)
x = buf[0]
x |= buf[1] << 8
x |= -(x & (1<<15)) # Sign-extend
return x
def r_long(self) -> int:
buf = self.r_string(4)
x = buf[0]
x |= buf[1] << 8
x |= buf[2] << 16
x |= buf[3] << 24
x |= -(x & (1<<31)) # Sign-extend
return x
def r_long64(self) -> int:
buf = self.r_string(8)
x = buf[0]
x |= buf[1] << 8
x |= buf[2] << 16
x |= buf[3] << 24
x |= buf[1] << 32
x |= buf[1] << 40
x |= buf[1] << 48
x |= buf[1] << 56
x |= -(x & (1<<63)) # Sign-extend
return x
def r_PyLong(self) -> int:
n = self.r_long()
size = abs(n)
x = 0
# Pray this is right
for i in range(size):
x |= self.r_short() << i*15
if n < 0:
x = -x
return x
def r_float_bin(self) -> float:
buf = self.r_string(8)
import struct # Lazy import to avoid breaking UNIX build
return struct.unpack("d", buf)[0]
def r_float_str(self) -> float:
n = self.r_byte()
buf = self.r_string(n)
return ast.literal_eval(buf.decode("ascii"))
def r_ref_reserve(self, flag: int) -> int:
if flag:
idx = len(self.refs)
self.refs.append(None)
return idx
else:
return 0
def r_ref_insert(self, obj: Any, idx: int, flag: int) -> Any:
if flag:
self.refs[idx] = obj
return obj
def r_ref(self, obj: Any, flag: int) -> Any:
assert flag & FLAG_REF
self.refs.append(obj)
return obj
def r_object(self) -> Any:
old_level = self.level
try:
return self._r_object()
finally:
self.level = old_level
def _r_object(self) -> Any:
code = self.r_byte()
flag = code & FLAG_REF
type = code & ~FLAG_REF
# print(" "*self.level + f"{code} {flag} {type} {chr(type)!r}")
self.level += 1
def R_REF(obj: Any) -> Any:
if flag:
obj = self.r_ref(obj, flag)
return obj
match type:
case Type.NULL:
return NULL
case Type.NONE:
return None
case Type.ELLIPSIS:
return Ellipsis
case Type.FALSE:
return False
case Type.TRUE:
return True
case Type.INT:
return R_REF(self.r_long())
case Type.INT64:
return R_REF(self.r_long64())
case Type.LONG:
return R_REF(self.r_PyLong())
case Type.FLOAT:
return R_REF(self.r_float_str())
case Type.BINARY_FLOAT:
return R_REF(self.r_float_bin())
case Type.COMPLEX:
return R_REF(complex(self.r_float_str(),
self.r_float_str()))
case Type.BINARY_COMPLEX:
return R_REF(complex(self.r_float_bin(),
self.r_float_bin()))
case Type.STRING:
n = self.r_long()
return R_REF(self.r_string(n))
case Type.ASCII_INTERNED | Type.ASCII:
n = self.r_long()
return R_REF(self.r_string(n).decode("ascii"))
case Type.SHORT_ASCII_INTERNED | Type.SHORT_ASCII:
n = self.r_byte()
return R_REF(self.r_string(n).decode("ascii"))
case Type.INTERNED | Type.UNICODE:
n = self.r_long()
return R_REF(self.r_string(n).decode("utf8", "surrogatepass"))
case Type.SMALL_TUPLE:
n = self.r_byte()
idx = self.r_ref_reserve(flag)
retval: Any = tuple(self.r_object() for _ in range(n))
self.r_ref_insert(retval, idx, flag)
return retval
case Type.TUPLE:
n = self.r_long()
idx = self.r_ref_reserve(flag)
retval = tuple(self.r_object() for _ in range(n))
self.r_ref_insert(retval, idx, flag)
return retval
case Type.LIST:
n = self.r_long()
retval = R_REF([])
for _ in range(n):
retval.append(self.r_object())
return retval
case Type.DICT:
retval = R_REF({})
while True:
key = self.r_object()
if key == NULL:
break
val = self.r_object()
retval[key] = val
return retval
case Type.SET:
n = self.r_long()
retval = R_REF(set())
for _ in range(n):
v = self.r_object()
retval.add(v)
return retval
case Type.FROZENSET:
n = self.r_long()
s: set[Any] = set()
idx = self.r_ref_reserve(flag)
for _ in range(n):
v = self.r_object()
s.add(v)
retval = frozenset(s)
self.r_ref_insert(retval, idx, flag)
return retval
case Type.CODE:
retval = R_REF(Code())
retval.co_argcount = self.r_long()
retval.co_posonlyargcount = self.r_long()
retval.co_kwonlyargcount = self.r_long()
retval.co_stacksize = self.r_long()
retval.co_flags = self.r_long()
retval.co_code = self.r_object()
retval.co_consts = self.r_object()
retval.co_names = self.r_object()
retval.co_localsplusnames = self.r_object()
retval.co_localspluskinds = self.r_object()
retval.co_filename = self.r_object()
retval.co_name = self.r_object()
retval.co_qualname = self.r_object()
retval.co_firstlineno = self.r_long()
retval.co_linetable = self.r_object()
retval.co_endlinetable = self.r_object()
retval.co_columntable = self.r_object()
retval.co_exceptiontable = self.r_object()
return retval
case Type.REF:
n = self.r_long()
retval = self.refs[n]
assert retval is not None
return retval
case _:
breakpoint()
raise AssertionError(f"Unknown type {type} {chr(type)!r}")
def loads(data: bytes) -> Any:
    """Decode the marshal byte string *data* and return the resulting object."""
    assert isinstance(data, bytes)
    return Reader(data).r_object()
def main():
    """Self-test: round-trip a nested container and a code object."""
    # Test
    import marshal
    import pprint

    # A container sample must come back equal to the original.
    sample = {'foo': {(42, "bar", 3.14)}}
    retval = loads(marshal.dumps(sample))
    assert retval == sample, retval

    # A code object must come back as a Code instance.
    sample = main.__code__
    retval = loads(marshal.dumps(sample))
    assert isinstance(retval, Code), retval
    pprint.pprint(retval.__dict__)
if __name__ == "__main__":
main()