Skip to content

Commit 3d8c38f

Browse files
authored
GH-135904: Improve the JIT's performance on macOS (GH-136528)
1 parent a68ddea commit 3d8c38f

File tree

6 files changed

+73
-61
lines changed

6 files changed

+73
-61
lines changed

Python/jit.c

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -431,8 +431,10 @@ void patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *s
431431

432432
// Per-architecture layout constants for JIT-allocated memory.
#if defined(__aarch64__) || defined(_M_ARM64)
#define TRAMPOLINE_SIZE 16
// Used to compute the padding inserted after the code and trampolines, so
// that the data region which follows starts on a DATA_ALIGN-byte boundary:
#define DATA_ALIGN 8
#else
#define TRAMPOLINE_SIZE 0
// DATA_ALIGN must stay a power of two: it is used as a bit mask
// (DATA_ALIGN - 1) when the padding is computed.
#define DATA_ALIGN 1
#endif
437439

438440
// Generate and patch AArch64 trampolines. The symbols to jump to are stored
@@ -522,8 +524,9 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
522524
// Round up to the nearest page:
523525
size_t page_size = get_page_size();
524526
assert((page_size & (page_size - 1)) == 0);
525-
size_t padding = page_size - ((code_size + state.trampolines.size + data_size) & (page_size - 1));
526-
size_t total_size = code_size + state.trampolines.size + data_size + padding;
527+
size_t code_padding = DATA_ALIGN - ((code_size + state.trampolines.size) & (DATA_ALIGN - 1));
528+
size_t padding = page_size - ((code_size + state.trampolines.size + code_padding + data_size) & (page_size - 1));
529+
size_t total_size = code_size + state.trampolines.size + code_padding + data_size + padding;
527530
unsigned char *memory = jit_alloc(total_size);
528531
if (memory == NULL) {
529532
return -1;
@@ -545,7 +548,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
545548
// Loop again to emit the code:
546549
unsigned char *code = memory;
547550
state.trampolines.mem = memory + code_size;
548-
unsigned char *data = memory + code_size + state.trampolines.size;
551+
unsigned char *data = memory + code_size + state.trampolines.size + code_padding;
549552
// Compile the shim, which handles converting between the native
550553
// calling convention and the calling convention used by jitted code
551554
// (which may be different for efficiency reasons).
@@ -567,7 +570,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
567570
code += group->code_size;
568571
data += group->data_size;
569572
assert(code == memory + code_size);
570-
assert(data == memory + code_size + state.trampolines.size + data_size);
573+
assert(data == memory + code_size + state.trampolines.size + code_padding + data_size);
571574
#ifdef MAP_JIT
572575
pthread_jit_write_protect_np(1);
573576
#endif

Tools/jit/_optimizers.py

Lines changed: 12 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -70,21 +70,21 @@ class Optimizer:
7070

7171
path: pathlib.Path
7272
_: dataclasses.KW_ONLY
73-
# prefix used to mangle symbols on some platforms:
74-
prefix: str = ""
73+
# Prefixes used to mangle local labels and symbols:
74+
label_prefix: str
75+
symbol_prefix: str
7576
# The first block in the linked list:
7677
_root: _Block = dataclasses.field(init=False, default_factory=_Block)
7778
_labels: dict[str, _Block] = dataclasses.field(init=False, default_factory=dict)
7879
# No groups:
7980
_re_noninstructions: typing.ClassVar[re.Pattern[str]] = re.compile(
80-
r"\s*(?:\.|#|//|$)"
81+
r"\s*(?:\.|#|//|;|$)"
8182
)
8283
# One group (label):
8384
_re_label: typing.ClassVar[re.Pattern[str]] = re.compile(
8485
r'\s*(?P<label>[\w."$?@]+):'
8586
)
8687
# Override everything that follows in subclasses:
87-
_alignment: typing.ClassVar[int] = 1
8888
_branches: typing.ClassVar[dict[str, str | None]] = {}
8989
# Two groups (instruction and target):
9090
_re_branch: typing.ClassVar[re.Pattern[str]] = _RE_NEVER_MATCH
@@ -131,8 +131,12 @@ def __post_init__(self) -> None:
131131
block.fallthrough = False
132132

133133
def _preprocess(self, text: str) -> str:
134-
# Override this method to do preprocessing of the textual assembly:
135-
return text
134+
# Override this method to do preprocessing of the textual assembly.
135+
# In all cases, replace references to the _JIT_CONTINUE symbol with
136+
# references to a local _JIT_CONTINUE label (which we will add later):
137+
continue_symbol = rf"\b{re.escape(self.symbol_prefix)}_JIT_CONTINUE\b"
138+
continue_label = f"{self.label_prefix}_JIT_CONTINUE"
139+
return re.sub(continue_symbol, continue_label, text)
136140

137141
@classmethod
138142
def _invert_branch(cls, line: str, target: str) -> str | None:
@@ -197,15 +201,12 @@ def _insert_continue_label(self) -> None:
197201
# jmp FOO
198202
# After:
199203
# jmp FOO
200-
# .balign 8
201204
# _JIT_CONTINUE:
202205
# This lets the assembler encode _JIT_CONTINUE jumps at build time!
203-
align = _Block()
204-
align.noninstructions.append(f"\t.balign\t{self._alignment}")
205-
continuation = self._lookup_label(f"{self.prefix}_JIT_CONTINUE")
206+
continuation = self._lookup_label(f"{self.label_prefix}_JIT_CONTINUE")
206207
assert continuation.label
207208
continuation.noninstructions.append(f"{continuation.label}:")
208-
end.link, align.link, continuation.link = align, continuation, end.link
209+
end.link, continuation.link = continuation, end.link
209210

210211
def _mark_hot_blocks(self) -> None:
211212
# Start with the last block, and perform a DFS to find all blocks that
@@ -285,8 +286,6 @@ def run(self) -> None:
285286
class OptimizerAArch64(Optimizer): # pylint: disable = too-few-public-methods
286287
"""aarch64-apple-darwin/aarch64-pc-windows-msvc/aarch64-unknown-linux-gnu"""
287288

288-
# TODO: @diegorusso
289-
_alignment = 8
290289
# https://developer.arm.com/documentation/ddi0602/2025-03/Base-Instructions/B--Branch-
291290
_re_jump = re.compile(r"\s*b\s+(?P<target>[\w.]+)")
292291

@@ -302,18 +301,3 @@ class OptimizerX86(Optimizer): # pylint: disable = too-few-public-methods
302301
_re_jump = re.compile(r"\s*jmp\s+(?P<target>[\w.]+)")
303302
# https://www.felixcloutier.com/x86/ret
304303
_re_return = re.compile(r"\s*ret\b")
305-
306-
307-
class OptimizerX8664Windows(OptimizerX86): # pylint: disable = too-few-public-methods
308-
"""x86_64-pc-windows-msvc"""
309-
310-
def _preprocess(self, text: str) -> str:
311-
text = super()._preprocess(text)
312-
# Before:
313-
# rex64 jmpq *__imp__JIT_CONTINUE(%rip)
314-
# After:
315-
# jmp _JIT_CONTINUE
316-
far_indirect_jump = (
317-
rf"rex64\s+jmpq\s+\*__imp_(?P<target>{self.prefix}_JIT_\w+)\(%rip\)"
318-
)
319-
return re.sub(far_indirect_jump, r"jmp\t\g<target>", text)

Tools/jit/_targets.py

Lines changed: 44 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,8 @@ class _Target(typing.Generic[_S, _R]):
4444
_: dataclasses.KW_ONLY
4545
args: typing.Sequence[str] = ()
4646
optimizer: type[_optimizers.Optimizer] = _optimizers.Optimizer
47-
prefix: str = ""
47+
label_prefix: typing.ClassVar[str]
48+
symbol_prefix: typing.ClassVar[str]
4849
stable: bool = False
4950
debug: bool = False
5051
verbose: bool = False
@@ -172,7 +173,9 @@ async def _compile(
172173
*shlex.split(self.cflags),
173174
]
174175
await _llvm.run("clang", args_s, echo=self.verbose)
175-
self.optimizer(s, prefix=self.prefix).run()
176+
self.optimizer(
177+
s, label_prefix=self.label_prefix, symbol_prefix=self.symbol_prefix
178+
).run()
176179
args_o = [f"--target={self.triple}", "-c", "-o", f"{o}", f"{s}"]
177180
await _llvm.run("clang", args_o, echo=self.verbose)
178181
return await self._parse(o)
@@ -274,7 +277,7 @@ def _handle_section(
274277
symbol = wrapped_symbol["Symbol"]
275278
offset = base + symbol["Value"]
276279
name = symbol["Name"]
277-
name = name.removeprefix(self.prefix)
280+
name = name.removeprefix(self.symbol_prefix)
278281
if name not in group.symbols:
279282
group.symbols[name] = value, offset
280283
for wrapped_relocation in section["Relocations"]:
@@ -285,9 +288,9 @@ def _handle_section(
285288
def _unwrap_dllimport(self, name: str) -> tuple[_stencils.HoleValue, str | None]:
286289
if name.startswith("__imp_"):
287290
name = name.removeprefix("__imp_")
288-
name = name.removeprefix(self.prefix)
291+
name = name.removeprefix(self.symbol_prefix)
289292
return _stencils.HoleValue.GOT, name
290-
name = name.removeprefix(self.prefix)
293+
name = name.removeprefix(self.symbol_prefix)
291294
return _stencils.symbol_to_value(name)
292295

293296
def _handle_relocation(
@@ -335,9 +338,24 @@ def _handle_relocation(
335338
return _stencils.Hole(offset, kind, value, symbol, addend)
336339

337340

341+
class _COFF32(_COFF):
    # These mangle like Mach-O and other "older" formats:
    # Prefix used to mangle local labels:
    label_prefix = "L"
    # Prefix used to mangle symbols:
    symbol_prefix = "_"
345+
346+
347+
class _COFF64(_COFF):
    # These mangle like ELF and other "newer" formats:
    # Prefix used to mangle local labels:
    label_prefix = ".L"
    # Prefix used to mangle symbols (none for this format):
    symbol_prefix = ""
351+
352+
338353
class _ELF(
339354
_Target[_schema.ELFSection, _schema.ELFRelocation]
340355
): # pylint: disable = too-few-public-methods
356+
label_prefix = ".L"
357+
symbol_prefix = ""
358+
341359
def _handle_section(
342360
self, section: _schema.ELFSection, group: _stencils.StencilGroup
343361
) -> None:
@@ -374,7 +392,7 @@ def _handle_section(
374392
symbol = wrapped_symbol["Symbol"]
375393
offset = len(stencil.body) + symbol["Value"]
376394
name = symbol["Name"]["Name"]
377-
name = name.removeprefix(self.prefix)
395+
name = name.removeprefix(self.symbol_prefix)
378396
group.symbols[name] = value, offset
379397
stencil.body.extend(section["SectionData"]["Bytes"])
380398
assert not section["Relocations"]
@@ -409,7 +427,7 @@ def _handle_relocation(
409427
},
410428
}:
411429
offset += base
412-
s = s.removeprefix(self.prefix)
430+
s = s.removeprefix(self.symbol_prefix)
413431
value, symbol = _stencils.HoleValue.GOT, s
414432
case {
415433
"Addend": addend,
@@ -418,7 +436,7 @@ def _handle_relocation(
418436
"Type": {"Name": kind},
419437
}:
420438
offset += base
421-
s = s.removeprefix(self.prefix)
439+
s = s.removeprefix(self.symbol_prefix)
422440
value, symbol = _stencils.symbol_to_value(s)
423441
case _:
424442
raise NotImplementedError(relocation)
@@ -428,17 +446,20 @@ def _handle_relocation(
428446
class _MachO(
429447
_Target[_schema.MachOSection, _schema.MachORelocation]
430448
): # pylint: disable = too-few-public-methods
449+
label_prefix = "L"
450+
symbol_prefix = "_"
451+
431452
def _handle_section(
432453
self, section: _schema.MachOSection, group: _stencils.StencilGroup
433454
) -> None:
434455
assert section["Address"] >= len(group.code.body)
435456
assert "SectionData" in section
436457
flags = {flag["Name"] for flag in section["Attributes"]["Flags"]}
437458
name = section["Name"]["Value"]
438-
name = name.removeprefix(self.prefix)
459+
name = name.removeprefix(self.symbol_prefix)
439460
if "Debug" in flags:
440461
return
441-
if "SomeInstructions" in flags:
462+
if "PureInstructions" in flags:
442463
value = _stencils.HoleValue.CODE
443464
stencil = group.code
444465
start_address = 0
@@ -459,7 +480,7 @@ def _handle_section(
459480
symbol = wrapped_symbol["Symbol"]
460481
offset = symbol["Value"] - start_address
461482
name = symbol["Name"]["Name"]
462-
name = name.removeprefix(self.prefix)
483+
name = name.removeprefix(self.symbol_prefix)
463484
group.symbols[name] = value, offset
464485
assert "Relocations" in section
465486
for wrapped_relocation in section["Relocations"]:
@@ -484,7 +505,7 @@ def _handle_relocation(
484505
},
485506
}:
486507
offset += base
487-
s = s.removeprefix(self.prefix)
508+
s = s.removeprefix(self.symbol_prefix)
488509
value, symbol = _stencils.HoleValue.GOT, s
489510
addend = 0
490511
case {
@@ -493,7 +514,7 @@ def _handle_relocation(
493514
"Type": {"Name": "X86_64_RELOC_GOT" | "X86_64_RELOC_GOT_LOAD" as kind},
494515
}:
495516
offset += base
496-
s = s.removeprefix(self.prefix)
517+
s = s.removeprefix(self.symbol_prefix)
497518
value, symbol = _stencils.HoleValue.GOT, s
498519
addend = (
499520
int.from_bytes(raw[offset : offset + 4], "little", signed=True) - 4
@@ -508,7 +529,7 @@ def _handle_relocation(
508529
"Type": {"Name": "X86_64_RELOC_BRANCH" | "X86_64_RELOC_SIGNED" as kind},
509530
}:
510531
offset += base
511-
s = s.removeprefix(self.prefix)
532+
s = s.removeprefix(self.symbol_prefix)
512533
value, symbol = _stencils.symbol_to_value(s)
513534
addend = (
514535
int.from_bytes(raw[offset : offset + 4], "little", signed=True) - 4
@@ -523,27 +544,27 @@ def _handle_relocation(
523544
"Type": {"Name": kind},
524545
}:
525546
offset += base
526-
s = s.removeprefix(self.prefix)
547+
s = s.removeprefix(self.symbol_prefix)
527548
value, symbol = _stencils.symbol_to_value(s)
528549
addend = 0
529550
case _:
530551
raise NotImplementedError(relocation)
531552
return _stencils.Hole(offset, kind, value, symbol, addend)
532553

533554

534-
def get_target(host: str) -> _COFF | _ELF | _MachO:
555+
def get_target(host: str) -> _COFF32 | _COFF64 | _ELF | _MachO:
535556
"""Build a _Target for the given host "triple" and options."""
536557
optimizer: type[_optimizers.Optimizer]
537-
target: _COFF | _ELF | _MachO
558+
target: _COFF32 | _COFF64 | _ELF | _MachO
538559
if re.fullmatch(r"aarch64-apple-darwin.*", host):
539560
condition = "defined(__aarch64__) && defined(__APPLE__)"
540561
optimizer = _optimizers.OptimizerAArch64
541-
target = _MachO(host, condition, optimizer=optimizer, prefix="_")
562+
target = _MachO(host, condition, optimizer=optimizer)
542563
elif re.fullmatch(r"aarch64-pc-windows-msvc", host):
543564
args = ["-fms-runtime-lib=dll", "-fplt"]
544565
condition = "defined(_M_ARM64)"
545566
optimizer = _optimizers.OptimizerAArch64
546-
target = _COFF(host, condition, args=args, optimizer=optimizer)
567+
target = _COFF64(host, condition, args=args, optimizer=optimizer)
547568
elif re.fullmatch(r"aarch64-.*-linux-gnu", host):
548569
# -mno-outline-atomics: Keep intrinsics from being emitted.
549570
args = ["-fpic", "-mno-outline-atomics"]
@@ -555,16 +576,16 @@ def get_target(host: str) -> _COFF | _ELF | _MachO:
555576
args = ["-DPy_NO_ENABLE_SHARED", "-Wno-ignored-attributes"]
556577
optimizer = _optimizers.OptimizerX86
557578
condition = "defined(_M_IX86)"
558-
target = _COFF(host, condition, args=args, optimizer=optimizer, prefix="_")
579+
target = _COFF32(host, condition, args=args, optimizer=optimizer)
559580
elif re.fullmatch(r"x86_64-apple-darwin.*", host):
560581
condition = "defined(__x86_64__) && defined(__APPLE__)"
561582
optimizer = _optimizers.OptimizerX86
562-
target = _MachO(host, condition, optimizer=optimizer, prefix="_")
583+
target = _MachO(host, condition, optimizer=optimizer)
563584
elif re.fullmatch(r"x86_64-pc-windows-msvc", host):
564585
args = ["-fms-runtime-lib=dll"]
565586
condition = "defined(_M_X64)"
566-
optimizer = _optimizers.OptimizerX8664Windows
567-
target = _COFF(host, condition, args=args, optimizer=optimizer)
587+
optimizer = _optimizers.OptimizerX86
588+
target = _COFF64(host, condition, args=args, optimizer=optimizer)
568589
elif re.fullmatch(r"x86_64-.*-linux-gnu", host):
569590
args = ["-fno-pic", "-mcmodel=medium", "-mlarge-data-threshold=0"]
570591
condition = "defined(__x86_64__) && defined(__linux__)"

Tools/jit/jit.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,7 @@ typedef jit_func __attribute__((preserve_none)) jit_func_preserve_none;
66
#define PATCH_VALUE(TYPE, NAME, ALIAS) \
77
PyAPI_DATA(void) ALIAS; \
88
TYPE NAME = (TYPE)(uintptr_t)&ALIAS;
9+
10+
// Declare NAME as a function taking (frame, stack_pointer, tstate) and
// returning _Py_CODEUNIT *, with the preserve_none calling convention and
// hidden visibility, so that it can then be called directly by name.
#define DECLARE_TARGET(NAME)                                            \
    _Py_CODEUNIT *__attribute__((preserve_none, visibility("hidden"))) \
    NAME(_PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate);

Tools/jit/shim.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,6 @@ _Py_CODEUNIT *
1010
_JIT_ENTRY(_PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate)
1111
{
1212
// Note that this is *not* a tail call:
13-
PATCH_VALUE(jit_func_preserve_none, call, _JIT_CONTINUE);
14-
return call(frame, stack_pointer, tstate);
13+
DECLARE_TARGET(_JIT_CONTINUE);
14+
return _JIT_CONTINUE(frame, stack_pointer, tstate);
1515
}

Tools/jit/template.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -74,10 +74,10 @@ do { \
7474
do { \
7575
} while (0)
7676

77-
// Tail-call the function named ALIAS with the current frame, stack_pointer
// and tstate. ALIAS is declared in place (see DECLARE_TARGET) and called
// directly by name; musttail requires the call to be emitted as a tail call.
#define PATCH_JUMP(ALIAS)                                                 \
do {                                                                      \
    DECLARE_TARGET(ALIAS);                                                \
    __attribute__((musttail)) return ALIAS(frame, stack_pointer, tstate); \
} while (0)
8282

8383
#undef JUMP_TO_JUMP_TARGET

0 commit comments

Comments
 (0)