From 687114abe37832994041776933ccb639915a4e5f Mon Sep 17 00:00:00 2001 From: Xavier Leroy Date: Mon, 18 May 2026 13:33:53 +0200 Subject: [PATCH 1/3] AArch64: use a preindexed store pair in function prologues When applicable, this produces 2 instructions (instead of 4) for allocating the stack frame and saving the link and the return address. --- aarch64/Asm.v | 16 +++++++++++----- aarch64/Asmexpand.ml | 23 ++++++++++++++++++----- aarch64/Asmgen.v | 7 +++---- aarch64/Asmgenproof.v | 32 +++++++++----------------------- aarch64/TargetPrinter.ml | 3 ++- test | 2 +- 6 files changed, 44 insertions(+), 39 deletions(-) diff --git a/aarch64/Asm.v b/aarch64/Asm.v index 07e3edb4b1..e86af1de2f 100644 --- a/aarch64/Asm.v +++ b/aarch64/Asm.v @@ -136,7 +136,8 @@ Inductive addressing: Type := | ADsxt (base: iregsp) (r: ireg) (n: int) (**r base plus SIGN-EXT(reg) LSL n *) | ADuxt (base: iregsp) (r: ireg) (n: int) (**r base plus ZERO-EXT(reg) LSL n *) | ADadr (base: iregsp) (id: ident) (ofs: ptrofs) (**r base plus low address of [id + ofs] *) - | ADpostincr (base: iregsp) (n: int64). (**r base plus offset; base is updated after *) + | ADpostincr (base: iregsp) (n: int64) (**r base plus offset; base is updated after *) + | ADpreincr (base: iregsp) (n: int64). (**r base plus offset; base is updated before *) Inductive shift_op: Type := | SOnone @@ -292,7 +293,7 @@ Inductive instruction: Type := (** Floating-point conditional select *) | Pfsel (rd r1 r2: freg) (cond: testcond) (** Pseudo-instructions *) - | Pallocframe (sz: Z) (linkofs: ptrofs) (**r allocate new stack frame *) + | Pallocframe (sz: Z) (linkofs: ptrofs) (retaddrofs: ptrofs) (**r allocate new stack frame *) | Pfreeframe (sz: Z) (linkofs: ptrofs) (**r deallocate stack frame and restore previous frame *) | Plabel (lbl: label) (**r define a code label *) | Ploadsymbol (rd: ireg) (id: ident) (**r load the address of [id] *) @@ -542,6 +543,7 @@ Definition eval_addressing (a: addressing) (rs: regset): val := | ADuxt base r n => Val.addl rs#base (Val.shll (Val.longofintu rs#r) (Vint n)) | ADadr base id ofs => Val.addl rs#base (symbol_low ge id ofs) | ADpostincr base n => Vundef (* not modeled yet *) + | ADpreincr base n => Vundef (* not modeled yet *) end. (** Auxiliaries for memory accesses *) @@ -1063,12 +1065,16 @@ Definition exec_instr (f: function) (i: instruction) (rs: regset) (m: mem) : out end in Next (nextinstr (rs#rd <- v)) m (** Pseudo-instructions *) - | Pallocframe sz pos => + | Pallocframe sz linkpos retaddrpos => let (m1, stk) := Mem.alloc m 0 sz in let sp := (Vptr stk Ptrofs.zero) in - match Mem.storev Mint64 m1 (Val.offset_ptr sp pos) rs#SP with + match Mem.storev Mint64 m1 (Val.offset_ptr sp linkpos) rs#SP with | None => Stuck - | Some m2 => Next (nextinstr (rs #X15 <- (rs#SP) #SP <- sp #X16 <- Vundef)) m2 + | Some m2 => + match Mem.storev Mint64 m2 (Val.offset_ptr sp retaddrpos) rs#RA with + | None => Stuck + | Some m3 => Next (nextinstr (rs #X15 <- (rs#SP) #SP <- sp #X16 <- Vundef)) m3 + end end | Pfreeframe sz pos => match Mem.loadv Mint64 m (Val.offset_ptr rs#SP pos) with diff --git a/aarch64/Asmexpand.ml b/aarch64/Asmexpand.ml index fe8108f76d..7df7c21164 100644 --- a/aarch64/Asmexpand.ml +++ b/aarch64/Asmexpand.ml @@ -40,7 +40,7 @@ let expand_addimm64 (dst: iregsp) (src: iregsp) n = List.iter emit (Asmgen.addimm64 dst src n []) let expand_storeptr (src: ireg) (base: iregsp) ofs = - List.iter emit (Asmgen.storeptr src base ofs []) + List.iter emit (Asmgen.indexed_memory_access (fun a -> Pstrx(src, a)) (Z.of_uint 8) base ofs []) (* Handling of varargs *) @@ -490,7 +490,7 @@ module BRelax = Branch_relaxation (BInfo) let expand_instruction instr = match instr with - | Pallocframe (sz, ofs) -> + | Pallocframe (sz, linkofs, retaddrofs) -> emit (Pmov (RR1 X15, XSP)); if is_current_function_variadic() && Archi.abi = Archi.AAPCS64 then begin let (ir, fr, _) = @@ -501,9 +501,22 @@ let expand_instruction instr = end else begin current_function_stacksize := Z.to_int64 sz end; - expand_addimm64 XSP XSP (Ptrofs.repr (Z.neg sz)); - emit (Pcfi_adjust sz); - expand_storeptr X15 XSP ofs + let sz1 = Z.(sub sz linkofs) in + if retaddrofs = Z.(add linkofs (of_uint 8)) + && Z.(le sz1 (of_uint 512)) then begin + emit (Pstp(X15, X30, ADpreincr(XSP, Z.neg sz1))); + emit (Pcfi_adjust sz1); + let sz2 = Z.(sub sz sz1) in + if Z.(gt sz2 zero) then begin + expand_addimm64 XSP XSP (Ptrofs.repr (Z.neg sz2)); + emit (Pcfi_adjust sz2) + end + end else begin + expand_addimm64 XSP XSP (Ptrofs.repr (Z.neg sz)); + emit (Pcfi_adjust sz); + expand_storeptr X15 XSP linkofs; + expand_storeptr X30 XSP retaddrofs + end | Pfreeframe (sz, ofs) -> expand_addimm64 XSP XSP (coqint_of_camlint64 !current_function_stacksize) | Pcvtx2w rd -> diff --git a/aarch64/Asmgen.v b/aarch64/Asmgen.v index 332e849522..5cd76ee4dc 100644 --- a/aarch64/Asmgen.v +++ b/aarch64/Asmgen.v @@ -1144,16 +1144,15 @@ Definition transl_code' (f: Mach.function) (il: list Mach.instruction) (it1p: bo transl_code_rec f il it1p (fun c => OK c). (** Translation of a whole function. Note that we must check - that the generated code contains less than [2^32] instructions, + that the generated code contains less than [2^64] instructions, otherwise the offset part of the [PC] code pointer could wrap around, leading to incorrect executions. *) Definition transl_function (f: Mach.function) := do c <- transl_code' f f.(Mach.fn_code) true; OK (mkfunction f.(Mach.fn_sig) - (Pallocframe f.(fn_stacksize) f.(fn_link_ofs) :: - storeptr RA XSP f.(fn_retaddr_ofs) - (Pcfi_rel_offset (Ptrofs.to_int f.(fn_retaddr_ofs)):: c))). + (Pallocframe f.(fn_stacksize) f.(fn_link_ofs) f.(fn_retaddr_ofs) :: + Pcfi_rel_offset (Ptrofs.to_int f.(fn_retaddr_ofs)) :: c)). Definition transf_function (f: Mach.function) : res Asm.function := do tf <- transl_function f; diff --git a/aarch64/Asmgenproof.v b/aarch64/Asmgenproof.v index 207e8080dd..3f729e479c 100644 --- a/aarch64/Asmgenproof.v +++ b/aarch64/Asmgenproof.v @@ -396,7 +396,6 @@ Lemma transl_find_label: Proof. intros. monadInv H. destruct (zlt Ptrofs.max_unsigned (list_length_z x.(fn_code))); inv EQ0. monadInv EQ. simpl. rewrite transl_code'_transl_code in EQ0. - erewrite tail_nolabel_find_label by (apply storeptr_label). simpl. eapply transl_code_label; eauto. Qed. @@ -442,8 +441,6 @@ Proof. destruct (zlt Ptrofs.max_unsigned (list_length_z x.(fn_code))); inv EQ0. monadInv EQ. rewrite transl_code'_transl_code in EQ0. exists x; exists true; split; auto. unfold fn_code. - constructor. eapply is_tail_trans. - 2: apply (storeptr_label X30 XSP (fn_retaddr_ofs f0) (Pcfi_rel_offset (Ptrofs.to_int (fn_retaddr_ofs f0)) :: x)). repeat constructor. - exact transf_function_no_overflow. Qed. @@ -932,32 +929,23 @@ Local Transparent destroyed_by_op. change (chunk_of_type Tptr) with Mint64 in *. (* Execution of function prologue *) monadInv EQ0. rewrite transl_code'_transl_code in EQ1. - set (x1 := Pcfi_rel_offset (Ptrofs.to_int (fn_retaddr_ofs f)) :: x0) in *. - set (tfbody := Pallocframe (fn_stacksize f) (fn_link_ofs f) :: - storeptr RA XSP (fn_retaddr_ofs f) x1) in *. + set (tfbody := Pallocframe (fn_stacksize f) (fn_link_ofs f) (fn_retaddr_ofs f) :: + Pcfi_rel_offset (Ptrofs.to_int (fn_retaddr_ofs f)) :: x0) in *. set (tf := {| fn_sig := Mach.fn_sig f; fn_code := tfbody |}) in *. set (rs2 := nextinstr (rs0#X15 <- (parent_sp s) #SP <- sp #X16 <- Vundef)). - exploit (storeptr_correct tge tf XSP (fn_retaddr_ofs f) RA x1 m2' m3' rs2). - simpl preg_of_iregsp. change (rs2 X30) with (rs0 X30). rewrite ATLR. - change (rs2 X2) with sp. eexact P. - simpl; congruence. congruence. - intros (rs3 & U & V). - set (rs4 := nextinstr rs3). + set (rs3 := nextinstr rs2). assert (EXEC_PROLOGUE: exec_straight tge tf tf.(fn_code) rs0 m' - x0 rs4 m3'). + x0 rs3 m3'). { change (fn_code tf) with tfbody; unfold tfbody. - apply exec_straight_step with rs2 m2'. + apply exec_straight_two with rs2 m3'. unfold exec_instr. rewrite C. fold sp. - rewrite <- (sp_val _ _ _ AG). rewrite F. reflexivity. - reflexivity. - eapply exec_straight_trans with (rs2 := rs3) (m2 := m3'). - eexact U. eapply exec_straight_one; eauto. - } + rewrite <- (sp_val _ _ _ AG), ATLR. rewrite F, P. + reflexivity. reflexivity. reflexivity. reflexivity. } exploit exec_straight_steps_2; eauto using functions_transl. lia. constructor. intros (ofs' & X & Y). - left; exists (State rs4 m3'); split. + left; exists (State rs3 m3'); split. eapply exec_straight_steps_1; eauto. lia. constructor. econstructor; eauto. rewrite X; econstructor; eauto. @@ -969,9 +957,7 @@ Local Transparent destroyed_by_op. Local Transparent destroyed_at_function_entry. simpl. simpl; intros; Simpl. unfold sp; congruence. - intros. unfold rs4. rewrite <- V by eauto with asmgen; Simpl. - unfold rs4 in * ; intros. - rewrite nextinstr_inv; try apply V; eauto with asmgen. + intros. unfold rs3. Simpl. - (* external function *) exploit functions_translated; eauto. intros [tf [A B]]. simpl in B. inv B. diff --git a/aarch64/TargetPrinter.ml b/aarch64/TargetPrinter.ml index 198a75b7a4..899ba5f389 100644 --- a/aarch64/TargetPrinter.ml +++ b/aarch64/TargetPrinter.ml @@ -308,6 +308,7 @@ module Target(System: SYSTEM): TARGET = | ADuxt(base, r, n) -> fprintf oc "[%a, %a, uxtw #%a]" xregsp base wreg r coqint n | ADadr(base, id, ofs) -> fprintf oc "[%a, %a]" xregsp base symbol_offset_low (id, ofs) | ADpostincr(base, n) -> fprintf oc "[%a], #%a" xregsp base coqint64 n + | ADpreincr(base, n) -> fprintf oc "[%a, #%a]!" xregsp base coqint64 n (* Print a shifted operand *) let shiftop oc = function @@ -583,7 +584,7 @@ module Target(System: SYSTEM): TARGET = | Pnop -> fprintf oc " nop\n" (* Pseudo-instructions expanded in Asmexpand *) - | Pallocframe(sz, linkofs) -> assert false + | Pallocframe(sz, linkofs, retaddrofs) -> assert false | Pfreeframe(sz, linkofs) -> assert false | Pcvtx2w rd -> assert false (* Pseudo-instructions not yet expanded *) diff --git a/test b/test index 081175df37..00171a6b4f 160000 --- a/test +++ b/test @@ -1 +1 @@ -Subproject commit 081175df374260304a5cd78dd3dfaded93dceb41 +Subproject commit 00171a6b4ff307785729cc41c3ac2cdc416fafea From 4874bcf0767218802e9232984bbd68467e516e7b Mon Sep 17 00:00:00 2001 From: Xavier Leroy Date: Fri, 22 May 2026 16:22:15 +0200 Subject: [PATCH 2/3] AArch64: better use of store pair in function prologues Even when the stack frame is too big for a preindexed store pair, we can use a SP decrement followed by a store pair, saving one instruction. --- aarch64/Asmexpand.ml | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/aarch64/Asmexpand.ml b/aarch64/Asmexpand.ml index 7df7c21164..f0dbeac929 100644 --- a/aarch64/Asmexpand.ml +++ b/aarch64/Asmexpand.ml @@ -18,6 +18,7 @@ open Asmexpandaux open AST open Camlcoq module Ptrofs = Integers.Ptrofs +module I64 = Integers.Int64 exception Error of string @@ -39,9 +40,6 @@ let expand_loadimm32 (dst: ireg) n = let expand_addimm64 (dst: iregsp) (src: iregsp) n = List.iter emit (Asmgen.addimm64 dst src n []) -let expand_storeptr (src: ireg) (base: iregsp) ofs = - List.iter emit (Asmgen.indexed_memory_access (fun a -> Pstrx(src, a)) (Z.of_uint 8) base ofs []) - (* Handling of varargs *) (* Determine the number of int registers, FP registers, and stack locations @@ -501,21 +499,20 @@ let expand_instruction instr = end else begin current_function_stacksize := Z.to_int64 sz end; + assert (retaddrofs = Z.(add linkofs (of_uint 8))); let sz1 = Z.(sub sz linkofs) in - if retaddrofs = Z.(add linkofs (of_uint 8)) - && Z.(le sz1 (of_uint 512)) then begin - emit (Pstp(X15, X30, ADpreincr(XSP, Z.neg sz1))); - emit (Pcfi_adjust sz1); - let sz2 = Z.(sub sz sz1) in - if Z.(gt sz2 zero) then begin - expand_addimm64 XSP XSP (Ptrofs.repr (Z.neg sz2)); - emit (Pcfi_adjust sz2) - end + if Z.(le sz1 (of_uint 512)) then begin + emit (Pstp(X15, X30, ADpreincr(XSP, I64.repr (Z.neg sz1)))); + emit (Pcfi_adjust sz1) end else begin - expand_addimm64 XSP XSP (Ptrofs.repr (Z.neg sz)); - emit (Pcfi_adjust sz); - expand_storeptr X15 XSP linkofs; - expand_storeptr X30 XSP retaddrofs + expand_addimm64 XSP XSP (Ptrofs.repr (Z.neg sz1)); + emit (Pcfi_adjust sz1); + emit (Pstp(X15, X30, ADimm(XSP, I64.zero))) + end; + let sz2 = Z.(sub sz sz1) in + if Z.(gt sz2 zero) then begin + expand_addimm64 XSP XSP (Ptrofs.repr (Z.neg sz2)); + emit (Pcfi_adjust sz2) end | Pfreeframe (sz, ofs) -> expand_addimm64 XSP XSP (coqint_of_camlint64 !current_function_stacksize) From ade8dc3bcaa1d347fc91dfa34feba61b9de42860 Mon Sep 17 00:00:00 2001 From: Xavier Leroy Date: Fri, 22 May 2026 17:03:40 +0200 Subject: [PATCH 3/3] Un-update the test suite (too early) --- test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test b/test index 00171a6b4f..081175df37 160000 --- a/test +++ b/test @@ -1 +1 @@ -Subproject commit 00171a6b4ff307785729cc41c3ac2cdc416fafea +Subproject commit 081175df374260304a5cd78dd3dfaded93dceb41