From db057d0bebd6c4c3510bd3158558b918f568a3dc Mon Sep 17 00:00:00 2001
From: Benedikt Meurer
Date: Nov 02 2013 15:19:20 +0000
Subject: [arm] Optimize integer division and modulus by constant.

git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@14259 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02
---

diff --git a/asmcomp/arm/emit.mlp b/asmcomp/arm/emit.mlp
index f580227..3a5fa76 100644
--- a/asmcomp/arm/emit.mlp
+++ b/asmcomp/arm/emit.mlp
@@ -597,33 +597,79 @@ let emit_instr i =
     | Lop(Iintop op) ->
         let instr = name_for_int_operation op in
         ` {emit_string instr} {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`; 1
-    | Lop(Iintop_imm(Idiv, n)) -> (* n is a power of 2 *)
+    | Lop(Iintop_imm(Idiv, n)) ->
         let l = Misc.log2 n in
-        let r = i.res.(0) in
-        ` movs {emit_reg r}, {emit_reg i.arg.(0)}\n`;
-        if n <= 256 then begin
-          ` it lt\n`;
-          ` addlt {emit_reg r}, {emit_reg r}, #{emit_int (n-1)}\n`
+        if n = 1 lsl l then begin
+          let r = i.res.(0) in
+          ` movs {emit_reg r}, {emit_reg i.arg.(0)}\n`;
+          if n <= 256 then begin
+            ` it lt\n`;
+            ` addlt {emit_reg r}, {emit_reg r}, #{emit_int (n-1)}\n`
+          end else begin
+            ` itt lt\n`;
+            ` addlt {emit_reg r}, {emit_reg r}, #{emit_int n}\n`;
+            ` sublt {emit_reg r}, {emit_reg r}, #1\n`
+          end;
+          (* Use movs to enable 16-bit T1 encoding *)
+          ` movs {emit_reg r}, {emit_reg r}, asr #{emit_int l}\n`; 5
         end else begin
-          ` itt lt\n`;
-          ` addlt {emit_reg r}, {emit_reg r}, #{emit_int n}\n`;
-          ` sublt {emit_reg r}, {emit_reg r}, #1\n`
-        end;
-        (* Use movs to enable 16-bit T1 encoding *)
-        ` movs {emit_reg r}, {emit_reg r}, asr #{emit_int l}\n`; 5
-    | Lop(Iintop_imm(Imod, n)) -> (* n is a power of 2 *)
+          assert (!arch >= ARMv6);
+          let (m, p) = Selectgen.divimm_parameters (Nativeint.of_int n) in
+          (* Algorithm:
+               t = multiply-high-signed(arg, m)
+               if m < 0, t = t + arg   (smmla accumulates the dividend)
+               t = shift-right-signed(t, p)
+               res = t + sign-bit(arg)
+          *)
+          let a = i.arg.(0) in
+          let r = i.res.(0) in
+          let ninstr = emit_intconst r (Nativeint.to_int32 m) in
+          if m >= 0n then
+            ` smmul {emit_reg r}, {emit_reg r}, {emit_reg a}\n`
+          else
+            ` smmla {emit_reg r}, {emit_reg r}, {emit_reg a}, {emit_reg a}\n`;
+          if p > 0 then
+            ` movs {emit_reg r}, {emit_reg r}, asr #{emit_int p}\n`;
+          ` add {emit_reg r}, {emit_reg r}, {emit_reg a}, lsr #31\n`;
+          ninstr + 3
+        end
+    | Lop(Iintop_imm(Imod, n)) ->
         let l = Misc.log2 n in
         let a = i.arg.(0) in
         let r = i.res.(0) in
-        let lbl = new_label() in
-        ` cmp {emit_reg a}, #0\n`;
-        ` mov {emit_reg r}, {emit_reg a}, lsl #{emit_int (32-l)}\n`;
-        ` mov {emit_reg r}, {emit_reg r}, lsr #{emit_int (32-l)}\n`;
-        ` bpl {emit_label lbl}\n`;
-        ` cmp {emit_reg r}, #0\n`;
-        ` it ne\n`;
-        ` subne {emit_reg r}, {emit_reg r}, #{emit_int n}\n`;
-        `{emit_label lbl}:\n`; 7
+        if n = 1 lsl l then begin
+          let lbl = new_label() in
+          ` cmp {emit_reg a}, #0\n`;
+          ` mov {emit_reg r}, {emit_reg a}, lsl #{emit_int (32-l)}\n`;
+          ` mov {emit_reg r}, {emit_reg r}, lsr #{emit_int (32-l)}\n`;
+          ` bpl {emit_label lbl}\n`;
+          ` cmp {emit_reg r}, #0\n`;
+          ` it ne\n`;
+          ` subne {emit_reg r}, {emit_reg r}, #{emit_int n}\n`;
+          `{emit_label lbl}:\n`; 7
+        end else begin
+          assert (!arch >= ARMv6); (* NOTE(review): mls below is ARMv6T2+ - confirm guard *)
+          let (m, p) = Selectgen.divimm_parameters (Nativeint.of_int n) in
+          (* Algorithm:
+               t = multiply-high-signed(arg, m)
+               if m < 0, t = t + arg   (smmla accumulates the dividend)
+               t = shift-right-signed(t, p)
+               t = (t + sign-bit(arg)) * n
+               res = arg - t
+          *)
+          let r12 = phys_reg 8 in
+          let ninstr = emit_intconst r (Nativeint.to_int32 m) in
+          if m >= 0n then
+            ` smmul {emit_reg r}, {emit_reg r}, {emit_reg a}\n`
+          else
+            ` smmla {emit_reg r}, {emit_reg r}, {emit_reg a}, {emit_reg a}\n`;
+          if p > 0 then
+            ` movs {emit_reg r}, {emit_reg r}, asr #{emit_int p}\n`;
+          ` add {emit_reg r}, {emit_reg r}, {emit_reg a}, lsr #31\n`;
+          let ninstr = ninstr + emit_intconst r12 (Int32.of_int n) in
+          ` mls {emit_reg r}, {emit_reg r}, r12, {emit_reg a}\n`;
+          ninstr + 4
+        end
     | Lop(Iintop_imm((Ilsl | Ilsr | Iasr as op), n)) ->
         let shift = name_for_shift_operation op in
         (* Use movs to enable 16-bit T1 encoding *)
diff --git a/asmcomp/arm/proc.ml b/asmcomp/arm/proc.ml
index dbb1317..2dd573f 100644
--- a/asmcomp/arm/proc.ml
+++ b/asmcomp/arm/proc.ml
@@ -201,6 +201,8 @@ let destroyed_at_oper = function
       destroyed_at_alloc
   | Iop(Iconst_symbol _) when !pic_code ->
       [| phys_reg 3; phys_reg 8 |]  (* r3 and r12 destroyed *)
+  | Iop(Iintop_imm(Imod, n)) when !arch >= ARMv6 && n <> 1 lsl Misc.log2 n ->
+      [| phys_reg 8 |]  (* r12 destroyed: holds n for mls when n is not a power of 2 *)
   | Iop(Iintoffloat | Ifloatofint | Iload(Single, _) | Istore(Single, _)) ->
       [| phys_reg 107 |]  (* d7 (s14-s15) destroyed *)
   | _ -> [||]
diff --git a/asmcomp/arm/selection.ml b/asmcomp/arm/selection.ml
index 97f615e..023202e 100644
--- a/asmcomp/arm/selection.ml
+++ b/asmcomp/arm/selection.ml
@@ -54,6 +54,13 @@ let pseudoregs_for_operation op arg res =
      is also a result of the mul / mla operation. *)
     Iintop Imul | Ispecific Imuladd when !arch < ARMv6 ->
       (arg, [| res.(0); arg.(0) |])
+  (* For integer division by a constant, which is not a power of 2, on ARMv6
+     and later, the result and argument registers must be different. We deal
+     with this by pretending that the argument value is also a result of the
+     operation. *)
+  | Iintop_imm((Idiv | Imod), n) when !arch >= ARMv6
+                                    && n <> 1 lsl Misc.log2 n ->
+      (arg, [| res.(0); arg.(0) |])
   (* Soft-float Iabsf and Inegf: arg.(0) and res.(0) must be the same *)
   | Iabsf | Inegf when !fpu = Soft ->
       ([|res.(0); arg.(1)|], res)
@@ -168,13 +175,13 @@ method! select_operation op args =
     | (Cmuli, args) ->
         (Iintop Imul, args)
     (* Turn integer division/modulus into runtime ABI calls *)
-    | (Cdivi, [arg; Cconst_int n])
-      when n = 1 lsl Misc.log2 n ->
+    | (Cdivi, [arg; Cconst_int n]) when n > 0 && (!arch >= ARMv6
+                                                  || n = 1 lsl Misc.log2 n) ->
         (Iintop_imm(Idiv, n), [arg])
     | (Cdivi, args) ->
         (Iextcall("__aeabi_idiv", false), args)
-    | (Cmodi, [arg; Cconst_int n])
-      when n > 1 && n = 1 lsl Misc.log2 n ->
+    | (Cmodi, [arg; Cconst_int n]) when n > 1 && (!arch >= ARMv6
+                                                  || n = 1 lsl Misc.log2 n) -> (* n > 1: n = 1 would emit lsl #32 *)
         (Iintop_imm(Imod, n), [arg])
     | (Cmodi, args) ->
         (* See above for fix up of return register *)