From c13831e68a0c3660ca2a53200e0da9d6459385d1 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Fri, 30 Apr 2021 23:33:45 +0300 Subject: [PATCH 1/3] [interp] Replace multiplication and division by 1 with simple mov --- src/mono/mono/mini/interp/transform.c | 43 +++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index 7b7a3afa82c2e0..81b2256fa9882e 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -7984,7 +7984,7 @@ interp_fold_unop_cond_br (TransformData *td, InterpBasicBlock *cbb, LocalValue * static InterpInst* -interp_fold_binop (TransformData *td, LocalValue *local_defs, InterpInst *ins) +interp_fold_binop (TransformData *td, LocalValue *local_defs, InterpInst *ins, gboolean *folded) { int *local_ref_count = td->local_ref_count; // ins should be a binop, therefore it should have a single dreg and two sregs @@ -7995,6 +7995,8 @@ interp_fold_binop (TransformData *td, LocalValue *local_defs, InterpInst *ins) LocalValue *val2 = &local_defs [sreg2]; LocalValue result; + *folded = FALSE; + if (val1->type != LOCAL_VALUE_I4 && val1->type != LOCAL_VALUE_I8) return ins; if (val2->type != LOCAL_VALUE_I4 && val2->type != LOCAL_VALUE_I8) @@ -8066,7 +8068,7 @@ interp_fold_binop (TransformData *td, LocalValue *local_defs, InterpInst *ins) // with a LDC of the constant. We leave alone the sregs of this instruction, for // deadce to kill the instructions initializing them. 
mono_interp_stats.constant_folds++; - + *folded = TRUE; if (result.type == LOCAL_VALUE_I4) ins = interp_get_ldc_i4_from_const (td, ins, result.i, dreg); else if (result.type == LOCAL_VALUE_I8) @@ -8341,7 +8343,42 @@ interp_cprop (TransformData *td) } else if (MINT_IS_UNOP_CONDITIONAL_BRANCH (opcode)) { ins = interp_fold_unop_cond_br (td, bb, local_defs, ins); } else if (MINT_IS_BINOP (opcode)) { - ins = interp_fold_binop (td, local_defs, ins); + gboolean folded; + ins = interp_fold_binop (td, local_defs, ins, &folded); + if (!folded) { + int sreg = -1; + int mov_op; + if ((opcode == MINT_MUL_I4 || opcode == MINT_DIV_I4) && + local_defs [ins->sregs [1]].type == LOCAL_VALUE_I4 && + local_defs [ins->sregs [1]].i == 1) { + sreg = ins->sregs [0]; + mov_op = MINT_MOV_4; + } else if ((opcode == MINT_MUL_I8 || opcode == MINT_DIV_I8) && + local_defs [ins->sregs [1]].type == LOCAL_VALUE_I8 && + local_defs [ins->sregs [1]].l == 1) { + sreg = ins->sregs [0]; + mov_op = MINT_MOV_8; + } else if (opcode == MINT_MUL_I4 && + local_defs [ins->sregs [0]].type == LOCAL_VALUE_I4 && + local_defs [ins->sregs [0]].i == 1) { + sreg = ins->sregs [1]; + mov_op = MINT_MOV_4; + } else if (opcode == MINT_MUL_I8 && + local_defs [ins->sregs [0]].type == LOCAL_VALUE_I8 && + local_defs [ins->sregs [0]].l == 1) { + sreg = ins->sregs [1]; + mov_op = MINT_MOV_8; + } + if (sreg != -1) { + ins->opcode = mov_op; + ins->sregs [0] = sreg; + if (td->verbose_level) { + g_print ("Replace idempotent binop :\n\t"); + dump_interp_inst (ins); + } + needs_retry = TRUE; + } + } } else if (MINT_IS_BINOP_CONDITIONAL_BRANCH (opcode)) { ins = interp_fold_binop_cond_br (td, bb, local_defs, ins); } else if (MINT_IS_LDFLD (opcode) && ins->data [0] == 0) { From b6d7053c899fe5654607bfb20fbf487d8e691a8c Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Fri, 30 Apr 2021 23:48:39 +0300 Subject: [PATCH 2/3] [interp] Skip emitting redundant branch to next basic block --- src/mono/mono/mini/interp/transform.c | 4 ++++ 1 file 
changed, 4 insertions(+) diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index 81b2256fa9882e..61f7e420229db3 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -7549,6 +7549,9 @@ emit_compacted_instruction (TransformData *td, guint16* start_ip, InterpInst *in if (ins->info.target_bb->native_offset >= 0) { // Backwards branch. We can already patch it. *ip++ = ins->info.target_bb->native_offset - br_offset; + } else if (opcode == MINT_BR_S && ins->info.target_bb == td->cbb->next_bb) { + // Ignore branch to the next basic block. Revert the added MINT_BR_S. + ip--; } else { // We don't know the in_offset of the target, add a reloc Reloc *reloc = (Reloc*)mono_mempool_alloc0 (td->mempool, sizeof (Reloc)); @@ -7696,6 +7699,7 @@ generate_compacted_code (TransformData *td) for (bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { InterpInst *ins = bb->first_ins; bb->native_offset = ip - td->new_code; + td->cbb = bb; while (ins) { ip = emit_compacted_instruction (td, ip, ins); ins = ins->next; From 397c998039f1570135de6ba145f12cc7d98a631e Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Sat, 1 May 2021 01:40:51 +0300 Subject: [PATCH 3/3] [interp] Squash multiple call args moves into single opcode Some vars cannot be used directly as an argument to another call. In this case, the var offset allocator generates new intermediary vars. For methods with a lot of parameters, we can end up with quite a lot of these stores. 
As an example, for the following method: ``` public static void MethodPartial (int a, int b, object c, object d) { MethodFull (a, b, c, d, 12523); } ``` Before: ``` IR_0000: ldc.i8 [72 <- nil], 12523 IR_0006: mov.4 [40 <- 0], IR_0009: mov.4 [48 <- 8], IR_000c: mov.8 [56 <- 16], IR_000f: mov.8 [64 <- 24], IR_0012: call [32 <- 40], 0 IR_0016: ret.void [nil <- nil], ``` After: ``` IR_0000: ldc.i8 [72 <- nil], 12523 IR_0006: mov.8.4 [nil <- nil], 40 <- 0, 48 <- 8, 56 <- 16, 64 <- 24 IR_000f: call [32 <- 40], 0 IR_0013: ret.void [nil <- nil] ``` --- src/mono/mono/mini/interp/interp.c | 19 +++++++ src/mono/mono/mini/interp/mintops.def | 6 +++ src/mono/mono/mini/interp/mintops.h | 7 ++- src/mono/mono/mini/interp/transform.c | 74 +++++++++++++++++++++++---- 4 files changed, 94 insertions(+), 12 deletions(-) diff --git a/src/mono/mono/mini/interp/interp.c b/src/mono/mono/mini/interp/interp.c index 5fb08cc4f23d82..2b92e7f160e22e 100644 --- a/src/mono/mono/mini/interp/interp.c +++ b/src/mono/mono/mini/interp/interp.c @@ -6576,6 +6576,25 @@ MINT_IN_CASE(MINT_BRTRUE_I8_SP) ZEROP_SP(gint64, !=); MINT_IN_BREAK; MINT_IN_BREAK; } + MINT_IN_CASE(MINT_MOV_8_2) + LOCAL_VAR (ip [1], guint64) = LOCAL_VAR (ip [2], guint64); + LOCAL_VAR (ip [3], guint64) = LOCAL_VAR (ip [4], guint64); + ip += 5; + MINT_IN_BREAK; + MINT_IN_CASE(MINT_MOV_8_3) + LOCAL_VAR (ip [1], guint64) = LOCAL_VAR (ip [2], guint64); + LOCAL_VAR (ip [3], guint64) = LOCAL_VAR (ip [4], guint64); + LOCAL_VAR (ip [5], guint64) = LOCAL_VAR (ip [6], guint64); + ip += 7; + MINT_IN_BREAK; + MINT_IN_CASE(MINT_MOV_8_4) + LOCAL_VAR (ip [1], guint64) = LOCAL_VAR (ip [2], guint64); + LOCAL_VAR (ip [3], guint64) = LOCAL_VAR (ip [4], guint64); + LOCAL_VAR (ip [5], guint64) = LOCAL_VAR (ip [6], guint64); + LOCAL_VAR (ip [7], guint64) = LOCAL_VAR (ip [8], guint64); + ip += 9; + MINT_IN_BREAK; + MINT_IN_CASE(MINT_LOCALLOC) { int len = LOCAL_VAR (ip [2], gint32); gpointer mem = frame_data_allocator_alloc (&context->data_stack, frame, 
ALIGN_TO (len, MINT_VT_ALIGNMENT)); diff --git a/src/mono/mono/mini/interp/mintops.def b/src/mono/mono/mini/interp/mintops.def index a82cdb8623ecd8..1f55c92ec59000 100644 --- a/src/mono/mono/mini/interp/mintops.def +++ b/src/mono/mono/mini/interp/mintops.def @@ -108,6 +108,12 @@ OPDEF(MINT_MOV_4, "mov.4", 3, 1, 1, MintOpNoArgs) OPDEF(MINT_MOV_8, "mov.8", 3, 1, 1, MintOpNoArgs) OPDEF(MINT_MOV_VT, "mov.vt", 4, 1, 1, MintOpShortInt) +// These opcodes represent multiple moves stacked together. They have multiple src and dst +// but they are not represented here. They are generated by the var offset allocator. +OPDEF(MINT_MOV_8_2, "mov.8.2", 5, 0, 0, MintOpPair2) +OPDEF(MINT_MOV_8_3, "mov.8.3", 7, 0, 0, MintOpPair3) +OPDEF(MINT_MOV_8_4, "mov.8.4", 9, 0, 0, MintOpPair4) + OPDEF(MINT_LDLOCA_S, "ldloca.s", 3, 1, 0, MintOpUShortInt) OPDEF(MINT_LDIND_I1, "ldind.i1", 3, 1, 1, MintOpNoArgs) diff --git a/src/mono/mono/mini/interp/mintops.h b/src/mono/mono/mini/interp/mintops.h index ec5c95298c0467..82c78ac9243bfa 100644 --- a/src/mono/mono/mini/interp/mintops.h +++ b/src/mono/mono/mini/interp/mintops.h @@ -24,7 +24,10 @@ typedef enum MintOpClassToken, MintOpTwoShorts, MintOpShortAndInt, - MintOpShortAndShortBranch + MintOpShortAndShortBranch, + MintOpPair2, + MintOpPair3, + MintOpPair4 } MintOpArgType; #define OPDEF(a,b,c,d,e,f) a, @@ -74,6 +77,8 @@ typedef enum { #define MINT_CALL_ARGS 2 #define MINT_CALL_ARGS_SREG -2 +#define MINT_MOV_PAIRS_MAX 4 + extern unsigned char const mono_interp_oplen[]; extern int const mono_interp_op_dregs []; extern int const mono_interp_op_sregs []; diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index 61f7e420229db3..284004fbcc4155 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -1411,6 +1411,15 @@ dump_interp_ins_data (InterpInst *ins, gint32 ins_offset, const guint16 *data, g target = ins_offset + *(gint16*)(data + 1); g_string_append_printf (str, " %u, 
IR_%04x", *(guint16*)data, target); } + break; + case MintOpPair2: + g_string_append_printf (str, " %u <- %u, %u <- %u", data [0], data [1], data [2], data [3]); + break; + case MintOpPair3: + g_string_append_printf (str, " %u <- %u, %u <- %u, %u <- %u", data [0], data [1], data [2], data [3], data [4], data [5]); + break; + case MintOpPair4: + g_string_append_printf (str, " %u <- %u, %u <- %u, %u <- %u, %u <- %u", data [0], data [1], data [2], data [3], data [4], data [5], data [6], data [7]); break; default: g_string_append_printf (str, "unknown arg type\n"); @@ -7650,6 +7658,12 @@ emit_compacted_instruction (TransformData *td, guint16* start_ip, InterpInst *in for (int i = size - 1; i < (jit_call2_size - 1); i++) *ip++ = MINT_NIY; #endif + } else if (opcode >= MINT_MOV_8_2 && opcode <= MINT_MOV_8_4) { + // This instruction is not marked as operating on any vars, all instruction slots are + // actually vars. Resolve their offset + int num_vars = mono_interp_oplen [opcode] - 1; + for (int i = 0; i < num_vars; i++) + *ip++ = td->locals [ins->data [i]].offset; } else { if (mono_interp_op_dregs [opcode]) *ip++ = td->locals [ins->dreg].offset; @@ -9146,7 +9160,11 @@ interp_alloc_offsets (TransformData *td) if (ins->flags & INTERP_INST_FLAG_CALL) { int *call_args = ins->info.call_args; if (call_args) { + int pair_sregs [MINT_MOV_PAIRS_MAX]; + int pair_dregs [MINT_MOV_PAIRS_MAX]; + int num_pairs = 0; int var = *call_args; + while (var != -1) { if (td->locals [var].flags & INTERP_LOCAL_FLAG_GLOBAL || td->locals [var].flags & INTERP_LOCAL_FLAG_NO_CALL_ARGS) { @@ -9155,17 +9173,27 @@ interp_alloc_offsets (TransformData *td) int new_var = create_interp_local (td, td->locals [var].type); td->locals [new_var].call = ins; td->locals [new_var].flags |= INTERP_LOCAL_FLAG_CALL_ARGS; - int opcode = get_mov_for_type (mint_type (td->locals [var].type), FALSE); - InterpInst *new_inst = interp_insert_ins_bb (td, bb, ins->prev, opcode); - interp_ins_set_dreg (new_inst, new_var); - 
interp_ins_set_sreg (new_inst, var); - if (opcode == MINT_MOV_VT) - new_inst->data [0] = td->locals [var].size; - // The arg of the call is no longer global - *call_args = new_var; - // Also update liveness for this instruction - foreach_local_var (td, new_inst, ins_index, set_var_live_range); - ins_index++; + + int mt = mint_type (td->locals [var].type); + if (mt != MINT_TYPE_VT && num_pairs < MINT_MOV_PAIRS_MAX) { + pair_sregs [num_pairs] = var; + pair_dregs [num_pairs] = new_var; + num_pairs++; + // The arg of the call is no longer global + *call_args = new_var; + } else { + int opcode = get_mov_for_type (mt, FALSE); + InterpInst *new_inst = interp_insert_ins_bb (td, bb, ins->prev, opcode); + interp_ins_set_dreg (new_inst, new_var); + interp_ins_set_sreg (new_inst, var); + if (opcode == MINT_MOV_VT) + new_inst->data [0] = td->locals [var].size; + // The arg of the call is no longer global + *call_args = new_var; + // Also update liveness for this instruction + foreach_local_var (td, new_inst, ins_index, set_var_live_range); + ins_index++; + } } else { // Flag this var as it has special storage on the call args stack td->locals [var].call = ins; @@ -9174,6 +9202,30 @@ interp_alloc_offsets (TransformData *td) call_args++; var = *call_args; } + if (num_pairs > 0) { + int i; + for (i = 0; i < num_pairs; i++) { + set_var_live_range (td, pair_sregs [i], ins_index); + set_var_live_range (td, pair_dregs [i], ins_index); + } + if (num_pairs == 1) { + int mt = mint_type (td->locals [pair_sregs [0]].type); + int opcode = get_mov_for_type (mt, FALSE); + InterpInst *new_inst = interp_insert_ins_bb (td, bb, ins->prev, opcode); + interp_ins_set_dreg (new_inst, pair_dregs [0]); + interp_ins_set_sreg (new_inst, pair_sregs [0]); + } else { + // Squash together multiple moves to the param area into a single opcode + int opcode = MINT_MOV_8_2 + num_pairs - 2; + InterpInst *new_inst = interp_insert_ins_bb (td, bb, ins->prev, opcode); + int k = 0; + for (i = 0; i < num_pairs; i++) { 
+ new_inst->data [k++] = pair_dregs [i]; + new_inst->data [k++] = pair_sregs [i]; + } + } + ins_index++; + } } } // Set live_start and live_end for every referenced local that is not global