Skip to content

Commit e536195

Browse files
Merge branch 'main' into wasi_http
2 parents 9b2312b + 52e9053 commit e536195

File tree

16 files changed

+634
-101
lines changed

16 files changed

+634
-101
lines changed

cranelift/codegen/src/isa/aarch64/inst.isle

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -651,6 +651,16 @@
651651
(rm Reg)
652652
(size VectorSize))
653653

654+
;; A vector fused multiply-add/subtract by element, modifying a source register.
655+
(VecFmlaElem
656+
(alu_op VecALUModOp)
657+
(rd WritableReg)
658+
(ri Reg)
659+
(rn Reg)
660+
(rm Reg)
661+
(size VectorSize)
662+
(idx u8))
663+
654664
;; Vector two register miscellaneous instruction.
655665
(VecMisc
656666
(op VecMisc2)
@@ -1850,14 +1860,22 @@
18501860
(_ Unit (emit (MInst.FpuRR op size dst src))))
18511861
dst))
18521862

1853-
;; Helper for emitting `MInst.VecRRR` instructions which use three registers,
1863+
;; Helper for emitting `MInst.VecRRRMod` instructions which use three registers,
18541864
;; one of which is both source and output.
18551865
(decl vec_rrr_mod (VecALUModOp Reg Reg Reg VectorSize) Reg)
18561866
(rule (vec_rrr_mod op src1 src2 src3 size)
18571867
(let ((dst WritableReg (temp_writable_reg $I8X16))
18581868
(_1 Unit (emit (MInst.VecRRRMod op dst src1 src2 src3 size))))
18591869
dst))
18601870

1871+
;; Helper for emitting `MInst.VecFmlaElem` instructions which use three registers,
1872+
;; one of which is both source and output.
1873+
(decl vec_fmla_elem (VecALUModOp Reg Reg Reg VectorSize u8) Reg)
1874+
(rule (vec_fmla_elem op src1 src2 src3 size idx)
1875+
(let ((dst WritableReg (temp_writable_reg $I8X16))
1876+
(_1 Unit (emit (MInst.VecFmlaElem op dst src1 src2 src3 size idx))))
1877+
dst))
1878+
18611879
(decl fpu_rri (FPUOpRI Reg) Reg)
18621880
(rule (fpu_rri op src)
18631881
(let ((dst WritableReg (temp_writable_reg $F64))

cranelift/codegen/src/isa/aarch64/inst/emit.rs

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2914,6 +2914,45 @@ impl MachInstEmit for Inst {
29142914
};
29152915
sink.put4(enc_vec_rrr(top11 | q << 9, rm, bit15_10, rn, rd));
29162916
}
2917+
&Inst::VecFmlaElem {
2918+
rd,
2919+
ri,
2920+
rn,
2921+
rm,
2922+
alu_op,
2923+
size,
2924+
idx,
2925+
} => {
2926+
let rd = allocs.next_writable(rd);
2927+
let ri = allocs.next(ri);
2928+
debug_assert_eq!(rd.to_reg(), ri);
2929+
let rn = allocs.next(rn);
2930+
let rm = allocs.next(rm);
2931+
let idx = u32::from(idx);
2932+
2933+
let (q, _size) = size.enc_size();
2934+
let o2 = match alu_op {
2935+
VecALUModOp::Fmla => 0b0,
2936+
VecALUModOp::Fmls => 0b1,
2937+
_ => unreachable!(),
2938+
};
2939+
2940+
let (h, l) = match size {
2941+
VectorSize::Size32x4 => {
2942+
assert!(idx < 4);
2943+
(idx >> 1, idx & 1)
2944+
}
2945+
VectorSize::Size64x2 => {
2946+
assert!(idx < 2);
2947+
(idx, 0)
2948+
}
2949+
_ => unreachable!(),
2950+
};
2951+
2952+
let top11 = 0b000_011111_00 | (q << 9) | (size.enc_float_size() << 1) | l;
2953+
let bit15_10 = 0b000100 | (o2 << 4) | (h << 1);
2954+
sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
2955+
}
29172956
&Inst::VecLoadReplicate {
29182957
rd,
29192958
rn,

cranelift/codegen/src/isa/aarch64/inst/mod.rs

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -812,7 +812,7 @@ fn aarch64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan
812812
collector.reg_use(rn);
813813
collector.reg_use(rm);
814814
}
815-
&Inst::VecRRRMod { rd, ri, rn, rm, .. } => {
815+
&Inst::VecRRRMod { rd, ri, rn, rm, .. } | &Inst::VecFmlaElem { rd, ri, rn, rm, .. } => {
816816
collector.reg_reuse_def(rd, 1); // `rd` == `ri`.
817817
collector.reg_use(ri);
818818
collector.reg_use(rn);
@@ -2171,6 +2171,26 @@ impl Inst {
21712171
let rm = pretty_print_vreg_vector(rm, size, allocs);
21722172
format!("{} {}, {}, {}, {}", op, rd, ri, rn, rm)
21732173
}
2174+
&Inst::VecFmlaElem {
2175+
rd,
2176+
ri,
2177+
rn,
2178+
rm,
2179+
alu_op,
2180+
size,
2181+
idx,
2182+
} => {
2183+
let (op, size) = match alu_op {
2184+
VecALUModOp::Fmla => ("fmla", size),
2185+
VecALUModOp::Fmls => ("fmls", size),
2186+
_ => unreachable!(),
2187+
};
2188+
let rd = pretty_print_vreg_vector(rd.to_reg(), size, allocs);
2189+
let ri = pretty_print_vreg_vector(ri, size, allocs);
2190+
let rn = pretty_print_vreg_vector(rn, size, allocs);
2191+
let rm = pretty_print_vreg_element(rm, idx.into(), size.lane_size(), allocs);
2192+
format!("{} {}, {}, {}, {}", op, rd, ri, rn, rm)
2193+
}
21742194
&Inst::VecRRRLong {
21752195
rd,
21762196
rn,

cranelift/codegen/src/isa/aarch64/lower.isle

Lines changed: 56 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -513,17 +513,62 @@
513513

514514
;;;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
515515

516-
(rule (lower (has_type ty @ (multi_lane _ _) (fma x y z)))
517-
(vec_rrr_mod (VecALUModOp.Fmla) z x y (vector_size ty)))
518-
519-
(rule 1 (lower (has_type ty @ (multi_lane _ _) (fma (fneg x) y z)))
520-
(vec_rrr_mod (VecALUModOp.Fmls) z x y (vector_size ty)))
521-
522-
(rule 2 (lower (has_type ty @ (multi_lane _ _) (fma x (fneg y) z)))
523-
(vec_rrr_mod (VecALUModOp.Fmls) z x y (vector_size ty)))
524-
525-
(rule 3 (lower (has_type (ty_scalar_float ty) (fma x y z)))
526-
(fpu_rrrr (FPUOp3.MAdd) (scalar_size ty) x y z))
516+
(rule (lower (has_type (ty_scalar_float ty) (fma x y z)))
517+
(fpu_rrrr (FPUOp3.MAdd) (scalar_size ty) x y z))
518+
519+
;; Delegate vector-based lowerings to helpers below
520+
(rule 1 (lower (has_type ty @ (multi_lane _ _) (fma x y z)))
521+
(lower_fmla (VecALUModOp.Fmla) x y z (vector_size ty)))
522+
523+
;; Lowers a fused-multiply-add operation handling various forms of the
524+
;; instruction to get maximal coverage of what's available on AArch64.
525+
(decl lower_fmla (VecALUModOp Value Value Value VectorSize) Reg)
526+
527+
;; Base case, emit the op requested.
528+
(rule (lower_fmla op x y z size)
529+
(vec_rrr_mod op z x y size))
530+
531+
;; Special case: if one of the multiplicands is a splat then the element-based
532+
;; fma can be used instead with 0 as the element index.
533+
(rule 1 (lower_fmla op (splat x) y z size)
534+
(vec_fmla_elem op z y x size 0))
535+
(rule 2 (lower_fmla op x (splat y) z size)
536+
(vec_fmla_elem op z x y size 0))
537+
538+
;; Special case: if one of the multiplicands is a shuffle to broadcast a
539+
;; single element of a vector then the element-based fma can be used like splat
540+
;; above.
541+
;;
542+
;; Note that in Cranelift shuffle always has i8x16 inputs and outputs so
543+
;; a `bitcast` is matched here explicitly since that's the main way a shuffle
544+
;; output will be fed into this instruction.
545+
(rule 3 (lower_fmla op (bitcast _ (shuffle x x (shuffle32_from_imm n n n n))) y z size @ (VectorSize.Size32x4))
546+
(if-let $true (u64_lt n 4))
547+
(vec_fmla_elem op z y x size n))
548+
(rule 4 (lower_fmla op x (bitcast _ (shuffle y y (shuffle32_from_imm n n n n))) z size @ (VectorSize.Size32x4))
549+
(if-let $true (u64_lt n 4))
550+
(vec_fmla_elem op z x y size n))
551+
(rule 3 (lower_fmla op (bitcast _ (shuffle x x (shuffle64_from_imm n n))) y z size @ (VectorSize.Size64x2))
552+
(if-let $true (u64_lt n 2))
553+
(vec_fmla_elem op z y x size n))
554+
(rule 4 (lower_fmla op x (bitcast _ (shuffle y y (shuffle64_from_imm n n))) z size @ (VectorSize.Size64x2))
555+
(if-let $true (u64_lt n 2))
556+
(vec_fmla_elem op z x y size n))
557+
558+
;; Special case: if one of the multiplicands is `fneg` then peel that away,
559+
;; reverse the operation being performed, and then recurse on `lower_fmla`
560+
;; again to generate the actual instruction.
561+
;;
562+
;; Note that these are the highest priority cases for `lower_fmla` to peel
563+
;; away as many `fneg` operations as possible.
564+
(rule 5 (lower_fmla op (fneg x) y z size)
565+
(lower_fmla (neg_fmla op) x y z size))
566+
(rule 6 (lower_fmla op x (fneg y) z size)
567+
(lower_fmla (neg_fmla op) x y z size))
568+
569+
(decl neg_fmla (VecALUModOp) VecALUModOp)
570+
(rule (neg_fmla (VecALUModOp.Fmla)) (VecALUModOp.Fmls))
571+
(rule (neg_fmla (VecALUModOp.Fmls)) (VecALUModOp.Fmla))
527572

528573
;;;; Rules for `fcopysign` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
529574

cranelift/codegen/src/isa/riscv64/inst.isle

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -708,8 +708,6 @@
708708
(decl u8_as_i32 (u8) i32)
709709
(extern constructor u8_as_i32 u8_as_i32)
710710

711-
(convert u8 u64 u8_as_u64)
712-
713711
(decl convert_valueregs_reg (ValueRegs) Reg)
714712
(rule (convert_valueregs_reg x)
715713
(value_regs_get x 0))
@@ -1283,7 +1281,7 @@
12831281
(rule
12841282
(load_imm12 x)
12851283
(rv_addi (zero_reg) (imm12_const x)))
1286-
1284+
12871285
;; for load immediate
12881286
(decl imm_from_bits (u64) Imm12)
12891287
(extern constructor imm_from_bits imm_from_bits)
@@ -1509,7 +1507,7 @@
15091507
(_ Unit (emit (MInst.Cltz leading sum step tmp rs ty))))
15101508
sum))
15111509

1512-
1510+
15131511
;; Extends an integer if it is smaller than 64 bits.
15141512
(decl ext_int_if_need (bool ValueRegs Type) ValueRegs)
15151513
;;; For values smaller than 64 bits, we need to extend them to 64 bits
@@ -2117,7 +2115,7 @@
21172115
(reuslt VecWritableReg (vec_writable_clone dst))
21182116
(_ Unit (emit (MInst.Select dst ty c x y))))
21192117
(vec_writable_to_regs reuslt)))
2120-
2118+
21212119
;; Parameters are "intcc compare_a compare_b rs1 rs2".
21222120
(decl gen_select_reg (IntCC Reg Reg Reg Reg) Reg)
21232121
(extern constructor gen_select_reg gen_select_reg)

cranelift/codegen/src/prelude.isle

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@
8282

8383
(decl pure u8_as_u64 (u8) u64)
8484
(extern constructor u8_as_u64 u8_as_u64)
85+
(convert u8 u64 u8_as_u64)
8586

8687
(decl pure u16_as_u64 (u16) u64)
8788
(extern constructor u16_as_u64 u16_as_u64)

cranelift/filetests/filetests/isa/aarch64/fma.clif

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,3 +157,152 @@ block0(v0: f64x2, v1: f64x2, v2: f64x2):
157157
; fmls v0.2d, v5.2d, v1.2d
158158
; ret
159159

160+
function %f32x4_splat0(f32, f32x4, f32x4) -> f32x4 {
161+
block0(v0: f32, v1: f32x4, v2: f32x4):
162+
v3 = splat.f32x4 v0
163+
v4 = fma v3, v1, v2
164+
return v4
165+
}
166+
167+
; VCode:
168+
; block0:
169+
; mov v5.16b, v0.16b
170+
; mov v0.16b, v2.16b
171+
; fmla v0.4s, v0.4s, v1.4s, v5.s[0]
172+
; ret
173+
;
174+
; Disassembled:
175+
; block0: ; offset 0x0
176+
; mov v5.16b, v0.16b
177+
; mov v0.16b, v2.16b
178+
; fmla v0.4s, v1.4s, v5.s[0]
179+
; ret
180+
181+
function %f32x4_splat1(f32x4, f32, f32x4) -> f32x4 {
182+
block0(v0: f32x4, v1: f32, v2: f32x4):
183+
v3 = splat.f32x4 v1
184+
v4 = fneg v0
185+
v5 = fma v4, v3, v2
186+
return v5
187+
}
188+
189+
; VCode:
190+
; block0:
191+
; mov v5.16b, v0.16b
192+
; mov v0.16b, v2.16b
193+
; fmls v0.4s, v0.4s, v5.4s, v1.s[0]
194+
; ret
195+
;
196+
; Disassembled:
197+
; block0: ; offset 0x0
198+
; mov v5.16b, v0.16b
199+
; mov v0.16b, v2.16b
200+
; fmls v0.4s, v5.4s, v1.s[0]
201+
; ret
202+
203+
function %f32x4_splat2(f32x4, f32x4, f32x4) -> f32x4 {
204+
block0(v0: f32x4, v1: f32x4, v2: f32x4):
205+
v3 = bitcast.i8x16 little v0
206+
v4 = shuffle v3, v3, 0x07060504_07060504_07060504_07060504
207+
v5 = bitcast.f32x4 little v4
208+
v6 = fma v5, v1, v2
209+
return v6
210+
}
211+
212+
; VCode:
213+
; block0:
214+
; mov v5.16b, v0.16b
215+
; mov v0.16b, v2.16b
216+
; fmla v0.4s, v0.4s, v1.4s, v5.s[1]
217+
; ret
218+
;
219+
; Disassembled:
220+
; block0: ; offset 0x0
221+
; mov v5.16b, v0.16b
222+
; mov v0.16b, v2.16b
223+
; fmla v0.4s, v1.4s, v5.s[1]
224+
; ret
225+
226+
function %f32x4_splat3(f32x4, f32x4, f32x4) -> f32x4 {
227+
block0(v0: f32x4, v1: f32x4, v2: f32x4):
228+
v3 = bitcast.i8x16 little v1
229+
v4 = shuffle v3, v3, 0x0f0e0d0c_0f0e0d0c_0f0e0d0c_0f0e0d0c
230+
v5 = bitcast.f32x4 little v4
231+
v6 = fneg v5
232+
v7 = fma v0, v6, v2
233+
return v7
234+
}
235+
236+
; VCode:
237+
; block0:
238+
; mov v5.16b, v0.16b
239+
; mov v0.16b, v2.16b
240+
; fmls v0.4s, v0.4s, v5.4s, v1.s[3]
241+
; ret
242+
;
243+
; Disassembled:
244+
; block0: ; offset 0x0
245+
; mov v5.16b, v0.16b
246+
; mov v0.16b, v2.16b
247+
; fmls v0.4s, v5.4s, v1.s[3]
248+
; ret
249+
250+
function %f32x4_splat4(f32x4, f32x4, f32x4) -> f32x4 {
251+
block0(v0: f32x4, v1: f32x4, v2: f32x4):
252+
v3 = bitcast.i8x16 little v1
253+
v4 = shuffle v3, v3, 0x1f1e1d1c_1f1e1d1c_1f1e1d1c_1f1e1d1c
254+
v5 = bitcast.f32x4 little v4
255+
v6 = fma v0, v5, v2
256+
return v6
257+
}
258+
259+
; VCode:
260+
; block0:
261+
; mov v31.16b, v1.16b
262+
; movz w6, #7452
263+
; movk w6, w6, #7966, LSL #16
264+
; dup v17.4s, w6
265+
; mov v30.16b, v31.16b
266+
; tbl v19.16b, { v30.16b, v31.16b }, v17.16b
267+
; mov v23.16b, v0.16b
268+
; mov v0.16b, v2.16b
269+
; fmla v0.4s, v0.4s, v23.4s, v19.4s
270+
; ret
271+
;
272+
; Disassembled:
273+
; block0: ; offset 0x0
274+
; mov v31.16b, v1.16b
275+
; mov w6, #0x1d1c
276+
; movk w6, #0x1f1e, lsl #16
277+
; dup v17.4s, w6
278+
; mov v30.16b, v31.16b
279+
; tbl v19.16b, {v30.16b, v31.16b}, v17.16b
280+
; mov v23.16b, v0.16b
281+
; mov v0.16b, v2.16b
282+
; fmla v0.4s, v23.4s, v19.4s
283+
; ret
284+
285+
function %f64x2_splat0(f64x2, f64x2, f64x2) -> f64x2 {
286+
block0(v0: f64x2, v1: f64x2, v2: f64x2):
287+
v3 = bitcast.i8x16 little v1
288+
v4 = shuffle v3, v3, 0x0f0e0d0c0b0a0908_0f0e0d0c0b0a0908
289+
v5 = bitcast.f64x2 little v4
290+
v6 = fneg v5
291+
v7 = fma v0, v6, v2
292+
return v7
293+
}
294+
295+
; VCode:
296+
; block0:
297+
; mov v5.16b, v0.16b
298+
; mov v0.16b, v2.16b
299+
; fmls v0.2d, v0.2d, v5.2d, v1.d[1]
300+
; ret
301+
;
302+
; Disassembled:
303+
; block0: ; offset 0x0
304+
; mov v5.16b, v0.16b
305+
; mov v0.16b, v2.16b
306+
; fmls v0.2d, v5.2d, v1.d[1]
307+
; ret
308+

0 commit comments

Comments
 (0)