Skip to content

Commit

Permalink
Add support for AVX10.2, Add AVX10.2 API surface and template tests (#…
Browse files Browse the repository at this point in the history
…111209)

* Add support for AVX10.2. Add AVX10.2 API surface and template tests. Lower Avx10.2 nodes accordingly.

* Add support and template tests for AVX10v2_V512

* Add new coredistools.dll build from latest llvm repo

* Limit JIT unit suite within the subsets which are stable in SDE.

* Rename API as per latest API proposal discussions

* fix sample tests in handwritten project

* Revert "Limit JIT unit suite within the subsets which are stable in SDE."

This reverts commit 067e31e.

* Limit JIT unit suite within the subsets which are stable in SDE.

* Allow a prefix of 0x00 for AVX10.2 instructions.

* Revert "Limit JIT unit suite within the subsets which are stable in SDE."

This reverts commit 067e31e.

* Limit JIT unit suite within the subsets which are stable in SDE.

* remove developer comments from files

* Enable all template tests and enable ymm embedded rounding

* Make emitter independent of ISa and based on insOpts for ymm embedded rounding

* Enable ymm embedded rounding based on architecture

* Revert "Make emitter independent of ISa and based on insOpts for ymm embedded rounding"

This reverts commit 493572f.

* Separate Avx10.2 unit testing framework from APX framework

* Revert "Limit JIT unit suite within the subsets which are stable in SDE."

This reverts commit 067e31e.

* Revert "Add new coredistools.dll build from latest llvm repo"

This reverts commit 61719f8.

* Fix formatting

* Use new keyword for class V512 to hide Avx10v1.V512 and correct CI errors

* Remove MinMax APIs from lowering for numargs=2

* Add docstrings for APIs

* revert changes for sde execution of tests

* Add appropriate comments from reviews

* Apply suggestions from code review

Co-authored-by: Bruce Forstall <[email protected]>

* Add emitter tests for XMM9/16 to make sure special handling does not interfere.

* Format code

* Handle sizePrefix = 0 case when decoding evex instruction

* Add assert in appropriate places

* Club similar instructions together in perf calculation in emitxarch

* Run formatting

* Add assembly prints for debug assembly capturing for Avx10.2

* Use correct size when running emitter tests

* Add appropriate comments and make review changes

* Apply suggestions from code review

Co-authored-by: Bruce Forstall <[email protected]>

---------

Co-authored-by: Ruihan-Yin <[email protected]>
Co-authored-by: Bruce Forstall <[email protected]>
  • Loading branch information
3 people authored Jan 24, 2025
1 parent 6b55713 commit 03b2d3d
Show file tree
Hide file tree
Showing 37 changed files with 2,092 additions and 166 deletions.
1 change: 1 addition & 0 deletions src/coreclr/jit/codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -650,6 +650,7 @@ class CodeGen final : public CodeGenInterface
#if defined(TARGET_AMD64)
void genAmd64EmitterUnitTestsSse2();
void genAmd64EmitterUnitTestsApx();
void genAmd64EmitterUnitTestsAvx10v2();
#endif

#endif // defined(DEBUG)
Expand Down
34 changes: 28 additions & 6 deletions src/coreclr/jit/codegencommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1832,15 +1832,26 @@ void CodeGen::genGenerateMachineCode()
#if defined(TARGET_X86)
if (compiler->canUseEvexEncoding())
{
if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX10v1))
if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX10v2))
{
if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX10v2_V512))
{
printf("X86 with AVX10.2/512");
}
else
{
printf("X86 with AVX10.2/256");
}
}
else if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX10v1))
{
if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX10v1_V512))
{
printf("X86 with AVX10/512");
printf("X86 with AVX10.1/512");
}
else
{
printf("X86 with AVX10/256");
printf("X86 with AVX10.1/256");
}
}
else
Expand All @@ -1860,15 +1871,26 @@ void CodeGen::genGenerateMachineCode()
#elif defined(TARGET_AMD64)
if (compiler->canUseEvexEncoding())
{
if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX10v1))
if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX10v2))
{
if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX10v2_V512))
{
printf("X64 with AVX10.2/512");
}
else
{
printf("X64 with AVX10.2/256");
}
}
else if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX10v1))
{
if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX10v1_V512))
{
printf("X64 with AVX10/512");
printf("X64 with AVX10.1/512");
}
else
{
printf("X64 with AVX10/256");
printf("X64 with AVX10.1/256");
}
}
else
Expand Down
4 changes: 4 additions & 0 deletions src/coreclr/jit/codegenlinear.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2712,6 +2712,10 @@ void CodeGen::genEmitterUnitTests()
{
genAmd64EmitterUnitTestsApx();
}
if (unitTestSectionAll || (strstr(unitTestSection, "avx10v2") != nullptr))
{
genAmd64EmitterUnitTestsAvx10v2();
}

#elif defined(TARGET_ARM64)
if (unitTestSectionAll || (strstr(unitTestSection, "general") != nullptr))
Expand Down
153 changes: 153 additions & 0 deletions src/coreclr/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9239,6 +9239,159 @@ void CodeGen::genAmd64EmitterUnitTestsApx()
theEmitter->emitIns_S(INS_not, EA_2BYTE, 0, 0);
}

void CodeGen::genAmd64EmitterUnitTestsAvx10v2()
{
// All the Avx10.2 instructions are evex and evex only has one size.
// Also, there is no specialized handling for XMM0 vs XMM9 vs XMM16

emitter* theEmitter = GetEmitter();

genDefineTempLabel(genCreateTempLabel());

// This test suite needs AVX10.2 enabled.
if (!theEmitter->emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v2))
{
return;
}

// packed conversion instructions
theEmitter->emitIns_R_R(INS_vcvttps2dqs, EA_16BYTE, REG_XMM0, REG_XMM1); // xmm
theEmitter->emitIns_R_R(INS_vcvttps2dqs, EA_16BYTE, REG_XMM9, REG_XMM10); // xmm
theEmitter->emitIns_R_R(INS_vcvttps2dqs, EA_16BYTE, REG_XMM15, REG_XMM16); // xmm
theEmitter->emitIns_R_R(INS_vcvttps2dqs, EA_32BYTE, REG_XMM0, REG_XMM1); // ymm
theEmitter->emitIns_R_R(INS_vcvttps2dqs, EA_64BYTE, REG_XMM0, REG_XMM1); // zmm

theEmitter->emitIns_R_R(INS_vcvttps2udqs, EA_16BYTE, REG_XMM0, REG_XMM1); // xmm
theEmitter->emitIns_R_R(INS_vcvttps2udqs, EA_16BYTE, REG_XMM9, REG_XMM10); // xmm
theEmitter->emitIns_R_R(INS_vcvttps2udqs, EA_16BYTE, REG_XMM15, REG_XMM16); // xmm
theEmitter->emitIns_R_R(INS_vcvttps2udqs, EA_32BYTE, REG_XMM0, REG_XMM1); // ymm
theEmitter->emitIns_R_R(INS_vcvttps2udqs, EA_64BYTE, REG_XMM0, REG_XMM1); // zmm

theEmitter->emitIns_R_R(INS_vcvttpd2qqs, EA_16BYTE, REG_XMM0, REG_XMM1); // xmm
theEmitter->emitIns_R_R(INS_vcvttpd2qqs, EA_16BYTE, REG_XMM9, REG_XMM10); // xmm
theEmitter->emitIns_R_R(INS_vcvttpd2qqs, EA_16BYTE, REG_XMM15, REG_XMM16); // xmm
theEmitter->emitIns_R_R(INS_vcvttpd2qqs, EA_32BYTE, REG_XMM0, REG_XMM1); // ymm
theEmitter->emitIns_R_R(INS_vcvttpd2qqs, EA_64BYTE, REG_XMM0, REG_XMM1); // zmm

theEmitter->emitIns_R_R(INS_vcvttpd2uqqs, EA_16BYTE, REG_XMM0, REG_XMM1); // xmm
theEmitter->emitIns_R_R(INS_vcvttpd2uqqs, EA_16BYTE, REG_XMM9, REG_XMM10); // xmm
theEmitter->emitIns_R_R(INS_vcvttpd2uqqs, EA_16BYTE, REG_XMM15, REG_XMM16); // xmm
theEmitter->emitIns_R_R(INS_vcvttpd2uqqs, EA_32BYTE, REG_XMM0, REG_XMM1); // ymm
theEmitter->emitIns_R_R(INS_vcvttpd2uqqs, EA_64BYTE, REG_XMM0, REG_XMM1); // zmm

// scalar conversion instructions
theEmitter->emitIns_R_R(INS_vcvttsd2sis32, EA_4BYTE, REG_EAX, REG_XMM0);
theEmitter->emitIns_R_R(INS_vcvttsd2sis64, EA_8BYTE, REG_RAX, REG_XMM0);
theEmitter->emitIns_R_R(INS_vcvttsd2usis32, EA_4BYTE, REG_EAX, REG_XMM0);
theEmitter->emitIns_R_R(INS_vcvttsd2usis64, EA_8BYTE, REG_RAX, REG_XMM0);
theEmitter->emitIns_R_R(INS_vcvttss2sis32, EA_4BYTE, REG_EAX, REG_XMM0);
theEmitter->emitIns_R_R(INS_vcvttss2sis64, EA_8BYTE, REG_RAX, REG_XMM0);
theEmitter->emitIns_R_R(INS_vcvttss2usis32, EA_4BYTE, REG_EAX, REG_XMM0);
theEmitter->emitIns_R_R(INS_vcvttss2usis64, EA_8BYTE, REG_RAX, REG_XMM0);

// minmax instruction
theEmitter->emitIns_R_R_R_I(INS_vminmaxss, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2, 0);
theEmitter->emitIns_R_R_R_I(INS_vminmaxss, EA_16BYTE, REG_XMM8, REG_XMM9, REG_XMM10, 0);
theEmitter->emitIns_R_R_R_I(INS_vminmaxss, EA_16BYTE, REG_XMM14, REG_XMM15, REG_XMM16, 0);

theEmitter->emitIns_R_R_R_I(INS_vminmaxsd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2, 0);
theEmitter->emitIns_R_R_R_I(INS_vminmaxsd, EA_16BYTE, REG_XMM9, REG_XMM10, REG_XMM11, 0);
theEmitter->emitIns_R_R_R_I(INS_vminmaxsd, EA_16BYTE, REG_XMM16, REG_XMM17, REG_XMM18, 0);

theEmitter->emitIns_R_R_R_I(INS_vminmaxps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2, 0);
theEmitter->emitIns_R_R_R_I(INS_vminmaxpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2, 0);
theEmitter->emitIns_R_R_R_I(INS_vminmaxps, EA_64BYTE, REG_XMM0, REG_XMM1, REG_XMM2, 0);
theEmitter->emitIns_R_R_R_I(INS_vminmaxpd, EA_64BYTE, REG_XMM0, REG_XMM1, REG_XMM2, 0);

// VCVT[,T]PS2I[,U]BS
theEmitter->emitIns_R_R(INS_vcvtps2ibs, EA_16BYTE, REG_XMM0, REG_XMM1);
theEmitter->emitIns_R_R(INS_vcvtps2ibs, EA_32BYTE, REG_XMM0, REG_XMM1);
theEmitter->emitIns_R_R(INS_vcvtps2ibs, EA_32BYTE, REG_XMM0, REG_XMM1, INS_OPTS_EVEX_er_ru);
theEmitter->emitIns_R_R(INS_vcvtps2ibs, EA_64BYTE, REG_XMM0, REG_XMM1);
theEmitter->emitIns_R_R(INS_vcvtps2ibs, EA_64BYTE, REG_XMM0, REG_XMM1, INS_OPTS_EVEX_er_ru);

theEmitter->emitIns_R_R(INS_vcvtps2iubs, EA_16BYTE, REG_XMM0, REG_XMM1);
theEmitter->emitIns_R_R(INS_vcvtps2iubs, EA_32BYTE, REG_XMM0, REG_XMM1);
theEmitter->emitIns_R_R(INS_vcvtps2iubs, EA_32BYTE, REG_XMM0, REG_XMM1, INS_OPTS_EVEX_er_rz);
theEmitter->emitIns_R_R(INS_vcvtps2iubs, EA_64BYTE, REG_XMM0, REG_XMM1);
theEmitter->emitIns_R_R(INS_vcvtps2iubs, EA_64BYTE, REG_XMM0, REG_XMM1, INS_OPTS_EVEX_er_rz);

theEmitter->emitIns_R_R(INS_vcvttps2ibs, EA_16BYTE, REG_XMM0, REG_XMM1);
theEmitter->emitIns_R_R(INS_vcvttps2ibs, EA_32BYTE, REG_XMM0, REG_XMM1);
theEmitter->emitIns_R_R(INS_vcvttps2ibs, EA_32BYTE, REG_XMM0, REG_XMM1, INS_OPTS_EVEX_eb_er_rd);
theEmitter->emitIns_R_R(INS_vcvttps2ibs, EA_64BYTE, REG_XMM0, REG_XMM1);

theEmitter->emitIns_R_R(INS_vcvttps2iubs, EA_16BYTE, REG_XMM0, REG_XMM1);
theEmitter->emitIns_R_R(INS_vcvttps2iubs, EA_32BYTE, REG_XMM0, REG_XMM1);
theEmitter->emitIns_R_R(INS_vcvttps2iubs, EA_32BYTE, REG_XMM0, REG_XMM1, INS_OPTS_EVEX_er_ru);
theEmitter->emitIns_R_R(INS_vcvttps2iubs, EA_64BYTE, REG_XMM0, REG_XMM1);

// VPDPW[SU,US,UU]D[,S]
theEmitter->emitIns_R_R_R(INS_vpdpwsud, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpwsud, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpwsud, EA_64BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpwsuds, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpwsuds, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpwsuds, EA_64BYTE, REG_XMM0, REG_XMM1, REG_XMM2);

theEmitter->emitIns_R_R_R(INS_vpdpwusd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpwusd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpwusd, EA_64BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpwusds, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpwusds, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpwusds, EA_64BYTE, REG_XMM0, REG_XMM1, REG_XMM2);

theEmitter->emitIns_R_R_R(INS_vpdpwuud, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpwuud, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpwuud, EA_64BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpwuuds, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpwuuds, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpwuuds, EA_64BYTE, REG_XMM0, REG_XMM1, REG_XMM2);

// VPDPB[SU,UU,SS]D[,S]
theEmitter->emitIns_R_R_R(INS_vpdpbssd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpbssd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpbssd, EA_64BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpbssds, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpbssds, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpbssds, EA_64BYTE, REG_XMM0, REG_XMM1, REG_XMM2);

theEmitter->emitIns_R_R_R(INS_vpdpbsud, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpbsud, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpbsud, EA_64BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpbsuds, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpbsuds, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpbsuds, EA_64BYTE, REG_XMM0, REG_XMM1, REG_XMM2);

theEmitter->emitIns_R_R_R(INS_vpdpbuud, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpbuud, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpbuud, EA_64BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpbuuds, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpbuuds, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
theEmitter->emitIns_R_R_R(INS_vpdpbuuds, EA_64BYTE, REG_XMM0, REG_XMM1, REG_XMM2);

// VMPSADBW
theEmitter->emitIns_R_R_R_I(INS_vmpsadbw, EA_64BYTE, REG_XMM0, REG_XMM1, REG_XMM2, 0); // zmm

// VCOMXSD
theEmitter->emitIns_R_R(INS_vcomxsd, EA_16BYTE, REG_XMM0, REG_XMM1);

// VCOMXSS
theEmitter->emitIns_R_R(INS_vcomxss, EA_16BYTE, REG_XMM0, REG_XMM1);

// VUCOMXSD
theEmitter->emitIns_R_R(INS_vucomxsd, EA_16BYTE, REG_XMM0, REG_XMM1);

// VUCOMXSS
theEmitter->emitIns_R_R(INS_vucomxss, EA_16BYTE, REG_XMM0, REG_XMM1);

// VMOVD
theEmitter->emitIns_R_R(INS_vmovd, EA_16BYTE, REG_XMM0, REG_XMM1);

// VMOVW
theEmitter->emitIns_R_R(INS_vmovw, EA_16BYTE, REG_XMM0, REG_XMM1);
}

#endif // defined(DEBUG) && defined(TARGET_AMD64)

#ifdef PROFILING_SUPPORTED
Expand Down
20 changes: 10 additions & 10 deletions src/coreclr/jit/emit.h
Original file line number Diff line number Diff line change
Expand Up @@ -1607,36 +1607,36 @@ class emitter

// Returns true when _idBound is non-zero.
// Asserts that _idIns is not classified as a SIMD instruction.
// NOTE(review): the two back-to-back asserts look like the removed/added lines of this
// diff hunk (IsAvx512OrPriorInstruction -> IsSimdInstruction) — confirm which one is live.
bool idIsBound() const
{
assert(!IsAvx512OrPriorInstruction(_idIns));
assert(!IsSimdInstruction(_idIns));
return _idBound != 0;
}
// Sets _idBound to 1.
// Asserts that _idIns is not classified as a SIMD instruction.
// NOTE(review): the two back-to-back asserts look like the removed/added lines of this
// diff hunk (IsAvx512OrPriorInstruction -> IsSimdInstruction) — confirm which one is live.
void idSetIsBound()
{
assert(!IsAvx512OrPriorInstruction(_idIns));
assert(!IsSimdInstruction(_idIns));
_idBound = 1;
}

#ifndef TARGET_ARMARCH
// Returns true when _idCallRegPtr is non-zero.
// Asserts that _idIns is not classified as a SIMD instruction.
// NOTE(review): the two back-to-back asserts look like the removed/added lines of this
// diff hunk (IsAvx512OrPriorInstruction -> IsSimdInstruction) — confirm which one is live.
bool idIsCallRegPtr() const
{
assert(!IsAvx512OrPriorInstruction(_idIns));
assert(!IsSimdInstruction(_idIns));
return _idCallRegPtr != 0;
}
// Sets _idCallRegPtr to 1.
// Asserts that _idIns is not classified as a SIMD instruction.
// NOTE(review): the two back-to-back asserts look like the removed/added lines of this
// diff hunk (IsAvx512OrPriorInstruction -> IsSimdInstruction) — confirm which one is live.
void idSetIsCallRegPtr()
{
assert(!IsAvx512OrPriorInstruction(_idIns));
assert(!IsSimdInstruction(_idIns));
_idCallRegPtr = 1;
}
#endif // !TARGET_ARMARCH

// Returns true when _idTlsGD is non-zero.
// Asserts that _idIns is not classified as a SIMD instruction.
// NOTE(review): the two back-to-back asserts look like the removed/added lines of this
// diff hunk (IsAvx512OrPriorInstruction -> IsSimdInstruction) — confirm which one is live.
bool idIsTlsGD() const
{
assert(!IsAvx512OrPriorInstruction(_idIns));
assert(!IsSimdInstruction(_idIns));
return _idTlsGD != 0;
}
// Sets _idTlsGD to 1.
// Asserts that _idIns is not classified as a SIMD instruction.
// NOTE(review): the two back-to-back asserts look like the removed/added lines of this
// diff hunk (IsAvx512OrPriorInstruction -> IsSimdInstruction) — confirm which one is live.
void idSetTlsGD()
{
assert(!IsAvx512OrPriorInstruction(_idIns));
assert(!IsSimdInstruction(_idIns));
_idTlsGD = 1;
}

Expand All @@ -1645,12 +1645,12 @@ class emitter
// code, it is not necessary to generate GC info for a call so labeled.
bool idIsNoGC() const
{
// Only valid when _idIns is not classified as a SIMD instruction.
// NOTE(review): the two back-to-back asserts look like the removed/added lines of this
// diff hunk (IsAvx512OrPriorInstruction -> IsSimdInstruction) — confirm which one is live.
assert(!IsAvx512OrPriorInstruction(_idIns));
assert(!IsSimdInstruction(_idIns));
// True when _idNoGC is non-zero.
return _idNoGC != 0;
}
// Stores `val` into _idNoGC.
// Asserts that _idIns is not classified as a SIMD instruction.
// NOTE(review): the two back-to-back asserts look like the removed/added lines of this
// diff hunk (IsAvx512OrPriorInstruction -> IsSimdInstruction) — confirm which one is live.
void idSetIsNoGC(bool val)
{
assert(!IsAvx512OrPriorInstruction(_idIns));
assert(!IsSimdInstruction(_idIns));
_idNoGC = val;
}

Expand Down Expand Up @@ -1703,7 +1703,7 @@ class emitter

// Returns the stored _idEvexAaaContext value.
// Unlike the flag accessors that assert the instruction is NOT SIMD, this one asserts
// that _idIns IS classified as a SIMD instruction.
// NOTE(review): the two back-to-back asserts look like the removed/added lines of this
// diff hunk (IsAvx512OrPriorInstruction -> IsSimdInstruction) — confirm which one is live.
unsigned idGetEvexAaaContext() const
{
assert(IsAvx512OrPriorInstruction(_idIns));
assert(IsSimdInstruction(_idIns));
return _idEvexAaaContext;
}

Expand All @@ -1719,7 +1719,7 @@ class emitter

// Returns true when _idEvexZContext is non-zero.
// Asserts that _idIns IS classified as a SIMD instruction.
// NOTE(review): the two back-to-back asserts look like the removed/added lines of this
// diff hunk (IsAvx512OrPriorInstruction -> IsSimdInstruction) — confirm which one is live.
bool idIsEvexZContextSet() const
{
assert(IsAvx512OrPriorInstruction(_idIns));
assert(IsSimdInstruction(_idIns));
return _idEvexZContext != 0;
}

Expand Down
Loading

0 comments on commit 03b2d3d

Please sign in to comment.