I spent sometime with LLVN tool set and can generate LLVM IR/ bitcode however I am interested in whcih tool can be used to generate the GPU ISA assembler code for Radeon GPU-s?
Thanks.,
ok i made assembly using clang and few other tools using following simple opencl examples and pasted output assembly, but I only seem to see CPU assembler code, not GPU kernel assembly code, how do I generate that?
SRC:
// A minimalist OpenCL program.
#include <CL/cl.h>
#include <stdio.h>
#define printDeviceInfo(X) printf("\n%s: %s", (X));
#define declareDeviceInfo(X) char str(X)[] = "(X)";
#define NWITEMS 2048
#define LOCAL_WORK_SIZE 256
#define DEBUG 0
// A simple kernelfcn kernel
const char *source =
"kernel void kernelfcn( global uint *dev_c, global uint *dev_a, global uint *dev_b) \n"
"{ \n"
" uint tid = get_global_id(0); \n"
" dev_c[tid] = dev_a[tid] + dev_b[tid]; \n"
"} \n";
int main(int argc, char ** argv)
{
int stat;
char str1[100];
ushort ushort1;
uint uint1;
ulong ulong1;
size_t strLen;
cl_int ret;
uint a[NWITEMS], b[NWITEMS], c[NWITEMS];
int i;
// 1. Get a platform.
cl_uint CONFIG_MAX_PLATFORMS=20;
cl_platform_id platforms[CONFIG_MAX_PLATFORMS];
cl_uint platforms_available;
clGetPlatformIDs(CONFIG_MAX_PLATFORMS, platforms, &platforms_available );
printf("\nNo. of platforms available: %d.\n", platforms_available);
for (int i = 0 ; i < platforms_available; i ++ ) {
printf("Platform %d: %d.\n", i, platforms[i]);
}
// 2. Find a gpu/cpu device.
cl_uint CONFIG_MAX_DEVICES=20;
cl_uint devices_available;
enum enum_device_info_types {DEVINFO_STRING=1, DEVINFO_USHORT=2, DEVINFO_UINT=3, DEVINFO_ULONG=4, DEVINFO_SIZE_T=5};
enum enum_device_info_types device_info_types[] = {
DEVINFO_STRING, \
DEVINFO_STRING, \
DEVINFO_STRING, \
DEVINFO_STRING, \
DEVINFO_ULONG, \
DEVINFO_ULONG, \
DEVINFO_USHORT, \
DEVINFO_UINT, \
DEVINFO_UINT, \
DEVINFO_SIZE_T, \
DEVINFO_UINT, \
DEVINFO_SIZE_T, \
DEVINFO_USHORT, \
DEVINFO_STRING, \
DEVINFO_SIZE_T \
};
char *str_device_info[]={\
"CL_DEVICE_NAME", \
"CL_DEVICE_VENDOR", \
"CL_DEVICE_VERSION", \
"CL_DRIVER_VERSION", \
"CL_DEVICE_GLOBAL_MEM_SIZE", \
"CL_DEVICE_LOCAL_MEM_SIZE", \
"CL_DEVICE_LOCAL_MEM_TYPE", \
"CL_DEVICE_MAX_CLOCK_FREQUENCY", \
"CL_DEVICE_MAX_COMPUTE_UNITS", \
"CL_DEVICE_MAX_WORK_GROUP_SIZE", \
"CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS", \
"CL_DEVICE_MAX_WORK_ITEM_SIZES", \
"CL_DEVICE_TYPE", \
"CL_DEVICE_EXTENSIONS", \
"CL_DEVICE_MAX_PARAMETER_SIZE" \
};
cl_device_id device[CONFIG_MAX_DEVICES];
cl_device_info deviceInfos[]={\
CL_DEVICE_NAME, \
CL_DEVICE_VENDOR, \
CL_DEVICE_VERSION, \
CL_DRIVER_VERSION, \
CL_DEVICE_GLOBAL_MEM_SIZE, \
CL_DEVICE_LOCAL_MEM_SIZE, \
CL_DEVICE_LOCAL_MEM_TYPE, \
CL_DEVICE_MAX_CLOCK_FREQUENCY, \
CL_DEVICE_MAX_COMPUTE_UNITS, \
CL_DEVICE_MAX_WORK_GROUP_SIZE, \
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, \
CL_DEVICE_MAX_WORK_ITEM_SIZES, \
CL_DEVICE_TYPE, \
CL_DEVICE_EXTENSIONS, \
CL_DEVICE_MAX_PARAMETER_SIZE \
};
stat = clGetDeviceIDs( platforms[0], CL_DEVICE_TYPE_ALL, CONFIG_MAX_DEVICES, device, &devices_available);
printf("No. of devices available: %d.\n", devices_available);
for (int j = 0 ; j < devices_available; j++) {
for (int i = 0 ; i < sizeof(deviceInfos)/sizeof(cl_device_info); i ++ ) {
if (stat == 0) {
switch (device_info_types[i]) {
case DEVINFO_STRING:
clGetDeviceInfo(device[0], deviceInfos[i], sizeof(str1), str1, &strLen);
printf("\n%40s: %30s.", str_device_info[i], str1);
break;
case DEVINFO_USHORT:
clGetDeviceInfo(device[0], deviceInfos[i], sizeof(ushort), (void*)&ushort1, &strLen);
printf("\n%40s: %02u (%02x).", str_device_info[i], ushort1, ushort1);
break;
case DEVINFO_UINT:
clGetDeviceInfo(device[0], deviceInfos[i], sizeof(uint), (void*)&uint1, &strLen);
printf("\n%40s: %04u (%04x).", str_device_info[i], uint1, uint1);
break;
case DEVINFO_ULONG:
clGetDeviceInfo(device[0], deviceInfos[i], sizeof(ulong), (void*)&ulong1, &strLen);
printf("\n%40s: %08u (%08x).", str_device_info[i], ulong1, ulong1);
break;
case DEVINFO_SIZE_T:
clGetDeviceInfo(device[0], deviceInfos[i], sizeof(ulong), (void*)&ulong1, &strLen);
printf("\n%40s: %08u (%08x).", str_device_info[i], ulong1, ulong1);
break;
}
//enum device_info_types={DEVINFO_STRING=1, DEVINFO_USHORT=2, DEVINFO_UINT=3, DEVINFO_ULONG=4};
} else {
printf("\nclGetDevicesIDs FAIL.");
return 1;
}
}
}
return 0;
}
COMMANDS:
ROCMOPENCL=/opt/rocm/opencl/
FILENAME=p31
echo "generating direct exec binary..."
clang -emit-llvm -I$ROCMOPENCL/include -c $FILENAME.c
# input: bitcode, output: llvm ir.
echo "input bitcode: output: llvm ir..."
llvm-dis < $FILENAME.bc > $FILENAME.ll
# jit input: bitcode.
#echo "jit exec: input: bitcode..."
#lli $FILENAME.bc
# output assembly: input: bitcode, output: assembly.
echo "input: bitcode, output: assembly..."
llc $FILENAME.bc -o $FILENAME.s
echo "done..."
exit 0
ASSEMBLER OUTPUT (CPU ONLY NO GPU??)
.text
.file "p31.c"
.globl main # -- Begin function main
.p2align 4, 0x90
.type main,@function
main: # @main
.cfi_startproc
# %bb.0:
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset %rbp, -16
movq %rsp, %rbp
.cfi_def_cfa_register %rbp
pushq %r14
pushq %rbx
subq $25072, %rsp # imm = 0x61F0
.cfi_offset %rbx, -32
.cfi_offset %r14, -24
movl $0, -36(%rbp)
movl %edi, -100(%rbp)
movq %rsi, -128(%rbp)
movl $20, -64(%rbp)
movl -64(%rbp), %eax
movq %rsp, -96(%rbp)
leaq 15(,%rax,8), %rcx
andq $-16, %rcx
movq %rsp, %rbx
subq %rcx, %rbx
movq %rbx, %rsp
movq %rax, -120(%rbp)
movl -64(%rbp), %edi
leaq -60(%rbp), %rdx
movq %rbx, %rsi
callq clGetPlatformIDs
movl -60(%rbp), %esi
movabsq $.L.str.1, %rdi
movb $0, %al
callq printf
movl $0, -24(%rbp)
.LBB0_1: # =>This Inner Loop Header: Depth=1
movl -24(%rbp), %eax
cmpl -60(%rbp), %eax
jae .LBB0_4
# %bb.2: # in Loop: Header=BB0_1 Depth=1
movl -24(%rbp), %esi
movslq -24(%rbp), %rax
movq (%rbx,%rax,8), %rdx
movabsq $.L.str.2, %rdi
movb $0, %al
callq printf
# %bb.3: # in Loop: Header=BB0_1 Depth=1
movl -24(%rbp), %eax
addl $1, %eax
movl %eax, -24(%rbp)
jmp .LBB0_1
.LBB0_4:
movl $20, -56(%rbp)
movups .L__const.main.device_info_types+44(%rip), %xmm0
movups %xmm0, -340(%rbp)
movaps .L__const.main.device_info_types+32(%rip), %xmm0
movaps %xmm0, -352(%rbp)
movaps .L__const.main.device_info_types+16(%rip), %xmm0
movaps %xmm0, -368(%rbp)
movaps .L__const.main.device_info_types(%rip), %xmm0
movaps %xmm0, -384(%rbp)
movq .L__const.main.str_device_info+112(%rip), %rax
movq %rax, -144(%rbp)
movaps .L__const.main.str_device_info+96(%rip), %xmm0
movaps %xmm0, -160(%rbp)
movaps .L__const.main.str_device_info+80(%rip), %xmm0
movaps %xmm0, -176(%rbp)
movaps .L__const.main.str_device_info+64(%rip), %xmm0
movaps %xmm0, -192(%rbp)
movaps .L__const.main.str_device_info+48(%rip), %xmm0
movaps %xmm0, -208(%rbp)
movaps .L__const.main.str_device_info+32(%rip), %xmm0
movaps %xmm0, -224(%rbp)
movaps .L__const.main.str_device_info+16(%rip), %xmm0
movaps %xmm0, -240(%rbp)
movaps .L__const.main.str_device_info(%rip), %xmm0
movaps %xmm0, -256(%rbp)
movl -56(%rbp), %eax
leaq 15(,%rax,8), %rcx
andq $-16, %rcx
movq %rsp, %r14
subq %rcx, %r14
movq %r14, %rsp
movq %rax, -112(%rbp)
leaq -320(%rbp), %rdi
movabsq $.L__const.main.deviceInfos, %rsi
movl $60, %edx
callq memcpy@PLT
movq (%rbx), %rdi
movl -56(%rbp), %edx
leaq -52(%rbp), %r8
movl $4294967295, %esi # imm = 0xFFFFFFFF
movq %r14, %rcx
callq clGetDeviceIDs
movl %eax, -88(%rbp)
movl -52(%rbp), %esi
movabsq $.L.str.18, %rdi
movb $0, %al
callq printf
movl $0, -32(%rbp)
.LBB0_5: # =>This Loop Header: Depth=1
# Child Loop BB0_7 Depth 2
movl -32(%rbp), %eax
cmpl -52(%rbp), %eax
jae .LBB0_22
# %bb.6: # in Loop: Header=BB0_5 Depth=1
movl $0, -20(%rbp)
.LBB0_7: # Parent Loop BB0_5 Depth=1
# => This Inner Loop Header: Depth=2
movslq -20(%rbp), %rax
cmpq $15, %rax
jae .LBB0_20
# %bb.8: # in Loop: Header=BB0_7 Depth=2
cmpl $0, -88(%rbp)
jne .LBB0_17
# %bb.9: # in Loop: Header=BB0_7 Depth=2
movslq -20(%rbp), %rax
movl -384(%rbp,%rax,4), %eax
addl $-1, %eax
movl %eax, %ecx
subl $4, %ecx
ja .LBB0_16
# %bb.10: # in Loop: Header=BB0_7 Depth=2
movq .LJTI0_0(,%rax,8), %rax
jmpq *%rax
.LBB0_11: # in Loop: Header=BB0_7 Depth=2
movq (%r14), %rdi
movslq -20(%rbp), %rax
movl -320(%rbp,%rax,4), %esi
leaq -496(%rbp), %rcx
leaq -80(%rbp), %r8
movl $100, %edx
callq clGetDeviceInfo
movslq -20(%rbp), %rax
movq -256(%rbp,%rax,8), %rsi
leaq -496(%rbp), %rdx
movabsq $.L.str.19, %rdi
movb $0, %al
callq printf
jmp .LBB0_16
.LBB0_12: # in Loop: Header=BB0_7 Depth=2
movq (%r14), %rdi
movslq -20(%rbp), %rax
movl -320(%rbp,%rax,4), %esi
leaq -26(%rbp), %rcx
leaq -80(%rbp), %r8
movl $2, %edx
callq clGetDeviceInfo
movslq -20(%rbp), %rax
movq -256(%rbp,%rax,8), %rsi
movzwl -26(%rbp), %edx
movzwl -26(%rbp), %ecx
movabsq $.L.str.20, %rdi
movb $0, %al
callq printf
jmp .LBB0_16
.LBB0_13: # in Loop: Header=BB0_7 Depth=2
movq (%r14), %rdi
movslq -20(%rbp), %rax
movl -320(%rbp,%rax,4), %esi
leaq -68(%rbp), %rcx
leaq -80(%rbp), %r8
movl $4, %edx
callq clGetDeviceInfo
movslq -20(%rbp), %rax
movq -256(%rbp,%rax,8), %rsi
movl -68(%rbp), %edx
movl -68(%rbp), %ecx
movabsq $.L.str.21, %rdi
movb $0, %al
callq printf
jmp .LBB0_16
.LBB0_14: # in Loop: Header=BB0_7 Depth=2
movq (%r14), %rdi
movslq -20(%rbp), %rax
movl -320(%rbp,%rax,4), %esi
leaq -48(%rbp), %rcx
leaq -80(%rbp), %r8
movl $8, %edx
callq clGetDeviceInfo
movslq -20(%rbp), %rax
movq -256(%rbp,%rax,8), %rsi
movq -48(%rbp), %rdx
movq -48(%rbp), %rcx
movabsq $.L.str.22, %rdi
movb $0, %al
callq printf
jmp .LBB0_16
.LBB0_15: # in Loop: Header=BB0_7 Depth=2
movq (%r14), %rdi
movslq -20(%rbp), %rax
movl -320(%rbp,%rax,4), %esi
leaq -48(%rbp), %rcx
leaq -80(%rbp), %r8
movl $8, %edx
callq clGetDeviceInfo
movslq -20(%rbp), %rax
movq -256(%rbp,%rax,8), %rsi
movq -48(%rbp), %rdx
movq -48(%rbp), %rcx
movabsq $.L.str.22, %rdi
movb $0, %al
callq printf
.LBB0_16: # in Loop: Header=BB0_7 Depth=2
jmp .LBB0_18
.LBB0_17:
movabsq $.L.str.23, %rdi
movb $0, %al
callq printf
movl $1, -36(%rbp)
movl $1, -84(%rbp)
jmp .LBB0_23
.LBB0_18: # in Loop: Header=BB0_7 Depth=2
jmp .LBB0_19
.LBB0_19: # in Loop: Header=BB0_7 Depth=2
movl -20(%rbp), %eax
addl $1, %eax
movl %eax, -20(%rbp)
jmp .LBB0_7
.LBB0_20: # in Loop: Header=BB0_5 Depth=1
jmp .LBB0_21
.LBB0_21: # in Loop: Header=BB0_5 Depth=1
movl -32(%rbp), %eax
addl $1, %eax
movl %eax, -32(%rbp)
jmp .LBB0_5
.LBB0_22:
movl $0, -36(%rbp)
movl $1, -84(%rbp)
.LBB0_23:
movq -96(%rbp), %rsp
movl -36(%rbp), %eax
leaq -16(%rbp), %rsp
popq %rbx
popq %r14
popq %rbp
.cfi_def_cfa %rsp, 8
retq
.Lfunc_end0:
.size main, .Lfunc_end0-main
.cfi_endproc
.section .rodata,"a",@progbits
.p2align 3
.LJTI0_0:
.quad .LBB0_11
.quad .LBB0_12
.quad .LBB0_13
.quad .LBB0_14
.quad .LBB0_15
# -- End function
.type .L.str,@object # @.str
.section .rodata.str1.1,"aMS",@progbits,1
.L.str:
.asciz "kernel void kernelfcn( global uint *dev_c, global uint *dev_a, global uint *dev_b) \n{ \n uint tid = get_global_id(0); \n dev_c[tid] = dev_a[tid] + dev_b[tid]; \n} \n"
.size .L.str, 378
.type source,@object # @source
.data
.globl source
.p2align 3
source:
.quad .L.str
.size source, 8
.type .L.str.1,@object # @.str.1
.section .rodata.str1.1,"aMS",@progbits,1
.L.str.1:
.asciz "\nNo. of platforms available: %d.\n"
.size .L.str.1, 34
.type .L.str.2,@object # @.str.2
.L.str.2:
.asciz "Platform %d: %d.\n"
.size .L.str.2, 18
.type .L__const.main.device_info_types,@object # @__const.main.device_info_types
.section .rodata,"a",@progbits
.p2align 4
.L__const.main.device_info_types:
.long 1 # 0x1
.long 1 # 0x1
.long 1 # 0x1
.long 1 # 0x1
.long 4 # 0x4
.long 4 # 0x4
.long 2 # 0x2
.long 3 # 0x3
.long 3 # 0x3
.long 5 # 0x5
.long 3 # 0x3
.long 5 # 0x5
.long 2 # 0x2
.long 1 # 0x1
.long 5 # 0x5
.size .L__const.main.device_info_types, 60
.type .L.str.3,@object # @.str.3
.section .rodata.str1.1,"aMS",@progbits,1
.L.str.3:
.asciz "CL_DEVICE_NAME"
.size .L.str.3, 15
.type .L.str.4,@object # @.str.4
.L.str.4:
.asciz "CL_DEVICE_VENDOR"
.size .L.str.4, 17
.type .L.str.5,@object # @.str.5
.L.str.5:
.asciz "CL_DEVICE_VERSION"
.size .L.str.5, 18
.type .L.str.6,@object # @.str.6
.L.str.6:
.asciz "CL_DRIVER_VERSION"
.size .L.str.6, 18
.type .L.str.7,@object # @.str.7
.L.str.7:
.asciz "CL_DEVICE_GLOBAL_MEM_SIZE"
.size .L.str.7, 26
.type .L.str.8,@object # @.str.8
.L.str.8:
.asciz "CL_DEVICE_LOCAL_MEM_SIZE"
.size .L.str.8, 25
.type .L.str.9,@object # @.str.9
.L.str.9:
.asciz "CL_DEVICE_LOCAL_MEM_TYPE"
.size .L.str.9, 25
.type .L.str.10,@object # @.str.10
.L.str.10:
.asciz "CL_DEVICE_MAX_CLOCK_FREQUENCY"
.size .L.str.10, 30
.type .L.str.11,@object # @.str.11
.L.str.11:
.asciz "CL_DEVICE_MAX_COMPUTE_UNITS"
.size .L.str.11, 28
.type .L.str.12,@object # @.str.12
.L.str.12:
.asciz "CL_DEVICE_MAX_WORK_GROUP_SIZE"
.size .L.str.12, 30
.type .L.str.13,@object # @.str.13
.L.str.13:
.asciz "CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS"
.size .L.str.13, 35
.type .L.str.14,@object # @.str.14
.L.str.14:
.asciz "CL_DEVICE_MAX_WORK_ITEM_SIZES"
.size .L.str.14, 30
.type .L.str.15,@object # @.str.15
.L.str.15:
.asciz "CL_DEVICE_TYPE"
.size .L.str.15, 15
.type .L.str.16,@object # @.str.16
.L.str.16:
.asciz "CL_DEVICE_EXTENSIONS"
.size .L.str.16, 21
.type .L.str.17,@object # @.str.17
.L.str.17:
.asciz "CL_DEVICE_MAX_PARAMETER_SIZE"
.size .L.str.17, 29
.type .L__const.main.str_device_info,@object # @__const.main.str_device_info
.section .rodata,"a",@progbits
.p2align 4
.L__const.main.str_device_info:
.quad .L.str.3
.quad .L.str.4
.quad .L.str.5
.quad .L.str.6
.quad .L.str.7
.quad .L.str.8
.quad .L.str.9
.quad .L.str.10
.quad .L.str.11
.quad .L.str.12
.quad .L.str.13
.quad .L.str.14
.quad .L.str.15
.quad .L.str.16
.quad .L.str.17
.size .L__const.main.str_device_info, 120
.type .L__const.main.deviceInfos,@object # @__const.main.deviceInfos
.p2align 4
.L__const.main.deviceInfos:
.long 4139 # 0x102b
.long 4140 # 0x102c
.long 4143 # 0x102f
.long 4141 # 0x102d
.long 4127 # 0x101f
.long 4131 # 0x1023
.long 4130 # 0x1022
.long 4108 # 0x100c
.long 4098 # 0x1002
.long 4100 # 0x1004
.long 4099 # 0x1003
.long 4101 # 0x1005
.long 4096 # 0x1000
.long 4144 # 0x1030
.long 4119 # 0x1017
.size .L__const.main.deviceInfos, 60
.type .L.str.18,@object # @.str.18
.section .rodata.str1.1,"aMS",@progbits,1
.L.str.18:
.asciz "No. of devices available: %d.\n"
.size .L.str.18, 31
.type .L.str.19,@object # @.str.19
.L.str.19:
.asciz "\n%40s: %30s."
.size .L.str.19, 13
.type .L.str.20,@object # @.str.20
.L.str.20:
.asciz "\n%40s: %02u (%02x)."
.size .L.str.20, 20
.type .L.str.21,@object # @.str.21
.L.str.21:
.asciz "\n%40s: %04u (%04x)."
.size .L.str.21, 20
.type .L.str.22,@object # @.str.22
.L.str.22:
.asciz "\n%40s: %08u (%08x)."
.size .L.str.22, 20
.type .L.str.23,@object # @.str.23
.L.str.23:
.asciz "\nclGetDevicesIDs FAIL."
.size .L.str.23, 23
.ident "clang version 13.0.0 (/root/ROCm-4.3/llvm-project/clang f2943f684437d2c1143a56e418d29fc6b3314072)"
.section ".note.GNU-stack","",@progbits
This topic was automatically closed 183 days after the last reply. New replies are no longer allowed.