;---------------------------------------
;Win64 NASM Example: Using base6.obj + GoLink
;


















; debug

; nasm -f WIN64 -g hex_bench.asm    -l hex_bench.lis

;;;;; golink /console id_decomp_nasm.obj GAIA_IDS_L0HP_6.dll base6.obj msvcrt.dll kernel32.dll /files

; HEX_BENCH_DATA_1GB.dll
; HEX_BENCH_DATA_2GB.dll

; golink /console hex_bench.obj hex_x86_64.obj base64_x86_64.obj HEX_BENCH_DATA_1300KB.obj base6.obj msvcrt.dll kernel32.dll /files

; golink /console hex_bench.obj hex_x86_64.obj base64_x86_64.obj HEX_BENCH_DATA_300KB.obj base6.obj msvcrt.dll kernel32.dll /files

; golink /console hex_bench.obj HEX_BENCH_DATA_300KB.dll hex_x86_64.obj base64_x86_64.obj base6.obj msvcrt.dll kernel32.dll /files


; nasm -f elf64 -g hex_bench.asm    -l hex_bench.lis

; ld -o hex_bench hex_bench.o hex_x86_64.o base64_x86_64.o HEX_BENCH_DATA_1300KB.o

%ifdef ASSEMBLE_COMMAND_LINES_ON_WINDOWS

:: commands to build on Windows (nasm and golink in the path)
nasm -f WIN64 -g hex_bench.asm     -l hex_bench.lis
nasm -f WIN64 -g hex_x86_64.asm    -l hex_x86_64.lis
nasm -f WIN64 -g HEX_BENCH_DATA_1300KB.asm
golink /console hex_bench.obj hex_x86_64.obj HEX_BENCH_DATA_1300KB.obj

%endif

%ifdef ASSEMBLE_COMMAND_LINES_ON_LINUX

# commands to build on LINUX
nasm -f elf64 -g hex_bench.asm     -l hex_bench.lis
nasm -f elf64 -g hex_x86_64.asm    -l hex_x86_64.lis
nasm -f elf64 -g HEX_BENCH_DATA_1300KB.asm
ld -o hex_bench hex_bench.o hex_x86_64.o HEX_BENCH_DATA_1300KB.o

%endif


;;;;GAIA_L0HP_6.dll mul_decomp_arr_nasm.dll

extern hex_encode_fast


extern hex_encode_sse2
extern hex_encode_ssse3
extern hex_encode_avx2
;extern hex_encode_avx512f
extern hex_encode_avx512bw

extern hex_decode_sse2
extern hex_decode_avx2
extern hex_decode_avx512bw

extern get_instr_info


extern base64_encode_ssse3
extern base64_encode_avx2
extern base64_encode_avx512bw

extern base64_decode_ssse3
extern base64_decode_avx2
extern base64_decode_avx512bw


extern HEX_BENCH_BIN_START
extern HEX_BENCH_BIN_TRAIL


; Derive a platform switch from the object format NASM was asked to emit
; (-f win64 / -f elf64). __WIN__ is tested further down to remap the call
; arguments into the Win64 argument registers; for any other output format
; neither symbol is defined and the System V register assignment is used.
%ifidn __OUTPUT_FORMAT__, win64
%define __WIN__ 1
%elifidn __OUTPUT_FORMAT__, elf64
%define __ELF__ 1
%endif

; LINUX call-convention 64 bit (System V AMD64)
; -----------------------------------
; Integer/pointer (i.e. INTEGER class) parameters are passed, in order, in
; rdi, rsi, rdx, rcx, r8 and r9 to any libc function called from assembly.
; rdi is used for the first INTEGER parameter, rsi for the 2nd, rdx for the 3rd and so on.
; Once the parameters are loaded, the call instruction is issued.
; The stack pointer (rsp) must be 16-byte aligned when the call executes.


; BENCH results
; length of 300KB-Source-PDF is 0x54756 = 345942 bytes
; 1 million 
; Celeron G5905 ca. 3.5 GHz
; NUC core i3 8109U
; G5905 Hex-encode SSE2 50 sec = 6.919 GB /sec
; NUC Hex-encode SSE2 49 sec = 7.060 GB /sec

; Hex-encode AVX2 50 sec = 6.919 GB /sec
; NUC Hex-encode AVX2 27 sec = 12.813 GB /sec
; NUC Base64-encode AVX2 23 sec = 15.040 GB/sec

; BENCH results
; length of 1300KB-NASM.EXE(win) is 0x153000 = 1388544 bytes = 1356 KB
; 200000 loop count 
; Celeron G5905 ca. 3.5 GHz
; NUC core i3 8109U
; Tigerlake i5-1135G7 2.4 GHz- 4.2 GHz
; IceLake XEON Silver 4314 2.4 Ghz-4.7 GHz

; G5905 Hex-encode SSE2 50 sec = 6.919 GB /sec
; NUC Hex-encode SSE2 40 sec = 6.780 GB /sec
; TGL Hex-encode SSE2 27 sec = 9.81 GB /sec

; Hex-encode AVX2 50 sec = 6.919 GB /sec
; NUC Hex-encode AVX2 27 sec = 10.044 GB /sec
; NUC Base64-encode AVX2 23 sec = XXX15.040 GB/sec

; 1 Million = 1356 GB
; TGL Hex-encode AVX512bw 	84 sec = 16.1 GB /sec
; TGL Hex-encode AVX2 		27 sec = 9.81 GB /sec
; TGL Hex-encode SSSE3 		27 sec = 9.81 GB /sec
; TGL Hex-encode SSE2 		27 sec = 9.81 GB /sec

; ICL Hex-encode SSSE3 		122 sec =  GB /sec
; ICL Hex-encode AVX2 		117 sec =  GB /sec
; ICL Hex-encode AVX512BW		113 sec =  GB /sec


; LINUX Fedora 35
; length of 1300KB-nasm(linux) is 1759032 bytes = 1718 KB

; 1 Million = 1718 GB
; TGL Hex-encode AVX512bw 	95 sec = 18.1 GB /sec

default rel


; Uninitialised data; the section is 64-byte aligned, so the buffer that
; starts it is 64-byte aligned as well.
section .bss align=64

; Output buffer handed to the encoder as its first (destination) argument.
; 1024*4096 qwords = 4,194,304 * 8 bytes = 32 MiB.
HEXENCODE_OUT_ARR:
		resq	1024*4096

;HEXDECODE_OUT_ARR:


section .text align=32

global start
global main

%use smartalign

	ALIGNMODE 	p6

;-----------------------------------------------------------------------
; start / main - benchmark driver
;
; Calls the selected encoder BENCH_ITERATIONS times with
;   arg1 = HEXENCODE_OUT_ARR                         (output buffer)
;   arg2 = HEX_BENCH_BIN_START                       (input buffer)
;   arg3 = HEX_BENCH_BIN_TRAIL - HEX_BENCH_BIN_START (number of input bytes)
;
; ABI:   Win64 when __WIN__ is defined (rcx, rdx, r8),
;        System V AMD64 otherwise (rdi, rsi, rdx).
; Out:   eax = 0. On ELF the process is terminated with exit status 0;
;        on every other format the function returns to its caller.
; Regs:  r14 = remaining iterations. It is callee-saved in both ABIs, so it
;        survives the call. The argument registers are volatile and are
;        therefore reloaded on every iteration.
; Stack: rbp frame. The four saved registers sit directly below rbp; the
;        32-byte Win64 shadow space is allocated BELOW them so the callee
;        cannot overwrite the saved values. rsp is forced to 16-byte
;        alignment, which holds whether this code is entered by a call
;        (rsp % 16 == 8) or as the raw ELF process entry (rsp % 16 == 0).
;-----------------------------------------------------------------------

; Number of encoder calls; override with  nasm -DBENCH_ITERATIONS=<n>.
; Must be >= 1: the loop decrements before it tests.
%ifndef BENCH_ITERATIONS
%define BENCH_ITERATIONS 1000000
%endif

%ifndef SYS_EXIT
%define SYS_EXIT 60								; Linux x86-64 syscall number of exit
%endif

start:
;_start:
main:
	push		rbp
	mov			rbp,rsp
	push		rdi								; callee-saved on Win64, used as scratch below
	push		rsi								; callee-saved on Win64, used as scratch below
	push		r15								; not used here; saved as in the original
	push		r14								; loop counter
	sub			rsp,32							; Win64 shadow space, below the saved registers
	and			rsp,-16							; 16-byte alignment at the call site


;	call		get_instr_info


;	mov			r14,2; 5*200000	;1000000
	mov			r14,BENCH_ITERATIONS

LBENCH_LOOP:

	lea			rsi,[HEX_BENCH_BIN_START]		; parameter 2 input buffer

	lea			rdx,[HEX_BENCH_BIN_TRAIL]

	sub			rdx,rsi							; parameter 3 number of elements
;	mov			rdx,512		;[HEXENCODE_INP_N_ELEM]			; rdx = number of elements

	lea			rdi,[HEXENCODE_OUT_ARR]			; parameter 1 output buffer

%ifdef __WIN__

	; Move the System V register assignment into the Win64 argument registers.
	; r8 must be written before rdx is overwritten with the input pointer.
	mov			rcx,rdi							; parameter 1 output buffer
	mov			r8,rdx							; parameter 3 number of elements
	mov			rdx,rsi							; parameter 2 input buffer

%endif

	; Exactly one call is active; the others are kept as switchable alternatives.

;	call 		hex_encode_fast

;	call 		hex_encode_sse2
;	call 		hex_encode_ssse3
;	call 		hex_encode_avx2
	call 		hex_encode_avx512bw

;	call 		base64_encode_ssse3
;	call 		base64_encode_avx2
;	call 		base64_encode_avx512bw

	sub			r14,1
	jnz			LBENCH_LOOP

	xor			eax,eax							; return value / exit status 0
	nop											; NOTE(review): presumably a breakpoint anchor - confirm

%ifdef __ELF__

	; With the plain `ld` link there is no `_start`, so this code is the
	; process entry point and has no return address on the stack: a `ret`
	; would jump to argc. Terminate through the exit syscall instead; this
	; is also valid when the code is reached as `main` from a C runtime.
	xor			edi,edi							; exit status 0
	mov			eax,SYS_EXIT
	syscall

%else

	lea			rsp,[rbp-32]					; rsp -> saved r14 (4 pushes * 8 bytes below rbp)
	pop			r14
	pop			r15
	pop			rsi
	pop			rdi
	pop			rbp

	ret

%endif




















