########################################################################
# ISPACK FORTRAN SUBROUTINE LIBRARY FOR SCIENTIFIC COMPUTING
# Copyright (C) 1998--2016 Keiichi Ishioka <ishioka@gfd-dennou.org>
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
# 
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301 USA.
########################################################################
.text
.globl fvzof4_
.globl _fvzof4_
#-----------------------------------------------------------------------
# fvzof4_  (also exported as _fvzof4_ for underscore-prefixing platforms)
#
# Syntax : GNU as, AT&T operand order (source first, destination last).
# ABI    : System V AMD64 argument registers; looks Fortran-callable
#          (every scalar argument is read through a pointer).
# ISA    : AVX-512F (zmm registers, EVEX-encoded FMA).
#
# In:
#   rdi -> KHH  32-bit integer, read once
#   rsi -> LS   32-bit integer, read once
#   rdx -> LL   32-bit integer, read once
#   rcx  = X    base address of the data, updated in place.  Every access
#               is a 64-byte vmovapd, so X must be 64-byte aligned.
#   r8   = T    base address of a table of doubles, read only
# Out:   nothing in registers; X is overwritten.
# Clobb: rax, rdx, rsi, rdi, r9, r10, r11, zmm0-zmm7, zmm9-zmm15, flags.
#        rbx, rbp, r12, r13 are used and restored.
#
# For every I in [LS, LS+LL) one group of four consecutive sub-blocks of
# KHH*128 bytes, starting at X + I*4*KHH*128, is combined in place.  The
# sub-blocks are called 00, 01, 10, 11 in memory order.  A sub-block is a
# run of 128-byte units; the first 64 bytes of a unit are treated as the
# R part and the next 64 bytes as the I part of eight complex values.
# In complex notation each unit is replaced by
#
#     p = x00 + w1*x10        r = x00 - w1*x10
#     q = x01 + w1*x11        s = x01 - w1*x11
#     x00 = p + w2*q          x01 = p - w2*q
#     x10 = r - i*w2*s        x11 = r + i*w2*s
#
# where, for I != 0,
#     w1 = T[byte 16*I] - i*T[byte 16*I + 8]
#     w2 = T[byte 32*I] - i*T[byte 32*I + 8]
# and for I == 0 the table is not read (w1 = w2 = 1).
#
# If KHH <= 0 or LL <= 0 the routine returns without touching X.
#-----------------------------------------------------------------------
fvzof4_:
_fvzof4_:
	pushq %rbx
	pushq %rbp
	pushq %r12
	pushq %r13

	movl (%rdi), %edi  # edi = KHH (upper half of rdi cleared)
	movl (%rsi), %esi  # esi = LS
	movl (%rdx), %edx  # edx = LL
      # rcx = address of X
      # r8  = address of T

# Both loops below test only at the bottom, so a zero trip count would
# never meet its exit condition.  Leave early instead.
	testl %edi,%edi
	jle LE
	testl %edx,%edx
	jle LE

	vbroadcastsd C2(%rip),%zmm11 # zmm11 = 2.0 in all eight lanes

	movq %rsi,%rax # rax = I, starts at LS
	addq %rsi,%rdx # rdx = LS+LL, the end value for I

	shlq $7,%rdi # rdi = KHH*128, byte size of one sub-block
	movq %rsi,%r9 # r9 = LS
	imulq %rdi,%r9 # r9 = LS*KHH*128
	shlq $2,%r9 # r9 = LS*KHH*128*4, byte offset of sub-block 00 for I = LS


# Register roles from here on:
#   rax = I                         rdx = LS+LL
#   rdi = sub-block size in bytes   r13 = X + byte position inside a sub-block
#   r9, r10, r11, r12 = byte offsets of sub-blocks 00, 01, 10, 11

L1:

	testq %rax,%rax
	jnz L19

#----------------------- case I = 0 (T is not read) -----------

	lea (%rdi,%r9),%r10
	lea (%rdi,%r10),%r11
	lea (%rdi,%r11),%r12


	xorq %rbp,%rbp # rbp = bytes of the sub-block processed so far
	movq %rcx,%r13

L18:
	vmovapd  (%r13,%r9),   %zmm0 # 00R
	vmovapd 64(%r13,%r9),  %zmm1 # 00I
	vmovapd  (%r13,%r11),  %zmm2 # 10R
	vmovapd 64(%r13,%r11),  %zmm3 # 10I
	vmovapd  (%r13,%r10),  %zmm4 # 01R
	vmovapd 64(%r13,%r10),  %zmm5 # 01I
	vmovapd  (%r13,%r12),  %zmm6 # 11R
	vmovapd 64(%r13,%r12),  %zmm7 # 11I

# Each difference is formed first; the matching sum is then recovered as
# 2*x - difference with one FMA.
	vsubpd %zmm2,%zmm0,%zmm2 # zmm2 = zmm0 - zmm2
	vsubpd %zmm3,%zmm1,%zmm3 # zmm3 = zmm1 - zmm3

	vfmsub213pd %zmm2,%zmm11,%zmm0 # zmm0 = zmm0 * zmm11 - zmm2
	vfmsub213pd %zmm3,%zmm11,%zmm1 # zmm1 = zmm1 * zmm11 - zmm3

	vsubpd %zmm6,%zmm4,%zmm6 # zmm6 = zmm4 - zmm6
	vsubpd %zmm7,%zmm5,%zmm7 # zmm7 = zmm5 - zmm7

	vfmsub213pd %zmm6,%zmm11,%zmm4 # zmm4 = zmm4 * zmm11 - zmm6

	vfmsub213pd %zmm7,%zmm11,%zmm5 # zmm5 = zmm5 * zmm11 - zmm7

#--
	vsubpd %zmm4,%zmm0,%zmm9 # zmm9 = zmm0 - zmm4
	vmovapd %zmm9,(%r13,%r10)  # 01R
	vsubpd %zmm5,%zmm1,%zmm5 # zmm5 = zmm1 - zmm5
	vmovapd %zmm5,64(%r13,%r10)   # 01I

	vfmsub213pd %zmm9,%zmm11,%zmm0 # zmm0 = zmm0 * zmm11 - zmm9
	vmovapd %zmm0, (%r13,%r9)       # 00R
	vfmsub213pd %zmm5,%zmm11,%zmm1 # zmm1 = zmm1 * zmm11 - zmm5
	vmovapd %zmm1,64(%r13,%r9)       # 00I

	vsubpd %zmm7,%zmm2,%zmm9 # zmm9 = zmm2 - zmm7
	vmovapd %zmm9, (%r13,%r12)   # 11R

	vaddpd %zmm6,%zmm3,%zmm7 # zmm7 = zmm3 + zmm6
	vmovapd %zmm7,64(%r13,%r12)   # 11I

	vfmsub213pd %zmm9,%zmm11,%zmm2 # zmm2 = zmm2 * zmm11 - zmm9
	vmovapd %zmm2, (%r13,%r11)   # 10R
	vfmsub213pd %zmm7,%zmm11,%zmm3 # zmm3 = zmm3 * zmm11 - zmm7
	vmovapd %zmm3,64(%r13,%r11)   # 10I

#-----
	addq $128,%r13
	addq $128,%rbp # advance the byte counter by one unit
	cmpq %rbp,%rdi
	jne L18

	lea (%rdi,%r12),%r9 # r9 = offset of sub-block 00 of the next group
	addq $1,%rax
	cmpq %rax,%rdx
	je LE

#----------------------- case I != 0 -----------

L19:

	lea (%rdi,%r9),%r10
	lea (%rdi,%r10),%r11
	lea (%rdi,%r11),%r12


	movq %rax,%rbx
	shlq $4,%rbx # rbx = 16*I, byte offset of the first coefficient pair
	vbroadcastsd  (%r8,%rbx),%zmm12 # first double of the pair at T + 16*I
	vbroadcastsd 8(%r8,%rbx),%zmm13 # second double of the pair at T + 16*I
	shlq $1,%rbx # rbx = 32*I, byte offset of the second coefficient pair
	vbroadcastsd  (%r8,%rbx),%zmm14 # first double of the pair at T + 32*I
	vbroadcastsd 8(%r8,%rbx),%zmm15 # second double of the pair at T + 32*I

	movq %rcx,%r13
	movq %rcx,%rbp
	addq %rdi,%rbp # rbp = X + sub-block size, end value for r13

L2:
	vmovapd  (%r13,%r9),   %zmm0 # 00R
	vmovapd 64(%r13,%r9),  %zmm1 # 00I
	vmovapd  (%r13,%r11),  %zmm9 # 10R
	vmovapd 64(%r13,%r11),  %zmm3 # 10I
	vmovapd  (%r13,%r10),  %zmm4 # 01R
	vmovapd 64(%r13,%r10),  %zmm5 # 01I
	vmovapd  (%r13,%r12),  %zmm10 # 11R
	vmovapd 64(%r13,%r12),  %zmm7 # 11I


	vmovapd %zmm0,%zmm2
	vfnmadd231pd %zmm9,%zmm12,%zmm2 # zmm2 = - zmm9 * zmm12 + zmm2

	vfnmadd231pd %zmm3,%zmm13,%zmm2 # zmm2 = - zmm3 * zmm13 + zmm2

	vfnmadd213pd %zmm1,%zmm12,%zmm3 # zmm3 = - zmm3 * zmm12 + zmm1

	vfmadd231pd %zmm9,%zmm13,%zmm3 # zmm3 = zmm9 * zmm13 + zmm3

	vmovapd %zmm4,%zmm6
	vfnmadd231pd %zmm10,%zmm12,%zmm6 # zmm6 = - zmm10 * zmm12 + zmm6

	vfnmadd231pd %zmm7,%zmm13,%zmm6 # zmm6 = - zmm7 * zmm13 + zmm6

	vfnmadd213pd %zmm5,%zmm12,%zmm7 # zmm7 = - zmm7 * zmm12 + zmm5

	vfmadd231pd %zmm10,%zmm13,%zmm7 # zmm7 = zmm10 * zmm13 + zmm7

	vfmsub213pd %zmm2,%zmm11,%zmm0 # zmm0 = zmm0 * zmm11 - zmm2

	vfmsub213pd %zmm3,%zmm11,%zmm1 # zmm1 = zmm1 * zmm11 - zmm3

	vfmsub213pd %zmm6,%zmm11,%zmm4 # zmm4 = zmm4 * zmm11 - zmm6

	vfmsub213pd %zmm7,%zmm11,%zmm5 # zmm5 = zmm5 * zmm11 - zmm7

#--
	vmovapd %zmm0,%zmm9
	vfnmadd231pd %zmm4,%zmm14,%zmm9 # zmm9 = - zmm4 * zmm14 + zmm9
	vfnmadd231pd %zmm5,%zmm15,%zmm9 # zmm9 = - zmm5 * zmm15 + zmm9
	vmovapd %zmm9,(%r13,%r10)  # 01R

	vfnmadd213pd %zmm1,%zmm14,%zmm5 # zmm5 = - zmm5 * zmm14 + zmm1
	vfmadd231pd %zmm4,%zmm15,%zmm5 # zmm5 = zmm4 * zmm15 + zmm5
	vmovapd %zmm5,64(%r13,%r10)   # 01I

	vfmsub213pd %zmm9,%zmm11,%zmm0 # zmm0 = zmm0 * zmm11 - zmm9
	vmovapd %zmm0, (%r13,%r9)       # 00R
	vfmsub213pd %zmm5,%zmm11,%zmm1 # zmm1 = zmm1 * zmm11 - zmm5
	vmovapd %zmm1,64(%r13,%r9)       # 00I

	vmovapd %zmm2,%zmm9
	vfmadd231pd %zmm6,%zmm15,%zmm9 # zmm9 =  zmm6 * zmm15 + zmm9
	vfnmadd231pd %zmm7,%zmm14,%zmm9 # zmm9 =  - zmm7 * zmm14 + zmm9
	vmovapd %zmm9, (%r13,%r12)   # 11R

	vfmadd213pd %zmm3,%zmm15,%zmm7 # zmm7 = zmm7 * zmm15 + zmm3
	vfmadd231pd %zmm6,%zmm14,%zmm7 # zmm7 =  zmm6 * zmm14 + zmm7
	vmovapd %zmm7,64(%r13,%r12)   # 11I

	vfmsub213pd %zmm9,%zmm11,%zmm2 # zmm2 = zmm2 * zmm11 - zmm9
	vmovapd %zmm2, (%r13,%r11)   # 10R
	vfmsub213pd %zmm7,%zmm11,%zmm3 # zmm3 = zmm3 * zmm11 - zmm7
	vmovapd %zmm3,64(%r13,%r11)   # 10I

#-----
	addq $128,%r13
	cmpq %r13,%rbp
	jne L2

	addq $1,%rax
	lea (%rdi,%r12),%r9 # r9 = offset of sub-block 00 of the next group
	cmpq %rax,%rdx
	jne L19

LE:
# Clear the upper lanes of zmm0-zmm15 so that legacy-SSE code in the
# caller does not pay a state-transition penalty.  All vector registers
# are caller-saved under this ABI, so nothing the caller relies on is lost.
	vzeroupper

	popq %r13
	popq %r12
	popq %rbp
	popq %rbx

	ret

C2: # IEEE-754 double 2.0 (low word, high word); read once via RIP-relative load
	.long   0x00000000,0x40000000
	
