SREERAMI: How can I improve the performance of this huge nested loop? (Fortran 90)

How can I improve the performance of this huge nested loop? (Fortran 90)

I'll post the whole code segment here, but the only issue really is the
nested loop at the end. All read-in matrices are of dimension 180x180 and
the loop is unbearably slow. I don't see an easy way to simplify the
calculation, since the index-wise multiplications to get the matrix
"AnaInt" are no simple matrix products due to the threefold occurance of
the indices. Any thoughts? Thanks!
program AC
implicit none
integer, parameter :: dp = selected_real_kind(15, 307)
integer :: n, ndim, k, j, i, o, l, m, steps
real(dp) :: emax, omega, pi, EFermi, auev
complex(dp) :: Grs,Gas, ACCond, tinyc, cunit, czero, cone
complex(dp), allocatable :: GammaL(:,:)
complex(dp), allocatable :: GammaL_EB(:,:)
complex(dp), allocatable :: GammaR(:,:)
complex(dp), allocatable :: R(:,:)
complex(dp), allocatable :: Yc(:,:)
complex(dp), allocatable :: Yd(:,:)
complex(dp), allocatable :: AnaInt(:,:)
complex(dp), allocatable :: H(:,:)
complex(dp), allocatable :: HamEff(:,:)
complex(dp), allocatable :: EigVec(:,:)
complex(dp), allocatable :: InvEigVec(:,:)
complex(dp), allocatable :: EigVal(:)
complex(dp), allocatable :: ctemp(:,:)
complex(dp), allocatable :: ctemp2(:,:)
complex(dp), allocatable :: S(:,:)
complex(dp), allocatable :: SelfL(:,:)
complex(dp), allocatable :: SelfR(:,:)
complex(dp), allocatable :: SHalf(:,:)
complex(dp), allocatable :: InvSHalf(:,:)
complex(dp), allocatable :: HEB(:,:)
complex(dp), allocatable :: Integrand(:,:)
!Lapack arrays and variables
integer :: info, lwork
complex(dp), allocatable :: work(:)
real(dp), allocatable :: rwork(:)
integer,allocatable :: ipiv(:)
!########################################################################
!Constants
auev = 27.211385
pi = 3.14159265359
cunit = (0,1)
czero = (0,0)
cone = (1,0)
tinyc = (0.0, 0.000000000001)
!System and calculation parameters
open(unit=123, file="ForAC.dat", action='read', form='formatted')
read(123,*) ndim, EFermi
lwork = ndim*ndim
emax = 5.0/auev
steps = 1000
allocate(HEB(ndim,ndim))
allocate(H(ndim,ndim))
allocate(Yc(ndim,ndim))
allocate(Yd(ndim,ndim))
allocate(S(ndim,ndim))
allocate(SelfL(ndim,ndim))
allocate(SelfR(ndim,ndim))
allocate(HamEff(ndim,ndim))
allocate(GammaR(ndim,ndim))
allocate(GammaL(ndim,ndim))
allocate(AnaInt(ndim,ndim))
allocate(EigVec(ndim,ndim))
allocate(EigVal(ndim))
allocate(InvEigVec(ndim,ndim))
allocate(R(ndim,ndim))
allocate(GammaL_EB(ndim,ndim))
allocate(Integrand(ndim,ndim))
!################################################
read(123,*) H, S, SelfL, SelfR
close(unit=123)
HamEff(:,:)=(H(:,:) + SelfL(:,:) + SelfR(:,:))
allocate(SHalf(ndim, ndim))
allocate(InvSHalf(ndim,ndim))
SHalf(:,:) = (cmplx(real(S(:,:),dp),0.0_dp,dp))
call zpotrf('l', ndim, SHalf, ndim, info)
InvSHalf(:,:) = SHalf(:,:)
call ztrtri('l', 'n', ndim, InvSHalf, ndim, info)
call ztrmm('l', 'l', 'n', 'n', ndim, ndim, cone, InvSHalf, ndim,
HamEff, ndim)
call ztrmm('r', 'l', 't', 'n', ndim, ndim, cone, InvSHalf, ndim,
HamEff, ndim)
call ztrmm('l', 'l', 'n', 'n', ndim, ndim, cone, InvSHalf, ndim,
GammaL, ndim)
call ztrmm('r', 'l', 't', 'n', ndim, ndim, cone, InvSHalf, ndim,
GammaL, ndim)
call ztrmm('l', 'l', 'n', 'n', ndim, ndim, cone, InvSHalf, ndim,
GammaR, ndim)
call ztrmm('r', 'l', 't', 'n', ndim, ndim, cone, InvSHalf, ndim,
GammaR, ndim)
deallocate(SHalf)
deallocate(InvSHalf)
!In the PDF: B = EigVec, B^(-1) = InvEigVec, Hk = EigVal
allocate(ctemp(ndim,ndim))
ctemp(:,:) = HamEff(:,:)
allocate(work(lwork),rwork(2*ndim))
call zgeev('N', 'V', ndim, ctemp, ndim, EigVal, InvEigVec, ndim,
EigVec, ndim, work, lwork, rwork, info)
if(info/=0)write(*,*) "Warning: zgeev info=", info
deallocate(work,rwork)
deallocate(ctemp)
InvEigVec(:,:)=EigVec(:,:)
lwork = 3*ndim
allocate(ipiv(ndim))
allocate(work(lwork))
call zgetrf(ndim,ndim,InvEigVec,ndim,ipiv,info)
if(info/=0)write(*,*) "Warning: zgetrf info=", info ! LU decomposition
call zgetri(ndim,InvEigVec,ndim,ipiv,work,lwork,info)
if(info/=0)write(*,*) "Warning: zgetri info=", info ! Inversion by LU
decomposition (Building of InvEigVec)
deallocate(work)
deallocate(ipiv)
R(:,:) = 0.0_dp
do j=1,ndim
do m=1,ndim
do k=1,ndim
do l=1,ndim
R(j,m) = R(j,m) + InvEigVec(j,k) * GammaR(k,l) * conjg(InvEigVec(m,l))
end do
end do
end do
end do
!!!THIS IS THE LOOP IN QUESTION. MATRIX DIMENSION 180x180, STEPS=1000
open(unit=125,file="ACCond.dat")
!Looping over omega
do o=1,steps
omega=real(o,dp)*emax/real(steps,dp)
AnaInt(:,:) = 0.0_dp
do i=1,ndim
do n=1,ndim
do j=1,ndim
do m=1,ndim
Grs =
log((EFermi-(EigVal(j)+tinyc)+omega)/(EFermi-(EigVal(j)+tinyc)))
Gas =
log((EFermi-conjg(EigVal(m)+tinyc))/(EFermi-omega-conjg(EigVal(m)+tinyc)))
Integrand =
(Grs-Gas)/(EigVal(j)-tinyc-omega-conjg(EigVal(m)-tinyc))
AnaInt(i,n)= AnaInt(i,n) + EigVec(i,j) * R(j,m)
* Integrand(j,m) * conjg(EigVec(n,m))
end do
end do
end do
end do
Yc = 1/(2.0*pi*omega) * matmul(AnaInt,GammaL)
Yd(:,:) = - 1/(2.0*pi) * cunit * AnaInt(:,:)
ACCond = czero
do k=1,ndim
ACCond=ACCond+Yc(k,k) + 1/(2.0) * Yd(k,k)
end do
write(125,*) omega, real(ACCond,dp), aimag(ACCond)
end do
!#############################################
deallocate(Integrand)
deallocate(HEB)
deallocate(Yc)
deallocate(Yd)
deallocate(HamEff)
deallocate(GammaR)
deallocate(GammaL)
deallocate(AnaInt)
deallocate(EigVec)
deallocate(EigVal)
deallocate(InvEigVec)
deallocate(H)
deallocate(S)
deallocate(SelfL)
deallocate(SelfR)
deallocate(R)
deallocate(GammaL_EB)
end program AC

SREERAMI

Thursday, 12 September 2013

How can I improve the performance of this huge nested loop? (Fortran 90)

No comments:

Post a Comment