program fft_with_HPF parameter(indx = 15) parameter(nx = 2**indx,maxiters=1000) integer comm,myid,nnodes,ierr double complex,allocatable,dimension(:,:) :: f,g,sct,a,b,t integer,allocatable,dimension(:,:) :: mixup double precision time_begin,time_end double precision elapsed_time ! HPF$ processors proc(number_of_processors()) !HPF$ distribute (*,block) :: f,g,sct,a,b,t,mixup interface subroutine bitrv1(mixup,isign,indx,kxp,kblok,sct) integer,intent(inout) :: mixup(:,:) double complex,intent(inout) :: sct(:,:) !HPF$ distribute *(*,block) :: mixup,sct end subroutine subroutine permute(f,g,mixup,indx,kxp,kblok) integer,intent(inout) :: mixup(:,:) double complex,intent(inout) :: f(:,:),g(:,:) !HPF$ distribute *(*,block) :: mixup,f,g end subroutine subroutine fft(f,g,isign,mixup,sct,indx,kxp,kblok) integer,intent(inout) :: mixup(:,:) double complex,intent(inout) :: sct(:,:),f(:,:),g(:,:) !HPF$ distribute *(*,block) :: mixup,f,g,sct end subroutine end interface kblok = number_of_processors() kxp = nx/kblok isign = 1 allocate(f(kxp,kblok)) allocate(g(kxp,kblok)) allocate(sct(kxp,kblok)) allocate(mixup(kxp,kblok)) forall(i=1:kxp,k=1:kblok) g(i,k) = cmplx(99.0,-9.0) endforall !start timer call timer(time_begin) print *,elapsed_time() call bitrv1(mixup,isign,indx,kxp,kblok,sct) !do iter =1,maxiters call permute(f,g,mixup,indx,kxp,kblok) call fft(f,g,isign,mixup,sct,indx,kxp,kblok) !enddo !stop timer print *,'after fft',elapsed_time() call timer(time_end) print *,'fft of 1-d with size= ',nx,' took ' print *,time_end-time_begin,' seconds for ',maxiters,' iterations' end subroutine timer(t) integer val(8) double precision t call date_and_time(values=val) t = dble(val(8))*1d-3 + dble(val(7)) + & dble(val(6))*60d0 + dble(val(5))*3600d0 end subroutine timer double precision function elapsed_time() integer :: clock_count, clock_rate, clock_max integer, save :: last_count logical, save :: system_clock_not_yet_called = .true. call system_clock(count=clock_count,& count_rate=clock_rate, & count_max=clock_max) if (system_clock_not_yet_called) then system_clock_not_yet_called = .false. elapsed_time = 0.0 else if (clock_count > last_count) then elapsed_time = real (clock_count-last_count)/real(clock_rate) else elapsed_time = real (clock_max-last_count+clock_count)& /real(clock_rate) end if last_count = clock_count return end function elapsed_time !kblok is the number of processors !g is the input vector whose fft will be taken !kxp is the length of local data subroutine fft(f,g,isign,mixup,sct,indx,kxp,kblok) integer,intent(inout) :: mixup(:,:) double complex,intent(inout) :: sct(:,:),f(:,:),g(:,:) !HPF$ distribute *(*,block) :: mixup,f,g,sct integer lb,ub,sign,info,iarg double complex t1,tcmlx double complex global_g(kxp,kblok) nx = 2**indx nxh = nx/2 indx1 = indx !-1 kbs = nxh/kxp dnx = 6.28318530717959/float(nx) sign = isign kxp1 = nx/kblok !C --------------------------- !C inverse fourier transform !C bit-reverse array elements to temporary !C --------------------------- do l = 1, indx1 nxs = 2**(l - 1) kxs = nxs/kxp !C --------------------------- !C local calculation !C --------------------------- if (kxs.eq.0) then ! HPF$ INDEPENDENT,new(k,j,lb),ON HOME(f(j,:)) !HPF$ INDEPENDENT,new(k,j,lb) do k = 1,kblok do j = 1,kxp lb = (j - 1)/nxs if (btest(lb,0) .eqv. .FALSE.) then f(j,k) = g(j + nxs,k) else f(j,k) = g(j - nxs,k) endif enddo enddo !C --------------------------- !C perform reduction !C --------------------------- km = kxp/nxs !HPF$ INDEPENDENT,new(k,j,kb,lb,t1,iarg) do k = 1,kblok do j = 1, kxp kb = j - 1 lb = kb/nxs iarg = 1+km*(kb-nxs*lb) if (sign .eq. -1) then t1 = sct(iarg,k) else if (sign .eq.1) then tcmlx = sct(iarg,k) t1 = conjg(tcmlx ) endif if (btest(lb,0) .eqv. .FALSE.) then g(j,k) = g(j,k) + t1*f(j,k) else g(j,k) = f(j,k) - t1*g(j,k) endif enddo enddo else !C --------------------------- !C copy data !C --------------------------- !HPF$ INDEPENDENT,new(k,kb,lb,j) do k = 1, kblok kb = k - 1 lb = kb/kxs kb = kb + 1 do j = 1, kxp if (btest(lb,0) .eqv. .FALSE.) then f(j,k) = g(j,kb+kxs) else f(j,k) = g(j,kb-kxs) endif enddo enddo !C ------------------ !C perform reduction !C ------------------ km = nxh/nxs dns = dnx*float(km) !HPF$ INDEPENDENT ,new(k,kb,lb,j,arg,t1) do k = 1, kblok kb = k-1 lb = kb/kxs kb = kxp*(kb - kxs*lb) - 1 do j = 1, kxp arg = dns*float(j + kb) t1 = cmplx(cos(arg),sign*sin(arg)) if (btest(lb,0) .eqv. .FALSE.) then g(j,k) = g(j,k) + t1*f(j,k) else g(j,k) = f(j,k) - t1*g(j,k) endif enddo enddo endif enddo return end subroutine subroutine bitrv1(mixup,isign,indx,kxp,kblok,sct) integer,intent(inout) :: mixup(:,:) double complex,intent(inout) :: sct(:,:) !HPF$ distribute *(*,block) :: mixup,sct integer isign,indx,kxp,kblok integer llb,ub real arg,dnx nx = 2**indx nxh = nx/2 dnx = 6.28318530717959/float(nx) !C --------------------------------- !C prepare bit-reverse index table !C --------------------------------- !HPF$ INDEPENDENT,new(k,koff,j,lb,ll,l,jb,it) do k = 1,kblok koff = kxp*(k-1) - 1 do j = 1, kxp lb = j + koff ll = 0 do l = 1, indx jb = lb/2 it = lb - 2*jb lb = jb ll = 2*ll + it enddo mixup(j,k) = ll + 1 enddo enddo !HPF$ INDEPENDENT,new(k,j,arg) do k = 1,kblok do j = 1, kxp arg = dnx*float((nxh*(j - 1))/kxp) sct(j,k) = cmplx(cos(arg),-sin(arg)) enddo enddo return end subroutine permute(f,g,mixup,indx,kxp,kblok) integer,intent(inout) :: mixup(:,:) double complex,intent(inout) :: f(:,:),g(:,:) !HPF$ distribute *(*,block) :: mixup,f,g !C --------------------------- !C inverse fourier transform !C bit-reverse array elements to temporary !C --------------------------- !HPF$ INDEPENDENT,new(k,j,ll,kk,jj) do k=1,kblok do j = 1, kxp ll = mixup(j,k) kk = (ll - 1)/kxp + 1 jj = ll - kxp*(kk - 1) g(j,k) = f(jj,kk) enddo enddo return end