#define IMGX     256
#define IMGY     240
#define WIND     13
#define WINDHALF 6

#define ITERS    10

#define NUMDISP   16

C
C Test the performance of window sums, both for a single image
C and for a stack of images and compare to a PDO implementation
C

      program windowsum
C
C Benchmark driver.  Five kernels are timed in turn, each repeated
C ITERS times between a pair of barriers, and the rate
C ITERS/totaltime is printed for each one:
C   1. naive PDO window sum on a single image
C   2. prefix-scan window sum on a single image
C   3. the two prefix scans alone (no window subtraction)
C   4. naive PDO window sum on a stack of NUMDISP images
C   5. prefix-scan window sum on a stack of NUMDISP images
C Only timing is reported; the numerical results are never printed
C or checked.
C

      integer i,j,k,l,m,iter
      external timer_start, timer_stop
      real timer_stop, totaltime
C Accumulator used inside the PDO bodies.  It was previously typed
C implicitly (real); the declaration just makes that explicit.
      real s

      real errimg(IMGX,IMGY), outimg(IMGX,IMGY)
      real errimgs(NUMDISP,IMGX,IMGY), outimgs(NUMDISP,IMGX,IMGY)
C All four arrays share one 2-D template: the image axes (i,j) are
C aligned with it, the leading index k of the stacks is not.
      template terr(IMGX,IMGY)
      align errimg(i,j), outimg(i,j), errimgs(k,i,j),
     &      outimgs(k,i,j) with terr(i,j)
      distribute terr(BLOCK(8),BLOCK(8))


C First, try a naive PDO on the single image
C This is not as naive as we do it on the real version, but I want
C to keep the same distribution

      print *, "Performance test, naive PDO"

      errimg=1.0

      call fx_true_barrier()
      call timer_start()

      do iter=1,ITERS
C        print *,"Iteration ",iter
C Interior pixels only, so the full window always lies inside
C the image.  Each iteration reads its window and writes one pixel.
        pdo (i=WINDHALF+1:IMGX-WINDHALF, j=WINDHALF+1:IMGY-WINDHALF)
        pin  errimg(i-WINDHALF:i+WINDHALF,j-WINDHALF:j+WINDHALF)
        pout outimg(i,j)
        pbody
           s=0.0
           do k=i-WINDHALF, i+WINDHALF
              do l=j-WINDHALF, j+WINDHALF
                 s=s+errimg(k,l)
              enddo
           enddo
           outimg(i,j)=s
        endpdo
      enddo

      call fx_true_barrier()
      totaltime=timer_stop()

      print *, "Basic performance of naive PDO is ", 
     &         ITERS/totaltime, "Iters/second"

C
C
C Now try the scan implementation
C
C The image is scanned along one axis, then each element has the
C element WIND positions further along subtracted from it; the
C same is then done along the other axis.
C
C The subtracted section must start at WIND+1 so that both
C operands have the same extent (IMGX-WIND, resp. IMGY-WIND) and
C the offset equals the window width.  Starting it at WIND gives
C one element too many, which is a non-conforming array assignment.
C
C NOTE(review): the second argument of sum_prefix is presumably
C the axis to scan (0 and 1 here); confirm against the Fx manual.
C NOTE(review): errimg is updated in place and not reset inside
C the loop, so later iterations operate on already-scanned data.
C
      print *, "Performance test, scan version"

      errimg=1.0

      call fx_true_barrier()
      call timer_start()

      do iter=1,ITERS
C         print *,"Iteration ",iter
         errimg=sum_prefix(errimg,0,.TRUE.,.TRUE.,.TRUE.)
         errimg(1:IMGX-WIND,:)=errimg(1:IMGX-WIND,:)
     &                        -errimg(WIND+1:IMGX,:)
         errimg=sum_prefix(errimg,1,.TRUE.,.TRUE.,.TRUE.)
         errimg(:,1:IMGY-WIND)=errimg(:,1:IMGY-WIND)
     &                        -errimg(:,WIND+1:IMGY)
      enddo

      call fx_true_barrier()
      totaltime=timer_stop()


      print *,"Basic performance of scan version is ",
     &        ITERS/totaltime, "iterations/second"

C
C Same two scans as above without the window subtraction, to
C isolate the cost of the scans themselves.
C
      errimg=1.0

      print *,"Performance test, scans only"

      call fx_true_barrier()
      call timer_start()

      do iter=1,ITERS
         errimg=sum_prefix(errimg,0,.TRUE.,.TRUE.,.TRUE.)
         errimg=sum_prefix(errimg,1,.TRUE.,.TRUE.,.TRUE.)
      enddo

      call fx_true_barrier()
      totaltime=timer_stop()


      print *,"Performance of scans only is ",
     &         ITERS/totaltime, " iterations/second"


C Next, the naive PDO on the stack of NUMDISP images
C This is not as naive as we do it on the real version, but I want
C to keep the same distribution

      print *, "Performance test, naive PDO - 16 images"

      errimgs=1.0

      call fx_true_barrier()
      call timer_start()

      do iter=1,ITERS
C        print *,"Iteration ",iter
C One PDO iteration per interior pixel; the loop over the stack
C index m runs inside the body.
        pdo (i=WINDHALF+1:IMGX-WINDHALF, j=WINDHALF+1:IMGY-WINDHALF)
        pin  errimgs(:,i-WINDHALF:i+WINDHALF,j-WINDHALF:j+WINDHALF)
        pout outimgs(:,i,j)
        pbody
          do m=1,NUMDISP
           s=0.0
           do k=i-WINDHALF, i+WINDHALF
              do l=j-WINDHALF, j+WINDHALF
                 s=s+errimgs(m,k,l)
              enddo
           enddo
           outimgs(m,i,j)=s
          enddo
        endpdo
      enddo

      call fx_true_barrier()
      totaltime=timer_stop()

      print *, "Basic performance of naive PDO on 16 imgs is ", 
     &         ITERS/totaltime, "Iters/second"

C 
C Now try scan performance of 16 images together
C
C Same scheme as the single-image scan version, applied to the
C second and third axes of the stack.  As there, the subtracted
C section starts at WIND+1 so that the operands conform.
C
C NOTE(review): the last sum_prefix uses selector 0, which here
C presumably scans along the stack axis; it has no counterpart in
C the single-image version.  Confirm that it is intended.
C

      print *,"Perfomance test, scan version 16 images"

      errimgs=1.0

      call fx_true_barrier()
      call timer_start()


      do iter=1,ITERS
C         print *,"Iteration ",iter
         errimgs=sum_prefix(errimgs,1,.TRUE.,.TRUE.,.TRUE.)
         errimgs(:,1:IMGX-WIND,:)=errimgs(:,1:IMGX-WIND,:)
     &                           -errimgs(:,WIND+1:IMGX,:)
         errimgs=sum_prefix(errimgs,2,.TRUE.,.TRUE.,.TRUE.)
         errimgs(:,:,1:IMGY-WIND)=errimgs(:,:,1:IMGY-WIND)
     &                           -errimgs(:,:,WIND+1:IMGY)
         errimgs=sum_prefix(errimgs,0,.TRUE.,.TRUE.,.TRUE.)
      enddo

      call fx_true_barrier()
      totaltime=timer_stop()

      print *, "Basic performance of scan on 16 imgs is ", 
     &         ITERS/totaltime, "Iters/second"

      end

