############################################################################################################
################################## Various helpers for Distributed Array tasks #############################

# Prints out info on how the given darray is distributed across the different processors
function howdist(s::DArray)
    d = procs(s)
    dn = numel(d)
    for k = 1:dn
        pid = d[k]
        st = fetch(@spawnat pid myindexes(s))
        ind = st[1]
        println("$pid - $ind ")
    end
end

# Returns the indexes of the elements of darray s that are local to the current processor,
# after data from processors (pid1 + 1 to pid2) has been moved to pid1
function getLocalDistIndexes(s::DArray, pid1, pid2)
    if (pid2 > pid1)
        i = myid()
        # If the current proc is outside the range then just return its current distribution
        if (i < pid1 || i > pid2)
            return myindexes(s)[1]
        # If the current proc is the processor involved in the redistribution, i.e. pid1
        elseif (i == pid1)
            ind2 = fetch(@spawnat pid2 myindexes(s))[1]
            retStop = ind2[numel(ind2)]
            # Get the index of the first element that already lives on the current proc
            retStart = myindexes(s)[1][1]
            return (retStart:retStop)
        end
    end
end

# Computes and returns an array of indexes that specify the blocks of contiguous data of
# the darray after data from processors (pid1 + 1 to pid2) has been moved to pid1
function newDistIndexes(s::DArray, pid1, pid2)
    if (pid2 > pid1)
        plist = procs(s)
        sp = numel(plist)
        # Find the positions of pid1 and pid2 in the processor list of s
        indexOfPid1 = 0
        indexOfPid2 = 0
        for k = 1 : sp
            if (plist[k] == pid1)
                indexOfPid1 = k
            elseif (plist[k] == pid2)
                indexOfPid2 = k
            end
        end
        offset = indexOfPid2 - indexOfPid1
        indexes = int32(zeros(sp + 1 - offset))
        topIndexes = 1
        for k = 1 : sp
            if (plist[k] <= pid1 || plist[k] > pid2)
                indexes[topIndexes] = fetch(@spawnat plist[k] myindexes(s))[1][1]
                topIndexes = topIndexes + 1
            end
        end
        indexes[topIndexes] = numel(s) + 1
        return indexes
    end
end

# Redistributes the darray s by moving all data from processors (pid1 + 1 to pid2) to processor pid1.
# Returns a new darray.
function redist(s::DArray, pid1, pid2)
    plist = procs(s)
    np = numel(plist)
    newplist = int64(zeros(np))
    toplist = 1
    for k = 1 : np
        if (plist[k] <= pid1 || plist[k] > pid2)
            newplist[toplist] = plist[k]
            toplist = toplist + 1
        end
    end
    return darray((T,lsz,da)->s[getLocalDistIndexes(s, pid1, pid2)], eltype(s), size(s), distdim(s),
                  newplist[1:toplist-1], newDistIndexes(s, pid1, pid2))
end

# Returns an array of indexes marking the blocks of contiguous data of darray s,
# given that the data has been redistributed for level l of the black box algorithm
function redistbb_localIndexes(s::DArray, np, l)
    numIndex = (np >> l) + 1
    size = 2^l
    list = zeros(Int64, numIndex)
    for i = 1 : numIndex - 1
        list[i] = fetch(@spawnat ((i - 1) * size + 1) myindexes(s))[1][1]
    end
    list[numIndex] = numel(s) + 1
    return list
end

# Returns the range of elements of darray s that are local to the current processor at
# level l (1-based) of the black box algorithm
function redistbb_localValues(s::DArray, l)
    p = myid()
    subsize = 2^l
    r = p % subsize
    if (r == 1)
        iStart = myindexes(s)[1][1]
        indexesStop = fetch(@spawnat (p + (subsize >> 1)) myindexes(s))[1]
        iStop = indexesStop[numel(indexesStop)]
        return (iStart:iStop)
    end
end

# Returns the list of processors that will contain data, given the DArray is
# redistributed for level l of the black box algorithm
function redistbb_procs(np, l)
    numprocs = np >> l
    size = 2^l
    list = zeros(Int64, numprocs)
    for p = 1 : numprocs
        list[p] = (p - 1) * size + 1
    end
    return list
end
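# Illustrative values only (not used by the code): for a hypothetical run with 8 worker
# processes holding a 16-element darray, 2 elements per process, the helpers above should
# produce the following redistribution pattern for the black box algorithm:
#
#   redistbb_procs(8, 1) -> [1, 3, 5, 7]    # level 1: every 2nd process keeps data
#   redistbb_procs(8, 2) -> [1, 5]           # level 2: every 4th process keeps data
#   redistbb_procs(8, 3) -> [1]              # level 3: all data ends up on process 1
#
#   redistbb_localIndexes(s, 8, 1) -> [1, 5, 9, 13, 17]   # 4 blocks of 4, closed by numel(s)+1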
# Redistributes the darray s by moving data in the pattern needed for the black box algorithm.
# Returns a new darray.
function redistbb(s, l)
    np = numel(procs(s))
    return darray((T,lsz,da)->s[redistbb_localValues(s,l)], eltype(s), size(s), distdim(s),
                  redistbb_procs(np, l), redistbb_localIndexes(s,np,l))
end

############################################################################################################
################################## Different implementations of Bit-Reversal ###############################

# Bit reversal lookup table for all values that fit in a byte
BRLT256 = [
    0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0,
    0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8,
    0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4,
    0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC,
    0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2,
    0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA,
    0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,
    0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE,
    0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1,
    0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9,
    0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5,
    0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD,
    0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3,
    0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
    0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7,
    0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF ]

# Reverses the bits of an n-bit number, with n being at most 32, using the lookup table
function bitr_lookup(i, n)
    bitReversed = (BRLT256[(i & 0xff) + 1] << 24) |
                  (BRLT256[((i >> 8) & 0xff) + 1] << 16) |
                  (BRLT256[((i >> 16) & 0xff) + 1] << 8) |
                  (BRLT256[((i >> 24) & 0xff) + 1])
    bitReversed = bitReversed >> (32 - n)
    return bitReversed
end

# Reverses the bits of a number i that is represented with n bits,
# using a plain loop and bit shifts
function bitr_loop(i, n)
    bitReversed = 0
    bitShifted = 0
    while (i >= 1 || bitShifted < n)
        bitReversed = (bitReversed << 1) + (i & 1)
        i = i >> 1
        bitShifted += 1
    end
    return bitReversed
end

# Permutes an array of numbers into bit-reversed order in place
function bitr(array)
    n = numel(array)
    numbits = convert(Int32, ceil(log(n)/log(2)))
    for i = 0:n-1
        bitReversed = bitr_loop(i, numbits)
        #bitReversed = bitr_lookup(i, numbits)
        if (i < bitReversed && bitReversed < n)
            temp = array[i+1]
            array[i+1] = array[bitReversed+1]
            array[bitReversed+1] = temp
        end
    end
    return array
end
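# A few worked examples of the bit-reversal helpers above (illustrative only), assuming an
# 8-element input so that numbits = 3:
#
#   bitr_loop(1, 3) -> 4      # 001 -> 100
#   bitr_loop(3, 3) -> 6      # 011 -> 110
#   bitr([1, 2, 3, 4, 5, 6, 7, 8]) -> [1, 5, 3, 7, 2, 6, 4, 8]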
############################################################################################################
############################### Different implementations of FFT and parallel FFT ##########################

# Sequential in-place radix-2 decimation-in-time FFT of array[top+1 : bot+1].
# top and bot are 0-based; the input is assumed to already be in bit-reversed order.
function fft_dit_in_place(array, top::Int64, bot::Int64)
    if (bot > top)
        half::Int64 = floor((bot + top) / 2)
        fft_dit_in_place(array, top, half)
        fft_dit_in_place(array, half + 1, bot)
        n::Int64 = bot - top + 1
        subsize::Int64 = floor(n/2)
        for k = 0 : half - top
            upper::Int64 = top + k
            lower::Int64 = upper + subsize
            upperVal = array[upper + 1]   # make sure to add 1 since Julia is 1-based
            lowerVal = exp(im * (-2 * pi * k / n)) * array[lower + 1]
            array[upper + 1] = upperVal + lowerVal
            array[lower + 1] = upperVal - lowerVal
        end
    end
end

# In-place FFT computation, modifies the distributed array.
# Random spawning: the two recursive halves run on whichever processors the scheduler picks.
function fftp_dit_darray_random_spawn(array::DArray, startIdx, endIdx)
    if (endIdx > startIdx)
        half = int64(floor((startIdx + endIdx) / 2))
        @sync begin
            @spawn fftp_dit_darray_random_spawn(array, startIdx, half)
            @spawn fftp_dit_darray_random_spawn(array, half + 1, endIdx)
        end
        n = endIdx - startIdx + 1
        for k = 0 : half - startIdx
            top = startIdx + k
            bot = top + int64(n / 2)
            topValue = array[top + 1]
            botValue = exp(im * (-2 * pi * k / n)) * array[bot + 1]
            array[top + 1] = topValue + botValue
            array[bot + 1] = topValue - botValue
        end
    end
end

# In-place FFT computation, modifies the distributed array.
# Smart spawning to make sure the processors that own the data are used the most.
function fftp_dit_darray_smart_spawn(array::DArray, startIdx, endIdx)
    if (endIdx > startIdx)
        half = int64(floor((startIdx + endIdx) / 2))
        # Find the processor that owns the first piece of data
        pTop = owner(array, startIdx)
        # Find the processor that owns the second piece of data
        pBot = owner(array, half + 1)
        @sync begin
            @spawnat pTop fftp_dit_darray_smart_spawn(array, startIdx, half)
            @spawnat pBot fftp_dit_darray_smart_spawn(array, half + 1, endIdx)
        end
        #if (pTop != pBot)
            # If the data is not yet local, bring it together before doing the computations
            #array = redist(array, pTop, pBot)
        #end
        n = endIdx - startIdx + 1
        for k = 0 : half - startIdx
            top = startIdx + k
            bot = top + int64(floor(n/2))
            topValue = array[top]
            botValue = exp(im * (-2 * pi * k / n)) * array[bot]
            array[top] = topValue + botValue
            array[bot] = topValue - botValue
        end
    end
end

# Runs the sequential FFT on the chunk of the distributed array that is local to the current processor
function fft(array::DArray)
    localIndexes = myindexes(array)[1]
    localArray = array[localIndexes]
    fft_dit_in_place(localArray, 0, numel(localArray)-1)
    array[localIndexes] = localArray
end

# Combines the two already-transformed halves of the local block (one butterfly stage)
function fftp_combine(array::DArray)
    localIndexes = myindexes(array)[1]
    n = numel(localIndexes)
    startIdx = localIndexes[1]
    endIdx = localIndexes[n]
    half = int64(floor((startIdx + endIdx) / 2))
    subsize = int64(floor(n/2))
    for k = 0 : half - startIdx
        top = startIdx + k
        bot = top + subsize
        topValue = array[top]
        botValue = exp(im * (-2 * pi * k / n)) * array[bot]
        array[top] = topValue + botValue
        array[bot] = topValue - botValue
    end
end

# Parallel FFT driver: every processor first transforms its local block, then log2(np)
# combine stages merge the blocks pairwise, redistributing the darray before each stage
function fftp_dit_darray_fftw(array::DArray)
    procDist = procs(array)
    np = numel(procDist)
    @sync begin
        for p = 1 : np
            @spawnat p fft(array)
        end
    end
    logp = int32(log(np) / log(2))
    for stage = 1 : logp
        array = redistbb(array, stage)
        numCombs = np >> stage
        subsize = 2^stage
        @sync begin
            for i = 1 : numCombs
                @spawnat ((i - 1) * subsize + 1) fftp_combine(array)
            end
        end
    end
    return array
end
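# Sketch of the driver's stage structure (descriptive comment only), assuming a hypothetical
# run on 4 worker processes with a 16-element, bit-reversed input (4 elements per process):
#
#   local pass : every process runs fft_dit_in_place on its own 4 elements (via fft)
#   stage 1    : redistbb moves data so that processes 1 and 3 each hold 8 elements,
#                then fftp_combine is spawned on processes 1 and 3
#   stage 2    : redistbb moves everything to process 1 (all 16 elements),
#                then fftp_combine runs on process 1 and produces the final result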
# Only-communicate version of fft: touches no data, used to measure communication cost only
function fft_oc(array::DArray)
    #localIndexes = myindexes(array)[1]
    #localArray = array[localIndexes]
    #array[localIndexes] = localArray
end

# Only-communicate version of fftp_combine
function fftp_combine_oc(array::DArray)
    #localIndexes = myindexes(array)[1]
end

# Only-communicate version of the parallel FFT driver
function fftp_dit_darray_fftw_oc(array::DArray)
    procDist = procs(array)
    np = numel(procDist)
    @sync begin
        for p = 1 : np
            @spawnat p fft_oc(array)
        end
    end
    logp = int32(log(np) / log(2))
    for stage = 1 : logp
        array = redistbb(array, stage)
        numCombs = np >> stage
        subsize = 2^stage
        @sync begin
            for i = 1 : numCombs
                @spawnat ((i - 1) * subsize + 1) fftp_combine_oc(array)
            end
        end
    end
    return array
end

# Entry point to compute the parallel FFT
function fftp(array)
    n = numel(array)
    if ((n & (n-1)) == 0)   # length is a power of two
        bitr(array)
        darr = distribute(array)
        return fftp_dit_darray_fftw(darr)
    else
        fft(array)
    end
end

# Only-communicate version of fftp; no actual computation is done.
# This is mainly used for measuring the latency cost of transferring data between nodes in the FFT.
function fftp_oc(array)
    n = numel(array)
    if ((n & (n-1)) == 0)
        darr = distribute(array)
        return fftp_dit_darray_fftw_oc(darr)
    end
end
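# Minimal single-process sanity check, a sketch only: fftp_seq_check is not part of the
# original pipeline; it just exercises bitr and fft_dit_in_place on a 4-point input and
# should reproduce the 4-point DFT of [1, 2, 3, 4], i.e. [10, -2+2im, -2, -2-2im].
function fftp_seq_check()
    x = [1.0 + 0im, 2.0 + 0im, 3.0 + 0im, 4.0 + 0im]
    bitr(x)                               # reorder into bit-reversed order: [1, 3, 2, 4]
    fft_dit_in_place(x, 0, numel(x) - 1)  # in-place DIT FFT over the whole array
    return x
end

# Hypothetical parallel usage (assumes Julia was started with extra worker processes,
# e.g. -p 8, and an input x whose length is a power of two):
#   X = fftp(x)       # bit-reverses x, distributes it, and returns the transformed darray
#   Y = fftp_oc(x)    # communication-only run, for measuring data-movement latency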