#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <zlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdlib.h>
#include <png.h>
#include <math.h>
// Indices of the three bit-planes held in bitboard[] / h_bitboard[].
// calc_red1 derives plane RED from planes GREEN and BLUE; move_green1 then
// copies GREEN into BLUE and RED into GREEN.
#define RED 2
#define GREEN 1
#define BLUE 0

// Host copies of the board dimensions in cells. create_board23 sets them to
// 1 << p_board_width and 1 << p_board_height.
unsigned int h_board_width = 0;
unsigned int h_board_height = 0;
// Base-2 logarithms of the board dimensions.
char p_board_width = 12;
char p_board_height = 12;
// Not referenced anywhere in the visible part of this file.
unsigned long long * buffer;
// Not referenced anywhere in the visible part of this file.
int png_bit_depth;
// Device copies of the board dimensions, uploaded by create_board24.
// The kernels mask coordinates with (BOARD_WIDTH-1) / (BOARD_HEIGHT-1), so
// both values are expected to be powers of two.
__device__ unsigned int BOARD_WIDTH;
__device__ unsigned int BOARD_HEIGHT;

// Bit-packed rule table on the device: the kernels read bit (i & 7) of byte
// (i >> 3). calc_red uploads it from h_ruletable, which is defined elsewhere.
__device__ unsigned char ruletable[4096];
extern unsigned char h_ruletable[4096];

// Redundant re-declarations of the two definitions above.
extern char p_board_width;
extern char p_board_height;

// View state defined elsewhere. h_offsetx / h_offsety are copied to the device
// symbols offsetx / offsety by display_cellauto; zoom selects the zoom-in
// (zoom >= 0) or zoom-out (zoom < 0) display kernel.
extern int h_offsetx;
extern int h_offsety;
extern int zoom;

// C-linkage entry points implemented in this file.
extern "C" void init_cellauto(int seed);
extern "C" void calc_red(int numSMs);
extern "C" void move_green(int numSMs);

extern "C" void display_cellauto(uchar4 *dst,int imageW, int imageH,int numSMs);
extern "C" void paint_cell(int x,int y,int c);

// Disabled prototypes; no matching definitions are visible in this file.
//extern "C" void saveF(char *filename);
//extern "C" void saveImg(char *filename);
extern "C" void init_color_palette();

//extern "C" bool openF(char *filename);
extern "C" bool create_board23();
extern "C" int get_red(int i);
extern "C" int get_green(int i);
extern "C" int get_blue(int i);
// Host-side generation counter. It is not modified anywhere in the visible
// part of this file.
unsigned long long num_generations = 0;
// Device-side generation counter.
// NOTE(review): the only code that would upload a value to it is commented
// out in init_cellauto, so it presumably stays 0 - confirm.
__device__ unsigned int d_num_generations = 0;
//unsigned int *h_bitboard_red = NULL;
//unsigned int *h_bitboard_green = NULL;
//unsigned int *h_bitboard_blue = NULL;
// Host-side array of device pointers, one per bit-plane (indexed by
// BLUE / GREEN / RED). create_board24 allocates them with cudaMalloc.
unsigned int *h_bitboard[3] = {NULL,NULL,NULL};
// Device-side copy of the same three plane pointers, filled by create_board24.
__device__ unsigned int *bitboard[3];
// The second pair of num_generations / d_num_generations definitions that
// used to follow here was removed: defining the same variable twice in one
// translation unit is a redefinition error.

// Work counter shared by all blocks of a kernel. Each block's thread 0
// advances it with atomicAdd to claim the next chunk of work; the host
// wrappers reset it to 0 before every launch.
__device__ int blockCounter = 0;
// Device copies of h_offsetx / h_offsety, uploaded by display_cellauto.
__device__ int offsetx;
__device__ int offsety;

// Device buffer of random words, uploaded from h_rand_bits by init_cellauto.
__device__ unsigned int d_rand_bits[10240];

// Host staging buffer that init_cellauto fills with rand() values.
unsigned int h_rand_bits[10240];

// Number of palette entries (one per combination of the three plane bits).
#define PAL_SIZE 8
// NOTE(review): this table is disabled; any code that still refers to
// my_palette needs its own definition.
//unsigned int my_palette[PAL_SIZE] = {0x000000,0xFF0000,
//0x00FF00,0xFFFF00,0x0000FF,0xFF00FF,0x00FFFF,0xFFFFFF};
// Device copy of pal_color_byte (3 bytes per entry), uploaded by
// display_cellauto.
__device__ unsigned char gpu_palette[PAL_SIZE*3];
// Host palette as libpng colour entries.
png_color pal_color[PAL_SIZE];
// Host palette as packed bytes: red, green, blue for each entry (see set_color).
unsigned char pal_color_byte[PAL_SIZE*3];
// Prototype only; no definition is visible in this file.
void clear_board(int time,int numSMs);
// Repeats the extern "C" declaration made earlier in the file.
bool create_board23();
// Stores one palette entry in both host-side palette representations
// (pal_color and the packed pal_color_byte array).
//   i                - palette slot; a value outside [0, PAL_SIZE) is ignored
//                      instead of writing past the end of the arrays
//   red, green, blue - channel values; they are stored in byte-sized fields,
//                      so only the low 8 bits survive in pal_color_byte
void set_color(int i,int red,int green,int blue) {
  if (i < 0 || i >= PAL_SIZE) {return;}
  pal_color[i].red = red;
  pal_color[i].green = green;
  pal_color[i].blue = blue;
  pal_color_byte[(i*3)+0] = red;
  pal_color_byte[(i*3)+1] = green;
  pal_color_byte[(i*3)+2] = blue;
}
// Floating-point front end for set_color: each channel is multiplied by 255
// and truncated to int before being stored in palette slot i.
void set_colorf(int i,double red,double green,double blue) {
  const double scale = 255.0;
  const int r = (int) (red*scale);
  const int g = (int) (green*scale);
  const int b = (int) (blue*scale);
  set_color(i,r,g,b);
}
// Returns the red channel of palette entry i, or 0 when i is outside
// [0, PAL_SIZE) (previously an out-of-range index read past the array).
int get_red(int i) {
  if (i < 0 || i >= PAL_SIZE) {return 0;}
  return pal_color[i].red;
}
// Returns the green channel of palette entry i, or 0 when i is outside
// [0, PAL_SIZE) (previously an out-of-range index read past the array).
int get_green(int i) {
  if (i < 0 || i >= PAL_SIZE) {return 0;}
  return pal_color[i].green;
}
// Returns the blue channel of palette entry i, or 0 when i is outside
// [0, PAL_SIZE) (previously an out-of-range index read past the array).
int get_blue(int i) {
  if (i < 0 || i >= PAL_SIZE) {return 0;}
  return pal_color[i].blue;
}
// Copies up to PAL_SIZE entries from the palette p into the host palette.
//   p  - source entries; at least min(ps, PAL_SIZE) of them are read
//   ps - number of entries available in p; values above PAL_SIZE are capped
void update_pal_color(png_colorp p,int ps) {
  const int count = (ps > PAL_SIZE) ? PAL_SIZE : ps;
  for (int slot = 0; slot < count; ++slot) {
    const png_color &src = p[slot];
    set_color(slot,src.red,src.green,src.blue);
  }
}
// Loads the fixed eight-colour palette into the host palette arrays.
// Each table word is unpacked as red = bits 0-7, green = bits 8-15,
// blue = bits 16-23.
void init_color_palette3564() {
  // The file-level my_palette table is commented out, so the function carries
  // its own copy of those values; without it the function does not compile.
  static const unsigned int kPalette[PAL_SIZE] = {
    0x000000,0xFF0000,0x00FF00,0xFFFF00,
    0x0000FF,0xFF00FF,0x00FFFF,0xFFFFFF};
  for (int i = 0;i < PAL_SIZE;i++) {
    int c = kPalette[i];
    set_color(i,c & 255,(c >> 8) & 255,(c >> 16) & 255);
  }
}
// Fills the host palette with the eight full-intensity combinations of the
// three channels: bit 2 of the slot number drives red, bit 1 green and
// bit 0 blue.
void init_color_palette() {
  int slot = 0;
  while (slot < PAL_SIZE) {
    const double r = (slot >> 2) & 1;
    const double g = (slot >> 1) & 1;
    const double b = slot & 1;
    set_colorf(slot,r,g,b);
    slot++;
  }
}
// Writes the two low bits of c into bit-planes 0 and 1 at board cell (x, y).
// Coordinates wrap around the board by masking with BOARD_WIDTH-1 /
// BOARD_HEIGHT-1. Plane 2 is not touched.
// The update is a plain read-modify-write of one word per plane; the host
// wrapper paint_cell launches it with a single thread.
__global__ void paint_cell2(int x,int y,int c) {
  const int wordsPerRow = (BOARD_WIDTH>>5);
  x = x & (BOARD_WIDTH-1);
  y = y & (BOARD_HEIGHT-1);
  const int word = (wordsPerRow*y)+(x >> 5);  // 32 cells per word
  const int bit = x & 31;
  int plane = 0;
  while (plane < 2) {
    int current = bitboard[plane][word];
    int cleared = current & ~(1 << bit);
    int wanted = (c >> plane) & 1;
    bitboard[plane][word] = cleared | (wanted << bit);
    ++plane;
  }
}
// Paints colour c at the board cell corresponding to view position (x, y).
// The view position is scaled by the current zoom (shifted right for
// zoom > 0, shifted left by -zoom otherwise) and added to the view offset
// h_offsetx / h_offsety before being handed to the paint_cell2 kernel.
// NOTE(review): the display kernels also subtract a centre offset (cx, cy)
// derived from the image size; this function does not - confirm the two
// coordinate mappings are meant to differ.
void paint_cell(int x,int y,int c) {
  int x2 = h_offsetx;
  int y2 = h_offsety;
  if (zoom > 0) {
    x2 = x2 + (x>>zoom);
    y2 = y2 + (y>>zoom);
  } else {
    x2 = x2 + (x<<-zoom);
    y2 = y2 + (y<<-zoom);
  }
  paint_cell2<<<1,1>>>(x2,y2,c);
  // Report launch failures the same way the calc_red / move_green wrappers do;
  // a failed launch was previously silent.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(err));
}
// Frees the first `count` bit-planes and clears their host pointers.
static void release_board_planes(int count) {
  for (int i = 0;i < count;i++) {
    cudaFree(h_bitboard[i]);
    h_bitboard[i] = NULL;
  }
}

// Allocates and zeroes the three device bit-planes for a board of
// h_board_width x h_board_height cells (one bit per cell per plane), uploads
// the board dimensions and the plane pointers to the device, and resets the
// host palette.
// Returns true on success. On any CUDA failure it prints the error, releases
// the planes allocated by this call and returns false.
bool create_board24() {
  printf("h_board_width: %u \n",h_board_width);
  printf("h_board_height: %u \n",h_board_height);

  init_color_palette();
  cudaError_t e;
  e = cudaMemcpyToSymbol(BOARD_WIDTH,&h_board_width,sizeof(h_board_width),0,cudaMemcpyHostToDevice);
  if (e == cudaSuccess) {
    e = cudaMemcpyToSymbol(BOARD_HEIGHT,&h_board_height,sizeof(h_board_height),0,cudaMemcpyHostToDevice);
  }
  if (e != cudaSuccess) {
    printf("Error: %s\n", cudaGetErrorString(e));
    return false;
  }

  // Bytes per plane, computed in 64 bits so large boards do not overflow.
  long long s = h_board_width;
  s = (s*h_board_height)>>3;
  int s2 = (int) (s>>20);

  for (int i = 0;i < 3;i++) {
    printf("%d MB \n",s2);
    e = cudaMalloc(&h_bitboard[i],s);
    if (e != cudaSuccess) {
      // Any allocation error counts as failure, not only out-of-memory.
      printf("Error: %s\n", cudaGetErrorString(e));
      h_bitboard[i] = NULL;
      release_board_planes(i);
      return false;
    }
    e = cudaMemset(h_bitboard[i],0,s);
    if (e == cudaSuccess) {
      // Store the plane pointer in slot i of the device-side bitboard[] array.
      e = cudaMemcpyToSymbol(bitboard,&h_bitboard[i],sizeof(h_bitboard[i]),
                             i*sizeof(h_bitboard[i]),cudaMemcpyHostToDevice);
    }
    if (e != cudaSuccess) {
      printf("Error: %s\n", cudaGetErrorString(e));
      release_board_planes(i+1);
      return false;
    }
  }
  return true;
}
// Derives the board size from the exponents p_board_width / p_board_height
// (width = 2^p_board_width, height = 2^p_board_height) and creates the board.
// Returns false when an exponent cannot be used as a 32-bit shift count or
// when create_board24 fails.
bool create_board23() {
  //allow_run = true;
  printf("p_board_width: %d \n",p_board_width);
  printf("p_board_height: %d \n",p_board_height);
  // Shifting a 32-bit value by a negative count or by 32 or more is undefined.
  if (p_board_width < 0 || p_board_width > 31 ||
      p_board_height < 0 || p_board_height > 31) {
    return false;
  }
  h_board_width = 1u << p_board_width;
  h_board_height = 1u << p_board_height;
  return create_board24();
}

// Resets the automaton: seeds the C random generator, moves the view offset
// to (-8, -8), clears the bit-planes and seeds an 8x8 region of each plane
// from random bits via the init_board2 kernel.
//   seed - value passed to srand()
// NOTE(review): h_numStateBits and init_board2 are not declared in the visible
// part of this file; they are used here exactly as the original code used
// them. What init_board2 does with (plane, sw, sh) cannot be confirmed here.
void init_cellauto(int seed) {
  printf("init_cellauto %d \n",RAND_MAX);
  //if (allow_run == false) {return;}
  srand(seed);
  int sw = 8;//8*8*2;
  int sh = 8;//8*8*2;
  h_offsetx = -8;
  h_offsety = -8;
  // Bytes per plane, computed in 64 bits like create_board24 does; the former
  // 32-bit product wrapped around for large boards.
  const size_t planeBytes =
      (size_t) ((((unsigned long long) h_board_width) * h_board_height) >> 3);
  // h_bitboard has exactly three planes, so never index past plane 2.
  for (int i = 0;i <= h_numStateBits && i < 3;i++) {
    cudaMemset(h_bitboard[i],0,planeBytes);
  }
  for (int c = 0;c <= h_numStateBits && c < 3;c++) {
    for (int i = 0;i <= ((sw*sh)>>5);i++) {
      h_rand_bits[i] = rand();
    }

    cudaMemcpyToSymbol(d_rand_bits,&h_rand_bits,((sw*sh)>>3)+1,0,cudaMemcpyHostToDevice);

    init_board2<<<1,1>>>(c,sw,sh);
  }
  // Surface any failure from the calls above; they were previously unchecked.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(err));
  //num_generations = 0;
  //int time = (int) (num_generations & 0xFFFFFFFF);
  //cudaMemcpyToSymbol(d_num_generations,&time, sizeof(unsigned int), 0,   cudaMemcpyHostToDevice );

}

// Renders the board into dst with each cell magnified to a
// (1 << psize) x (1 << psize) pixel square.
//   dst    - output image, imageW*imageH uchar4 pixels, row-major
//   imageW - image width in pixels
//   imageH - image height in pixels
//   psize  - log2 of the magnification factor
// Launch layout: 1-D blocks (only threadIdx.x is used), any number of blocks.
// Blocks claim chunks of blockDim.x consecutive pixels through the global
// blockCounter, which the host must reset to 0 before the launch.
// Each pixel's colour is the gpu_palette entry selected by the three plane
// bits of the cell under it; the view is centred on (offsetx, offsety) and
// wraps around the board edges.
__global__ void display_cellauto_zoom_in(uchar4 *dst,int imageW, int imageH,int psize) {
  __shared__ unsigned int blockIndex;
  int gridWidth = imageW;
  int numBlocks = imageW*imageH;  // total number of pixels
  int cx = (imageW >> psize) / 2;
  int cy = (imageH >> psize) / 2;
  int bw6 = (BOARD_WIDTH>>5);     // 32-bit words per board row

  while(1) {
    if (threadIdx.x==0) {
      blockIndex = atomicAdd(&blockCounter, 1);
      blockIndex = blockIndex * blockDim.x;
    }
    __syncthreads();
    int chunkStart = blockIndex;
    // Second barrier: every thread has copied blockIndex before thread 0 can
    // overwrite it in the next iteration.
    __syncthreads();
    // The exit test uses a block-uniform value so that all threads leave the
    // loop together and no barrier is executed by only part of the block.
    if (chunkStart >= numBlocks) break;
    int blockIndex2 = chunkStart + threadIdx.x;
    if (blockIndex2 < numBlocks) {
      int blockX = blockIndex2 % (gridWidth);
      int blockY = blockIndex2 / (gridWidth);
      int x = ((blockX >> psize)+(offsetx)-cx) & (BOARD_WIDTH-1);
      int y = ((blockY >> psize)+(offsety)-cy) & (BOARD_HEIGHT-1);
      int i = (x >> 5) + (y*bw6);
      // Gather the cell's bit from each plane into a 3-bit palette index.
      int a = 0;
      for (int c = 0;c < 3;c++) {
        a = a | (((bitboard[c][i] >> (x & 31))&1) << c);
      }
      uchar4 color;
      color.x = gpu_palette[(a*3)+0];
      color.y = gpu_palette[(a*3)+1];
      color.z = gpu_palette[(a*3)+2];
      color.w = 0;
      dst[(blockY*imageW)+blockX] = color;
    }
  }
}
// Renders the board into dst with each pixel covering a
// (1 << psize) x (1 << psize) square of cells; the pixel colour is the
// average of the palette colours of those cells.
//   dst    - output image, imageW*imageH uchar4 pixels, row-major
//   imageW - image width in pixels
//   imageH - image height in pixels
//   psize  - log2 of the number of cells per pixel along each axis
// Launch layout: 1-D blocks (only threadIdx.x is used), any number of blocks.
// Blocks claim chunks of blockDim.x consecutive pixels through the global
// blockCounter, which the host must reset to 0 before the launch.
__global__ void display_cellauto_zoom_out(uchar4 *dst,int imageW, int imageH,int psize) {
  __shared__ unsigned int blockIndex;
  int gridWidth = imageW;
  int numBlocks = imageW*imageH;  // total number of pixels
  //int bs = (BOARD_WIDTH*BOARD_HEIGHT)>>6;
  int cx = (imageW << psize) / 2;
  int cy = (imageH << psize) / 2;
  int s = 1 << psize;
  int bw6 = (BOARD_WIDTH>>5);     // 32-bit words per board row
  int ps2 = psize*2;              // log2 of the number of cells per pixel
  while(1) {
    if (threadIdx.x==0) {
      blockIndex = atomicAdd(&blockCounter, 1);
      blockIndex = blockIndex * blockDim.x;
    }
    __syncthreads();
    int chunkStart = blockIndex;
    // Second barrier: every thread has copied blockIndex before thread 0 can
    // overwrite it in the next iteration.
    __syncthreads();
    // Block-uniform exit so no barrier is reached by only part of the block.
    if (chunkStart >= numBlocks) break;
    int blockIndex2 = chunkStart + threadIdx.x;
    if (blockIndex2 < numBlocks) {
      int blockX = blockIndex2 % (gridWidth);
      int blockY = blockIndex2 / (gridWidth);
      int x = ((blockX << psize)+(offsetx)-cx) & (BOARD_WIDTH-1);
      int y = ((blockY << psize)+(offsety)-cy) & (BOARD_HEIGHT-1);
      int red = 0;
      int green = 0;
      int blue = 0;
      for (int x3 = 0;x3 < s;x3++) {
        for (int y3 = 0;y3 < s;y3++) {
          int x2 = (x+x3) & (BOARD_WIDTH-1);
          int y2 = (y+y3) & (BOARD_HEIGHT-1);

          int i = (x2 >> 5) + (y2*bw6);
          // Palette index of this cell. It is rebuilt for every cell, and the
          // bit position comes from x2 (the sampled cell), matching the word
          // index; the old code used the tile origin x and an undeclared
          // accumulator.
          int a = 0;
          for (int c = 0;c < 3;c++) {
            a = a | (((bitboard[c][i] >> (x2 & 31))&1) << c);
          }
          red = red + gpu_palette[(a*3)+0];
          green = green + gpu_palette[(a*3)+1];
          blue = blue + gpu_palette[(a*3)+2];
        }
      }
      // Divide the sums by the s*s cells that were accumulated.
      uchar4 color;
      color.x = red>>ps2;color.y = green>>ps2;
      color.z = blue>>ps2;color.w = 0;
      dst[(blockY*imageW)+blockX] = color;
    }
  }
}

// Shifts the planes by one generation: for every board word, plane BLUE
// receives plane GREEN and plane GREEN receives plane RED.
//   gridWidth - number of 32-bit words per board row
//   numBlocks - total number of board words to process
// Launch layout: 1-D blocks (only threadIdx.x is used), any number of blocks.
// Blocks claim chunks of blockDim.x consecutive words through the global
// blockCounter, which the host must reset to 0 before the launch.
__global__ void move_green1(int gridWidth,int numBlocks) {
  __shared__ unsigned int blockIndex;
  int bh = BOARD_HEIGHT;
  while(1) {
    if (threadIdx.x==0) {
      blockIndex = atomicAdd(&blockCounter, 1);
      blockIndex = blockIndex * blockDim.x;
    }
    __syncthreads();
    int chunkStart = blockIndex;
    // Second barrier: every thread has copied blockIndex before thread 0 can
    // overwrite it in the next iteration.
    __syncthreads();
    // Block-uniform exit so no barrier is reached by only part of the block.
    if (chunkStart >= numBlocks) break;
    int blockIndex2 = chunkStart + threadIdx.x;
    if (blockIndex2 < numBlocks) {
      int gw = gridWidth;
      int bx = blockIndex2 % gridWidth;
      int by = blockIndex2 / gridWidth;
      int i43 = bx+((by&(bh-1))*gw);
      bitboard[BLUE][i43] = bitboard[GREEN][i43];
      bitboard[GREEN][i43] = bitboard[RED][i43];
    }
  }
}

// Computes plane RED from planes GREEN and BLUE, one 32-cell word per thread
// per chunk.
// For each cell a 10-bit rule index is built: bits 0-8 are the 3x3
// neighbourhood taken from plane GREEN (three bits from the row at by+1, then
// the row at by, then the row at by-1), and bit 9 is the cell's own bit in
// plane BLUE. The new bit is bit (index & 7) of ruletable[index >> 3].
// Rows and word columns wrap around by masking, so gridWidth and BOARD_HEIGHT
// are expected to be powers of two.
//   gridWidth - number of 32-bit words per board row
//   numBlocks - total number of board words to process
// Launch layout: 1-D blocks (only threadIdx.x is used), any number of blocks.
// Blocks claim chunks of blockDim.x consecutive words through the global
// blockCounter, which the host must reset to 0 before the launch.
__global__ void calc_red1(int gridWidth,int numBlocks) {
  __shared__ unsigned int blockIndex;
  int bh = BOARD_HEIGHT;
  while(1) {
    if (threadIdx.x==0) {
      blockIndex = atomicAdd(&blockCounter, 1);
      blockIndex = blockIndex * blockDim.x;
    }
    __syncthreads();
    int chunkStart = blockIndex;
    // Second barrier: every thread has copied blockIndex before thread 0 can
    // overwrite it in the next iteration.
    __syncthreads();
    // Block-uniform exit so no barrier is reached by only part of the block.
    if (chunkStart >= numBlocks) break;
    int blockIndex2 = chunkStart + threadIdx.x;
    if (blockIndex2 < numBlocks) {
      int gw = gridWidth;
      int bx = blockIndex2 % gridWidth;
      int by = blockIndex2 / gridWidth;

      // Word offsets of the three rows involved, and the wrapped columns of
      // the neighbouring words.
      int rowNext = ((by+1)&(bh-1))*gw;
      int rowCur  = (by&(bh-1))*gw;
      int rowPrev = ((by-1)&(bh-1))*gw;
      int left  = (bx-1)&(gw-1);
      int right = (bx+1)&(gw-1);

      unsigned int r2 = bitboard[GREEN][bx+rowNext];
      unsigned int r1 = bitboard[GREEN][bx+rowCur];
      unsigned int r0 = bitboard[GREEN][bx+rowPrev];

      int i43 = bx+rowCur;
      unsigned int prev = bitboard[BLUE][i43];
      int i = 0;
      unsigned int yb;       // rule results for the current pass
      unsigned int out = 0;  // assembled RED word, stored once at the end

      // Cell 0: its left-hand neighbours are bit 31 of the word to the left.
      i =     (r2&3);
      i = i | ((r1&3)<<3);
      i = i | ((r0&3)<<6);
      i = i << 1;
      i = i | ((bitboard[GREEN][left+rowNext])>>31);
      i = i | (((bitboard[GREEN][left+rowCur])>>31)<<3);
      i = i | (((bitboard[GREEN][left+rowPrev])>>31)<<6);
      i = i | ((prev&1)<<9);

      yb = ((ruletable[i>>3]>>(i&7))&1);

      // Even cells 2..30; each result is placed at its own bit position.
      for (int j = 2;j < 31;j = j + 2) {
        i =     ((r2 >> (j-1))&7);
        i = i | (((r1 >> (j-1))&7)<<3);
        i = i | (((r0 >> (j-1))&7)<<6);

        i = i | (((prev >> j)&1)<<9);
        yb = yb | (((ruletable[i>>3]>>(i&7))&1) << j);

      }
      out = out | (yb & 0x55555555);

      yb = 0;

      // Cell 31: its right-hand neighbours are bit 0 of the word to the right.
      i =     (r2>>30);
      i = i | ((r1>>30)<<3);
      i = i | ((r0>>30)<<6);

      i = i | (((bitboard[GREEN][right+rowNext])&1)<<2);
      i = i | (((bitboard[GREEN][right+rowCur])&1)<<5);
      i = i | (((bitboard[GREEN][right+rowPrev])&1)<<8);
      i = i | (((prev >> 31)&1)<<9);

      yb = (((ruletable[i>>3]>>(i&7))&1) << 30);

      // Odd cells 1..29; each result is stored one position low (at j-1) and
      // the whole pass is shifted left by one when merged below.
      for (int j = 1;j < 31;j = j + 2) {
        i =     ((r2 >> (j-1))&7);
        i = i | (((r1 >> (j-1))&7)<<3);
        i = i | (((r0 >> (j-1))&7)<<6);
        i = i | (((prev >> j)&1)<<9);
        yb = yb | (((ruletable[i>>3]>>(i&7))&1) << (j-1));
      }

      out = out | ((yb & 0x55555555) << 1);

      bitboard[RED][i43] = out;
    }
  }
}

// Host wrapper that computes plane RED for the whole board.
// Uploads the rule table, resets the shared work counter and launches the
// calc_red1 kernel with numSMs blocks of 0x300 (768) threads.
//   numSMs - number of thread blocks to launch
// Launch errors are printed to stdout.
void calc_red(int numSMs) {
  unsigned int hBlockCounter = 0;
  // Number of 32-bit words on the board, computed in 64 bits so that large
  // boards do not wrap around before the shift.
  unsigned long long numWords =
      (((unsigned long long) h_board_width) * h_board_height) >> 5;
  if (numWords > 0x7FFFFFFFULL) {
    // The kernel takes the word count as an int.
    printf("Error: %s\n", "board too large for calc_red");
    return;
  }

  cudaMemcpyToSymbol(ruletable,&h_ruletable,sizeof(h_ruletable),0,cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(blockCounter, &hBlockCounter, sizeof(unsigned int), 0,   cudaMemcpyHostToDevice );

  dim3 threads(0x300);
  // Launch the kernel calc_red1; the old code named this host function itself
  // in the launch and closed the function body before the error check.
  calc_red1<<<numSMs,threads>>>(h_board_width>>5,(int) numWords);

  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(err));
}

// Host wrapper that advances the planes by one generation (GREEN -> BLUE,
// RED -> GREEN) for the whole board.
// Resets the shared work counter and launches the move_green1 kernel with
// numSMs blocks of 0x300 (768) threads.
//   numSMs - number of thread blocks to launch
// Launch errors are printed to stdout.
void move_green(int numSMs) {
  unsigned int hBlockCounter = 0;
  // Number of 32-bit words on the board, computed in 64 bits so that large
  // boards do not wrap around before the shift.
  unsigned long long numWords =
      (((unsigned long long) h_board_width) * h_board_height) >> 5;
  if (numWords > 0x7FFFFFFFULL) {
    // The kernel takes the word count as an int.
    printf("Error: %s\n", "board too large for move_green");
    return;
  }

  //cudaMemcpyToSymbol(ruletable,&h_ruletable,4096,0,cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(blockCounter, &hBlockCounter, sizeof(unsigned int), 0,   cudaMemcpyHostToDevice );

  dim3 threads(0x300);
  // Launch the kernel move_green1; the old code named this host function
  // itself in the launch and closed the function body before the error check.
  move_green1<<<numSMs,threads>>>(h_board_width>>5,(int) numWords);

  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(err));
}
// Host wrapper that renders the board into an image.
// Uploads the packed palette and the view offset, resets the shared work
// counter, then launches the zoom-out kernel when zoom is negative and the
// zoom-in kernel otherwise, using numSMs blocks of 256 threads.
//   dst    - output image of imageW*imageH uchar4 pixels; it is passed
//            straight to the kernels, so it must be a device-accessible pointer
//   imageW - image width in pixels
//   imageH - image height in pixels
//   numSMs - number of thread blocks to launch
// Launch errors are printed to stdout.
void display_cellauto(uchar4 *dst,int imageW, int imageH,int numSMs) {
  dim3 threads(256);
  unsigned int hBlockCounter = 0;
  cudaMemcpyToSymbol(gpu_palette,&pal_color_byte,sizeof(pal_color_byte),0,cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(blockCounter, &hBlockCounter, sizeof(unsigned int), 0,   cudaMemcpyHostToDevice );
  cudaMemcpyToSymbol(offsetx,&h_offsetx,sizeof(int),0,cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(offsety,&h_offsety,sizeof(int),0,cudaMemcpyHostToDevice);
  if (zoom < 0) {
    display_cellauto_zoom_out<<<numSMs,threads>>>(dst,imageW,imageH,-zoom);
  } else {
    display_cellauto_zoom_in<<<numSMs,threads>>>(dst,imageW,imageH,zoom);
  }
  // Report failures the same way the calc_red / move_green wrappers do;
  // a failed upload or launch was previously silent.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(err));
}

