#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <zlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdlib.h>
#include <png.h>
#include <math.h>

// Host-side board dimensions. create_board23() sets them to
// 1 << p_board_width and 1 << p_board_height, and create_board24() uploads
// them to BOARD_WIDTH / BOARD_HEIGHT. Rows are allocated as width >> 3 bytes,
// i.e. one bit per cell.
unsigned int h_board_width = 0;
unsigned int h_board_height = 0;

// Base-2 logarithms of the board dimensions (see create_board23()).
char p_board_width = 22;
char p_board_height = 10;

// Not referenced by any code visible in this part of the file.
unsigned long long * buffer;
int png_bit_depth;
// Device-side copies of h_board_width / h_board_height. The kernels mask
// coordinates with (dimension - 1), so they assume power-of-two sizes.
__device__ unsigned int BOARD_WIDTH;
__device__ unsigned int BOARD_HEIGHT;

// Rule lookup word: run_cellauto1 uses a 5-bit neighbourhood value as the bit
// index into it. run_cellauto() refreshes it from h_ruletable before each step.
__device__ unsigned int ruletable;
extern unsigned int h_ruletable;

extern char p_board_width;
extern char p_board_height;

// Horizontal view offset; display_cellauto() uploads it to the device symbol
// offsetx. zoom is declared here but not used by the code visible in this file.
extern int h_offsetx;
//extern int h_offsety;
extern int zoom;

// Channel inversion flags read by display_cellauto(): bit 0 inverts the red
// palette word, bit 1 the green one, bit 2 the blue one.
extern int selch;
int selch;

// C-linkage entry points implemented in this file.
extern "C" void random_colors();
extern "C" void init_cellauto(int seed);
extern "C" void run_cellauto(int numSMs);
extern "C" void display_cellauto(uchar4 *dst,int imageW, int imageH,int numSMs);
//extern "C" void paint_cell(int x,int y,int c);

//extern "C" void saveF(char *filename);
//extern "C" void saveImg(char *filename);
extern "C" void init_color_palette();

//extern "C" bool openF(char *filename);
extern "C" bool create_board23();
extern "C" int get_red(int i);
extern "C" int get_green(int i);
extern "C" int get_blue(int i);
extern "C" void set_color(int i,int red,int green,int blue);

// Work counter shared by both kernels: each thread block atomically increments
// it to claim its next chunk of work. The host wrappers reset it to 0 before
// every launch.
__device__ int blockCounter = 0;
// Device copy of h_offsetx, added to the x coordinate in display_cellauto1.
__device__ int offsetx;
//__device__ int offsety;

extern unsigned int pal_blue;
extern unsigned int pal_green;
extern unsigned int pal_red;

// Mask applied to the shifted cell bits in display_cellauto1 to form the
// palette index; d_color_mask is its device copy.
extern short color_mask;
short color_mask = 1;
__device__ short d_color_mask;
// Per-channel palette words filled by random_colors(). In display_cellauto1,
// bit `a` of each word decides whether that channel is 0 or 255 for a pixel
// whose palette index is `a`.
unsigned int pal_blue;
unsigned int pal_green;
unsigned int pal_red;

// Device copies of the palette words (after the optional inversion selected by
// selch), uploaded by display_cellauto().
__device__ unsigned int dpal_blue;
__device__ unsigned int dpal_green;
__device__ unsigned int dpal_red;

// Host-side table of device row pointers, filled by create_board24() with one
// cudaMalloc'd row per entry. Its fixed capacity is 1024 rows.
unsigned short *h_bitboard[1024];
// Number of steps run so far. run_cellauto() and init_cellauto() mirror the
// low 32 bits into d_num_generations for the kernels.
extern unsigned long long num_generations;
unsigned long long num_generations = 0;
__device__ unsigned int d_num_generations = 0;

// Device-side copy of the row pointer table; kernels index it as
// bitboard[row][16-bit word].
__device__ unsigned short *bitboard[1024];
// Number of entries in the RGB palette tables below.
#define PAL_SIZE 8
// Packed R,G,B palette on the device. display_cellauto() uploads
// pal_color_byte into it; the kernel lines that read it are commented out.
__device__ unsigned char gpu_palette[PAL_SIZE*3];
// The same palette in two host layouts, both written by set_color():
// libpng png_color entries and packed R,G,B bytes.
png_color pal_color[PAL_SIZE];
unsigned char pal_color_byte[PAL_SIZE*3];
//void clear_board(int time,int numSMs);
//bool create_board23();
/*
 * Store an RGB triple in palette slot i.
 *
 * The value is written to both host-side palette tables: pal_color (png_color
 * entries) and pal_color_byte (packed R,G,B bytes, the array that
 * display_cellauto() uploads to the GPU).
 *
 * i                 palette slot, valid range 0 .. PAL_SIZE-1. Any other index
 *                   is ignored so the fixed-size tables are never written out
 *                   of bounds.
 * red, green, blue  channel values; each is narrowed to one byte when stored.
 */
void set_color(int i,int red,int green,int blue) {
  if (i < 0 || i >= PAL_SIZE) {return;}

  pal_color[i].red = red;
  pal_color[i].green = green;
  pal_color[i].blue = blue;

  // Same colour again in the packed 3-bytes-per-entry layout.
  pal_color_byte[(i*3)+0] = red;
  pal_color_byte[(i*3)+1] = green;
  pal_color_byte[(i*3)+2] = blue;
}
/*
 * Floating-point front end for set_color(): every channel is multiplied by
 * 255.0 and truncated to an int before being stored in palette slot i.
 */
void set_colorf(int i,double red,double green,double blue) {
  const double scale = 255.0;
  const int red_level   = (int) (red * scale);
  const int green_level = (int) (green * scale);
  const int blue_level  = (int) (blue * scale);
  set_color(i, red_level, green_level, blue_level);
}
// Red component of palette slot i, or 0 when i is outside 0 .. PAL_SIZE-1
// (the table has exactly PAL_SIZE entries, so other indices must not be read).
int get_red(int i) {
  if (i < 0 || i >= PAL_SIZE) {return 0;}
  return pal_color[i].red;
}
// Green component of palette slot i, or 0 when i is outside 0 .. PAL_SIZE-1
// (the table has exactly PAL_SIZE entries, so other indices must not be read).
int get_green(int i) {
  if (i < 0 || i >= PAL_SIZE) {return 0;}
  return pal_color[i].green;
}
// Blue component of palette slot i, or 0 when i is outside 0 .. PAL_SIZE-1
// (the table has exactly PAL_SIZE entries, so other indices must not be read).
int get_blue(int i) {
  if (i < 0 || i >= PAL_SIZE) {return 0;}
  return pal_color[i].blue;
}

/*
 * Fill palette slots 0..7 with the eight on/off combinations of the three
 * channels: bit 2 of the slot number switches red, bit 1 green, bit 0 blue,
 * and a channel that is "on" gets the value 255.
 */
void init_color_palette() {
  const int full = 255;
  int slot = 0;
  while (slot < 8) {
    const int red_level   = (slot & 4) ? full : 0;
    const int green_level = (slot & 2) ? full : 0;
    const int blue_level  = (slot & 1) ? full : 0;
    set_color(slot, red_level, green_level, blue_level);
    ++slot;
  }
}

/*
 * Allocate the board on the GPU using the dimensions already stored in
 * h_board_width / h_board_height.
 *
 * Steps:
 *   - reset the colour palette,
 *   - upload the dimensions to BOARD_WIDTH / BOARD_HEIGHT,
 *   - allocate and zero one device row of (width / 8) bytes per board row,
 *     keeping the pointers in h_bitboard,
 *   - upload the pointer table to the device symbol `bitboard` in one copy.
 *
 * Exactly h_board_height rows are allocated. The kernels only ever index rows
 * through `& (BOARD_HEIGHT - 1)`, so an extra row would never be used, and
 * with the maximum height it would not fit in the 1024-entry pointer tables.
 *
 * Returns true on success. Returns false when the dimensions do not fit the
 * pointer table or a row is shorter than one 16-bit word, or when any CUDA
 * call fails; rows allocated by this call are released again in that case.
 */
bool create_board24() {
  printf("h_board_width: %u \n",h_board_width);
  printf("h_board_height: %u \n",h_board_height);
  init_color_palette();

  const unsigned int max_rows = (unsigned int) (sizeof(h_bitboard) / sizeof(h_bitboard[0]));
  if (h_board_height == 0 || h_board_height > max_rows) {
    printf("create_board24: board height %u is outside 1..%u\n",h_board_height,max_rows);
    return false;
  }
  // One bit per cell; the kernels read rows as 16-bit words, so a row must
  // hold at least one of them.
  const size_t row_bytes = (size_t) (h_board_width >> 3);
  if (row_bytes < sizeof(unsigned short)) {
    printf("create_board24: board width %u is too small\n",h_board_width);
    return false;
  }

  cudaError_t e = cudaMemcpyToSymbol(BOARD_WIDTH,&h_board_width,sizeof(h_board_width),0,cudaMemcpyHostToDevice);
  if (e == cudaSuccess) {
    e = cudaMemcpyToSymbol(BOARD_HEIGHT,&h_board_height,sizeof(h_board_height),0,cudaMemcpyHostToDevice);
  }

  unsigned int allocated = 0;
  while (e == cudaSuccess && allocated < h_board_height) {
    e = cudaMalloc(&h_bitboard[allocated],row_bytes);
    if (e == cudaSuccess) {
      allocated++;
      // Rows that have not been generated yet are still read by the display
      // kernel, so start them out as all-zero cells instead of stale memory.
      e = cudaMemset(h_bitboard[allocated-1],0,row_bytes);
    }
  }

  if (e == cudaSuccess) {
    // Publish all row pointers to the device with a single copy.
    e = cudaMemcpyToSymbol(bitboard,h_bitboard,allocated*sizeof(h_bitboard[0]),0,cudaMemcpyHostToDevice);
  }

  if (e != cudaSuccess) {
    printf("create_board24: %s\n",cudaGetErrorString(e));
    for (unsigned int i = 0;i < allocated;i++) {
      cudaFree(h_bitboard[i]);
      h_bitboard[i] = NULL;
    }
    return false;
  }
  return true;
}
/*
 * Derive the board dimensions from their base-2 logarithms
 * (p_board_width / p_board_height) and allocate the board.
 *
 * The exponents are validated before they are used as shift counts:
 *   - the width exponent must be 4..31, because rows are processed in 16-bit
 *     words and the result has to fit an unsigned int;
 *   - the height exponent must be non-negative and small enough that the row
 *     count fits the fixed-size h_bitboard pointer table.
 *
 * Returns false for an invalid exponent, otherwise the result of
 * create_board24().
 */
bool create_board23() {
  //allow_run = true;
  printf("p_board_width: %d \n",p_board_width);
  printf("p_board_height: %d \n",p_board_height);

  // Work on ints so the range checks behave the same whether plain char is
  // signed or unsigned on this platform.
  const int width_log2 = p_board_width;
  const int height_log2 = p_board_height;
  const unsigned int max_rows = (unsigned int) (sizeof(h_bitboard) / sizeof(h_bitboard[0]));

  if (width_log2 < 4 || width_log2 > 31) {
    printf("create_board23: p_board_width %d is outside 4..31\n",width_log2);
    return false;
  }
  if (height_log2 < 0 || height_log2 > 31 || (1u << height_log2) > max_rows) {
    printf("create_board23: p_board_height %d does not fit %u rows\n",height_log2,max_rows);
    return false;
  }

  h_board_width = 1u << width_log2;
  h_board_height = 1u << height_log2;
  return create_board24();
}
//void random_colors64() {
  //pal_blue  = rand() << 1;pal_blue  = (pal_blue  << 32) ^ (rand() << 16) ^ rand();
  //pal_green = rand() << 1;pal_green = (pal_green << 32) ^ (rand() << 16) ^ rand();
  //pal_red   = rand() << 1;pal_red   = (pal_red   << 32) ^ (rand() << 16) ^ rand();
  //cudaMemcpyToSymbol(dpal_blue,&pal_blue,8,0,cudaMemcpyHostToDevice);
  //cudaMemcpyToSymbol(dpal_green,&pal_green,8,0,cudaMemcpyHostToDevice);
  //cudaMemcpyToSymbol(dpal_red,&pal_red,8,0,cudaMemcpyHostToDevice);
//}
/*
 * Pick new random palette words for the blue, green and red channels.
 *
 * Each word is rand() shifted left by one, so bit 0 is always clear. In
 * display_cellauto1 bit `a` of a word is the channel value for palette index
 * `a`, so index 0 gets 0 on every channel unless display_cellauto() inverts
 * the word through selch.
 *
 * The shift is done on unsigned int: rand() may return values up to INT_MAX,
 * and shifting such a value left as a signed int would overflow.
 */
void random_colors() {
  pal_blue  = ((unsigned int) rand()) << 1;
  pal_green = ((unsigned int) rand()) << 1;
  pal_red   = ((unsigned int) rand()) << 1;
}
/*
 * Restart the automaton from a random first row.
 *
 * seed  value passed to srand(); the same seed produces the same first row.
 *
 * The generation counter is reset to 0, board row 0 is filled with
 * (h_board_width / 16) random 16-bit words, and the counter is mirrored to
 * the device symbol d_num_generations.
 *
 * The row is assembled in a host buffer and uploaded with one cudaMemcpy
 * instead of one 2-byte copy per word. The words are drawn from rand() in the
 * same order as before, so the resulting row is unchanged. Failures are
 * reported on stdout.
 */
void init_cellauto(int seed) {
  srand(seed);
  //h_ruletable = (short) rand();
  //h_ruletable = (unsigned short) 0xCCCC;
  num_generations = 0;

  const size_t word_count = (size_t) (h_board_width >> 4);
  // Generation 0 lives in row (0 & (h_board_height-1)), which is row 0.
  unsigned short *device_row = h_bitboard[0];

  if (word_count > 0 && device_row != NULL) {
    unsigned short *host_row = (unsigned short *) malloc(word_count * sizeof(unsigned short));
    if (host_row == NULL) {
      printf("init_cellauto: out of host memory\n");
    } else {
      for (size_t i = 0;i < word_count;i++) {
        host_row[i] = (unsigned short) rand();
      }
      cudaError_t err = cudaMemcpy(device_row,host_row,word_count * sizeof(unsigned short),cudaMemcpyHostToDevice);
      if (err != cudaSuccess) printf("init_cellauto: row upload failed: %s\n", cudaGetErrorString(err));
      free(host_row);
    }
  }
  //cudaMemcpyToSymbol(d_rand_bits,&h_rand_bits,((sw*sh)>>3)+1,0,cudaMemcpyHostToDevice);

  const unsigned int generation = (unsigned int) (num_generations & 0xFFFFFFFF);
  cudaError_t err = cudaMemcpyToSymbol(d_num_generations,&generation,sizeof(generation),0,cudaMemcpyHostToDevice);
  if (err != cudaSuccess) printf("init_cellauto: generation upload failed: %s\n", cudaGetErrorString(err));
}
/*
 * Kernel: render the board history into an imageW x imageH pixel buffer.
 *
 * Launch layout: 1-D thread blocks, any grid size. Blocks claim chunks of
 * blockDim.x consecutive pixels by atomically incrementing blockCounter, which
 * the host must reset to 0 before the launch. Uses 4 bytes of static shared
 * memory.
 *
 * Pixel (blockX, blockY) shows board row (g - blockY) & (BOARD_HEIGHT-1),
 * where g is d_num_generations, at column
 * (blockX + offsetx + 2*(blockY - g)) & (BOARD_WIDTH-1). The bits starting at
 * that column, masked with d_color_mask, form the palette index `a`; each
 * output channel is 255 when bit `a` of the matching dpal_* word is set,
 * otherwise 0.
 *
 * Barrier discipline: the claimed chunk base is published through shared
 * memory. All threads of a block leave the loop together (the exit test uses
 * only the shared base), and a second barrier at the end of each iteration
 * keeps thread 0 from overwriting the base before every thread has read it.
 *
 * dst             device pointer to imageW*imageH uchar4 pixels.
 * imageW, imageH  image size in pixels.
 */
__global__ void display_cellauto1(uchar4 *dst,int imageW, int imageH) {
  __shared__ unsigned int blockIndex;
  const int gridWidth = imageW;
  const int numPixels = imageW*imageH;
  while(1) {
    if (threadIdx.x==0) {
      blockIndex = atomicAdd(&blockCounter, 1) * blockDim.x;
    }
    __syncthreads();
    const int chunkBase = (int) blockIndex;
    // Same value in every thread, so the whole block exits at once.
    if (chunkBase >= numPixels) break;

    const int pixel = chunkBase + (int) threadIdx.x;
    if (pixel < numPixels) {
      const int blockX = pixel % gridWidth;
      const int blockY = pixel / gridWidth;
      const int g = d_num_generations;
      const int x = (blockX+offsetx+((blockY-g)*2)) & (BOARD_WIDTH-1);
      const int y = (g-blockY) & (BOARD_HEIGHT-1);
      //int a = (bitboard[y][x >> 4] >> (x & 15)) & 1;
      const int x4 = x >> 4;
      // Join this word with its right-hand neighbour (wrapping at the row
      // end) so the bits above column x are available. Unsigned arithmetic
      // keeps the 16-bit left shift free of signed overflow.
      const unsigned int upper = bitboard[y][(x4 + 1) & ((BOARD_WIDTH-1)>>4)];
      const unsigned int lower = bitboard[y][x4];
      const unsigned int a = (((upper << 16) | lower) >> (x & 15)) & d_color_mask;

      uchar4 color;
      //color.x = gpu_palette[((a&1)*3)+0];
      //color.y = gpu_palette[((a&1)*3)+1];
      //color.z = gpu_palette[((a&1)*3)+2];
      color.x = ((dpal_red >> a) & 1) * 255;
      color.y = ((dpal_green >> a) & 1) * 255;
      color.z = ((dpal_blue >> a) & 1) * 255;
      color.w = 0;
      dst[(blockY*imageW)+blockX] = color;
    }
    // Every thread has consumed blockIndex; thread 0 may now replace it.
    __syncthreads();
  }
}
/*
 * Kernel: compute the next generation of the automaton.
 *
 * Launch layout: 1-D thread blocks, any grid size. Blocks claim chunks of
 * blockDim.x consecutive 16-bit words by atomically incrementing
 * blockCounter, which the host must reset to 0 before the launch. Uses 4
 * bytes of static shared memory.
 *
 * numBlocks  number of 16-bit words in one board row (BOARD_WIDTH / 16).
 *
 * For word bx of the current row (d_num_generations & (BOARD_HEIGHT-1)), the
 * word and its right-hand neighbour (wrapping at the row end) are joined into
 * a 32-bit window. Output bit i is the bit of `ruletable` selected by the
 * five window bits i .. i+4. The result is stored in word bx of the next row,
 * ((d_num_generations + 1) & (BOARD_HEIGHT-1)).
 *
 * Barrier discipline: the claimed chunk base is published through shared
 * memory. All threads of a block leave the loop together (the exit test uses
 * only the shared base), and a second barrier at the end of each iteration
 * keeps thread 0 from overwriting the base before every thread has read it.
 */
__global__ void run_cellauto1(int numBlocks) {
  __shared__ unsigned int blockIndex;
  while(1) {
    if (threadIdx.x==0) {
      blockIndex = atomicAdd(&blockCounter, 1) * blockDim.x;
    }
    __syncthreads();
    const int chunkBase = (int) blockIndex;
    // Same value in every thread, so the whole block exits at once.
    if (chunkBase >= numBlocks) break;

    const int bx = chunkBase + (int) threadIdx.x;
    if (bx < numBlocks) {
      const int bw = BOARD_WIDTH >> 4;
      const int bh = BOARD_HEIGHT;
      const int g0 = (d_num_generations&(bh-1));
      const int g1 = ((d_num_generations+1)&(bh-1));
      // Unsigned arithmetic keeps the 16-bit left shift free of signed
      // overflow.
      const unsigned int upper = bitboard[g0][(bx+1)&(bw-1)];
      const unsigned int window = (upper << 16) | bitboard[g0][bx];
      unsigned int next = 0;
      for (int i = 0;i < 16;i++) {
        const unsigned int b = (ruletable >> ((window >> i) & 31)) & 1;
        next = next | (b << i);
      }
      bitboard[g1][bx] = (unsigned short) next;
    }
    // Every thread has consumed blockIndex; thread 0 may now replace it.
    __syncthreads();
  }
}

// Print a diagnostic when a CUDA call made by run_cellauto() failed. The
// caller carries on afterwards, as it did before the checks were added.
static void run_cellauto_report(const char *what, cudaError_t err) {
  if (err != cudaSuccess) printf("run_cellauto: %s failed: %s\n", what, cudaGetErrorString(err));
}

/*
 * Advance the automaton by one generation.
 *
 * numSMs  number of thread blocks to launch (each has 0x380 threads).
 *
 * Uploads the current rule word and resets the device work counter, launches
 * run_cellauto1 over the h_board_width / 16 words of a row, then increments
 * num_generations and mirrors its low 32 bits to d_num_generations. Failures
 * of any CUDA call are reported on stdout; the generation counter is advanced
 * regardless.
 */
void run_cellauto(int numSMs) {
  const unsigned int hBlockCounter = 0;
  run_cellauto_report("ruletable upload",
      cudaMemcpyToSymbol(ruletable,&h_ruletable,sizeof(h_ruletable),0,cudaMemcpyHostToDevice));
  run_cellauto_report("blockCounter reset",
      cudaMemcpyToSymbol(blockCounter,&hBlockCounter,sizeof(hBlockCounter),0,cudaMemcpyHostToDevice));

  dim3 threads(0x380);
  run_cellauto1<<<numSMs,threads>>>(h_board_width>>4);

  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) printf("Error7: %s\n", cudaGetErrorString(err));

  num_generations = num_generations + 1;
  const unsigned int generation = (unsigned int) (num_generations & 0xFFFFFFFF);
  run_cellauto_report("generation upload",
      cudaMemcpyToSymbol(d_num_generations,&generation,sizeof(generation),0,cudaMemcpyHostToDevice));
}
// Print a diagnostic when a CUDA call made by display_cellauto() failed. The
// caller carries on afterwards, so one bad call does not stop the frame loop.
static void display_cellauto_report(const char *what, cudaError_t err) {
  if (err != cudaSuccess) printf("display_cellauto: %s failed: %s\n", what, cudaGetErrorString(err));
}

/*
 * Render the current board state into a device pixel buffer.
 *
 * dst             device pointer to imageW*imageH uchar4 pixels.
 * imageW, imageH  image size in pixels.
 * numSMs          number of thread blocks to launch (each has 256 threads).
 *
 * Uploads the display parameters and launches display_cellauto1:
 *   - the colour mask;
 *   - the three palette words, each bitwise-inverted when its selch bit is
 *     set (bit 0 red, bit 1 green, bit 2 blue);
 *   - the packed RGB palette;
 *   - a zeroed work counter and the horizontal offset.
 *
 * Failures of the uploads and of the launch are reported on stdout.
 */
void display_cellauto(uchar4 *dst,int imageW, int imageH,int numSMs) {
  dim3 threads(256);
  const unsigned int hBlockCounter = 0;
  unsigned int pal_blue2 = pal_blue;
  unsigned int pal_green2 = pal_green;
  unsigned int pal_red2 = pal_red;

  //if ((selch&0x1) != 0)  {pal_red2    = h_ruletable;}
  if ((selch&0x1) != 0)  {pal_red2    = ~pal_red2;}
  //if ((selch&0x4) != 0)  {pal_green2 = h_ruletable;}
  if ((selch&0x2) != 0)  {pal_green2  = ~pal_green2;}
  //if ((selch&0x10) != 0) {pal_blue2  = h_ruletable;}
  if ((selch&0x4) != 0) {pal_blue2   = ~pal_blue2;}

  display_cellauto_report("color mask upload",
      cudaMemcpyToSymbol(d_color_mask,&color_mask,sizeof(color_mask),0,cudaMemcpyHostToDevice));
  display_cellauto_report("blue palette upload",
      cudaMemcpyToSymbol(dpal_blue,&pal_blue2,sizeof(pal_blue2),0,cudaMemcpyHostToDevice));
  display_cellauto_report("green palette upload",
      cudaMemcpyToSymbol(dpal_green,&pal_green2,sizeof(pal_green2),0,cudaMemcpyHostToDevice));
  display_cellauto_report("red palette upload",
      cudaMemcpyToSymbol(dpal_red,&pal_red2,sizeof(pal_red2),0,cudaMemcpyHostToDevice));

  display_cellauto_report("RGB palette upload",
      cudaMemcpyToSymbol(gpu_palette,pal_color_byte,sizeof(pal_color_byte),0,cudaMemcpyHostToDevice));
  display_cellauto_report("blockCounter reset",
      cudaMemcpyToSymbol(blockCounter,&hBlockCounter,sizeof(hBlockCounter),0,cudaMemcpyHostToDevice));
  display_cellauto_report("offset upload",
      cudaMemcpyToSymbol(offsetx,&h_offsetx,sizeof(h_offsetx),0,cudaMemcpyHostToDevice));
  //cudaMemcpyToSymbol(offsety,&h_offsety,sizeof(int),0,cudaMemcpyHostToDevice);

  display_cellauto1<<<numSMs,threads>>>(dst,imageW,imageH);
  // A launch does not return a status; a bad configuration only shows up here.
  display_cellauto_report("kernel launch", cudaGetLastError());
}

