#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <zlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdlib.h>
#include <png.h>
#include <math.h>
//#include "cutil_inline.h"

// Host-side board size in cells. Set from p_board_* in create_board23() or
// from the image size in read_png_file().
unsigned int h_board_width = 0;
unsigned int h_board_height = 0;
// log2 of the board size used by create_board23() (1 << p_board_*).
char p_board_width = 12;
char p_board_height = 12;
// Scratch row buffer (one bit per cell) allocated and freed by the PNG
// load/save routines; read_from_row()/write_to_row() use it.
unsigned long long * buffer;
// Bit depth of the PNG being loaded; set in read_png_file().
int png_bit_depth;
// Device copies of the board size, uploaded by create_board24().
__device__ unsigned int BOARD_WIDTH;
__device__ unsigned int BOARD_HEIGHT;

// Transition lookup table: device copy (uploaded in run_cellauto()) and the
// host table, which is defined in another translation unit.
__device__ unsigned char ruletable[4096];
extern unsigned char h_ruletable[4096];

//__device__ unsigned long long ruletableb[8];
//extern unsigned long long h_ruletableb[8];

//extern int h_brule;
//extern int h_srule;
//bool allow_run = false;
//extern bool allow_run;
extern char p_board_width;
extern char p_board_height;

// View state owned by another translation unit: scroll offset in cells and
// zoom exponent (negative selects the zoom-out display kernel).
extern int h_offsetx;
extern int h_offsety;
extern int zoom;
// Phase input of init_color_palette().
int color_phase;
// C-linkage entry points implemented in this file.
extern "C" void init_cellauto(int seed);
extern "C" void run_cellauto(int numSMs);
extern "C" void display_cellauto(uchar4 *dst,int imageW, int imageH,int numSMs);
extern "C" void paint_cell(int x,int y,int c);

extern "C" void saveF(char *filename);
extern "C" void saveImg(char *filename);
extern "C" void init_color_palette();

extern "C" bool openF(char *filename);
extern "C" bool create_board23();
extern "C" int get_red(int i);
extern "C" int get_green(int i);
extern "C" int get_blue(int i);

// Host-held device pointers to the bit planes (32 cells per word). Planes 0
// and 1 alternate as current/next generation (selected by generation parity);
// planes 2.. hold the additional state bits.
unsigned int *h_bitboard[5] = {NULL,NULL,NULL,NULL,NULL};
// Generation counter; its low 32 bits are mirrored to d_num_generations.
unsigned long long num_generations = 0;
__device__ unsigned int d_num_generations = 0;

// Device-side copy of the plane pointer table.
__device__ unsigned int *bitboard[5];

// Number of state bits per cell; planes 0..h_numStateBits are allocated.
int h_numStateBits = 2;
__device__ int d_numStateBits = 0;

// Work-queue counter consumed by the kernels via atomicAdd; the host resets
// it to zero before each launch.
__device__ int blockCounter = 0;
// Device copies of h_offsetx / h_offsety, uploaded in display_cellauto().
__device__ int offsetx;
__device__ int offsety;

// Random bit pool read by init_board2().
__device__ unsigned int d_rand_bits[10240];

// Host staging area for d_rand_bits, filled in init_cellauto().
unsigned int h_rand_bits[10240];

// Number of palette entries.
#define PAL_SIZE 16
// Packed RGB palette on the device (3 bytes per entry).
__device__ unsigned char gpu_palette[PAL_SIZE*3];
// Host palette in libpng form and as packed RGB bytes; kept in sync by set_color().
png_color pal_color[PAL_SIZE];
unsigned char pal_color_byte[PAL_SIZE*3];
// Forward declarations.
void clear_board(int time,int numSMs);
bool create_board23();
// Stores one RGB palette entry in both host palettes: the libpng-style
// pal_color table and the packed byte triples in pal_color_byte.
// i is the palette slot; no range check is performed.
void set_color(int i,int red,int green,int blue) {
  png_color *entry = &pal_color[i];
  entry->red = red;
  entry->green = green;
  entry->blue = blue;

  unsigned char *packed = &pal_color_byte[i*3];
  packed[0] = red;
  packed[1] = green;
  packed[2] = blue;
}
// Floating-point front end for set_color(): each channel is scaled by 255
// and truncated to int before being stored.
void set_colorf(int i,double red,double green,double blue) {
  const int r8 = (int) (red*255.0);
  const int g8 = (int) (green*255.0);
  const int b8 = (int) (blue*255.0);
  set_color(i,r8,g8,b8);
}
// Returns the red channel of palette entry i.
int get_red(int i) {
  const png_color &entry = pal_color[i];
  return entry.red;
}
// Returns the green channel of palette entry i.
int get_green(int i) {
  const png_color &entry = pal_color[i];
  return entry.green;
}
// Returns the blue channel of palette entry i.
int get_blue(int i) {
  const png_color &entry = pal_color[i];
  return entry.blue;
}
// Copies up to PAL_SIZE entries of the libpng palette p (ps entries long)
// into the host palettes through set_color().
void update_pal_color(png_colorp p,int ps) {
  const int count = (ps > PAL_SIZE) ? PAL_SIZE : ps;
  for (int idx = 0; idx < count; ++idx) {
    const png_color &src = p[idx];
    set_color(idx,src.red,src.green,src.blue);
  }
}
// Writes the state c into the cell at (x,y), wrapping the coordinates with
// the board-size masks. Launched with a single thread by paint_cell().
//   plane (generation parity)      <- bit 0 of c
//   plane (generation parity ^ 1)  <- bit d_numStateBits of c
//   plane a, a >= 2                <- bit (a-1) of c
__global__ void paint_cell2(int x,int y,int c) {
  const int wordsPerRow = (BOARD_WIDTH>>5);
  const int cellX = x & (BOARD_WIDTH-1);
  const int cellY = y & (BOARD_HEIGHT-1);
  const int wordIdx = (wordsPerRow*cellY)+(cellX >> 5);
  const int bitPos = cellX & 31;
  const int otherPlane = (d_num_generations & 1) ^ 1;

  for (int plane = 0; plane <= d_numStateBits; ++plane) {
    // Pick the bit of c that belongs to this plane.
    int bit;
    if (plane >= 2) {
      bit = (c >> (plane-1)) & 1;
    } else if (plane == otherPlane) {
      bit = (c >> d_numStateBits) & 1;
    } else {
      bit = c & 1;
    }
    // Read-modify-write of the single bit inside the 32-cell word.
    int word = bitboard[plane][wordIdx];
    word = word & ~(1 << bitPos);
    bitboard[plane][wordIdx] = word | (bit << bitPos);
  }
}
// Host entry point: converts a screen position into a board position using
// the current scroll offset and zoom, then paints state c there with a
// single-thread launch of paint_cell2.
void paint_cell(int x,int y,int c) {
  int dx;
  int dy;
  if (zoom > 0) {
    // Zoomed in: several screen pixels map to one cell.
    dx = x>>zoom;
    dy = y>>zoom;
  } else {
    // Zoomed out (or 1:1 when zoom == 0): one pixel spans several cells.
    dx = x<<-zoom;
    dy = y<<-zoom;
  }
  paint_cell2<<<1,1>>>(h_offsetx + dx,h_offsety + dy,c);
}

// Seeds the top-left w x h cells of plane c by OR-ing in bits taken from
// d_rand_bits (bit i = y*w + x). Written for a single-thread launch: the
// whole rectangle is walked sequentially by the calling thread.
//
// Stops as soon as the random pool is exhausted. The previous guard used
// `i > 32*10240`, which let i == 32*10240 read d_rand_bits[10240], one word
// past the end of the array.
__global__ void init_board2(int c,int w,int h) {
  const int wordsPerRow = (BOARD_WIDTH>>5);
  const int randBitCount = (int) (sizeof(d_rand_bits) * 8);
  for (int y = 0;y < h;y++) {
    for (int x = 0;x < w;x++) {
      const int i = (y*w)+x;
      // i only grows in iteration order, so nothing later can be in range.
      if (i >= randBitCount) {return;}
      const unsigned int b = (d_rand_bits[i >> 5] >> (i & 31)) & 1u;
      bitboard[c][(wordsPerRow*y)+(x >> 5)] |= (b << (x&31));
    }
  }
}
// Fills all 16 palette entries from three phase-shifted sine waves.
// color_phase/8 shifts the phase, and the period depends on the number of
// states (1 << h_numStateBits). Even entries are centred on `sat`,
// odd entries on `1 - sat`.
void init_color_palette() {
  const double PI = atan2(0,-1);
  const double sat = 0.3;
  const double phase = ((double) color_phase) / 8;
  const double states = 1 << h_numStateBits;

  for (int i = 0;i < 16;i++) {
    double wave[3];
    for (int k = 0;k < 3;k++) {
      // Channel k uses phase offset (k+1).
      wave[k] = sin((PI*2*((states*(phase+(k+1)))+(i*3)))/(states*3))*sat;
    }
    const bool evenEntry = ((i & 1) == 0);
    if (evenEntry) {
      set_colorf(i,wave[0]+sat,wave[1]+sat,wave[2]+sat);
    } else {
      set_colorf(i,wave[0]+1.0-sat,wave[1]+1.0-sat,wave[2]+1.0-sat);
    }
  }
}

// Alternative fixed palette: red follows index bit 1, blue follows bit 2,
// and green is (bit 3 + 2 * bit 0) / 3.
void init_color_palette54() {
  for (int i = 0;i < 16;i++) {
    const int bit0 = i & 1;
    const int bit1 = (i >> 1) & 1;
    const int bit2 = (i >> 2) & 1;
    const int bit3 = (i >> 3) & 1;

    const double red = bit1;
    const double green = (bit3 + (bit0*2)) / 3.0;
    const double blue = bit2;
    set_colorf(i,red,green,blue);
  }
}
// Alternative fixed palette: index bits 1..3 select the R, G and B channels.
// Even entries are a dimmed version (channel * 0.4); odd entries are lifted
// towards white (1 - (0.4 - channel * 0.4)).
void init_color_palette43() {
  const double dim = 0.4;
  const double lift = 0.4;
  for (int i = 0;i < 16;i++) {
    double chan[3];
    chan[0] = (i >> 1)&1;
    chan[1] = (i >> 2)&1;
    chan[2] = (i >> 3)&1;

    const bool oddEntry = ((i & 1) != 0);
    for (int k = 0;k < 3;k++) {
      chan[k] = oddEntry ? 1-(lift-(chan[k]*lift)) : chan[k]*dim;
    }
    set_colorf(i,chan[0],chan[1],chan[2]);
  }
}
// Alternative fixed palette: the four colours black, yellow, blue and white
// repeated four times, selected by the two low bits of the index.
void init_color_palette34() {
  for (int i = 0;i < 16;i++) {
    switch (i & 3) {
      case 0:  set_color(i,0,0,0);       break;
      case 1:  set_color(i,255,255,0);   break;
      case 2:  set_color(i,0,0,255);     break;
      default: set_color(i,255,255,255); break;
    }
  }
}
// Frees every device bit plane, clears the host pointer table and uploads
// the cleared table to the device.
static void release_bitboards() {
  const int slots = (int) (sizeof(h_bitboard) / sizeof(h_bitboard[0]));
  for (int i = 0;i < slots;i++) {
    if (h_bitboard[i] != NULL) {
      cudaFree(h_bitboard[i]);
      h_bitboard[i] = NULL;
    }
  }
  cudaMemcpyToSymbol(bitboard,h_bitboard,sizeof(h_bitboard),0,cudaMemcpyHostToDevice);
}

// Allocates and zeroes the device bit planes for a board of
// h_board_width x h_board_height cells, and uploads the board size, the
// state-bit count and the plane pointers to the device symbols. Also resets
// the palette through init_color_palette().
//
// Planes 0..h_numStateBits are created, one bit per cell each.
// Returns false if the state-bit count does not fit the 5-entry plane table
// or if any CUDA call fails; in that case no planes remain allocated.
//
// Planes left over from an earlier call are freed first. Previously they
// were leaked every time a board was re-created, and a failed allocation
// left the already-allocated planes behind.
bool create_board24() {
  printf("h_board_width: %d \n",h_board_width);
  printf("h_board_height: %d \n",h_board_height);

  init_color_palette();

  const int slots = (int) (sizeof(h_bitboard) / sizeof(h_bitboard[0]));
  if ((h_numStateBits < 0) || (h_numStateBits >= slots)) {return false;}

  release_bitboards();

  if (cudaMemcpyToSymbol(BOARD_WIDTH,&h_board_width,sizeof(h_board_width),0,cudaMemcpyHostToDevice) != cudaSuccess) {return false;}
  if (cudaMemcpyToSymbol(BOARD_HEIGHT,&h_board_height,sizeof(h_board_height),0,cudaMemcpyHostToDevice) != cudaSuccess) {return false;}
  if (cudaMemcpyToSymbol(d_numStateBits,&h_numStateBits,sizeof(h_numStateBits),0,cudaMemcpyHostToDevice) != cudaSuccess) {return false;}

  // One bit per cell; 64-bit arithmetic so large boards do not overflow.
  const size_t planeBytes = (size_t) ((((unsigned long long) h_board_width) * h_board_height) >> 3);

  for (int i = 0;i <= h_numStateBits;i++) {
    printf("%d MB \n",(int) (planeBytes>>20));

    cudaError_t e = cudaMalloc((void **) &h_bitboard[i],planeBytes);
    if (e != cudaSuccess) {
      h_bitboard[i] = NULL;
      release_bitboards();
      return false;
    }
    e = cudaMemset(h_bitboard[i],0,planeBytes);
    if (e == cudaSuccess) {
      // Publish the pointer in slot i of the device-side table.
      e = cudaMemcpyToSymbol(bitboard,&h_bitboard[i],sizeof(h_bitboard[i]),
                             i*sizeof(h_bitboard[i]),cudaMemcpyHostToDevice);
    }
    if (e != cudaSuccess) {
      release_bitboards();
      return false;
    }
  }
  return true;
}
// Creates a board whose side lengths are the powers of two given by
// p_board_width / p_board_height, then delegates to create_board24().
// Returns whatever create_board24() returns.
bool create_board23() {
  printf("p_board_width: %d \n",p_board_width);
  printf("p_board_height: %d \n",p_board_height);
  const int widthCells = 1 << p_board_width;
  const int heightCells = 1 << p_board_height;
  h_board_width = widthCells;
  h_board_height = heightCells;
  return create_board24();
}

// Resets the simulation: seeds rand() with `seed`, moves the view offset to
// (-8,-8), zeroes every allocated bit plane, refreshes the device random
// pool and sets the generation counter (host and device) back to 0.
//
// The plane size is computed in 64-bit arithmetic, matching
// create_board24(); the earlier 32-bit product overflowed for boards of
// 2^32 cells or more. Planes that were never allocated are skipped.
void init_cellauto(int seed) {
  printf("init_cellauto\n");
  srand(seed);
  const int sw = 8*8*2;
  const int sh = 8*8*2;
  h_offsetx = -8;
  h_offsety = -8;

  const size_t planeBytes = (size_t) ((((unsigned long long) h_board_width) * h_board_height) >> 3);
  for (int i = 0;i <= h_numStateBits;i++) {
    if (h_bitboard[i] != NULL) {
      cudaMemset(h_bitboard[i],0,planeBytes);
    }
  }

  // One batch of random bits per plane is generated and uploaded. The kernel
  // that would consume them (init_board2) is currently not launched, so the
  // board stays empty; the rand() sequence is kept unchanged.
  for (int c = 0;c <= h_numStateBits;c++) {
    for (int i = 0;i <= ((sw*sh)>>5);i++) {
      h_rand_bits[i] = rand();
    }
    cudaMemcpyToSymbol(d_rand_bits,h_rand_bits,((sw*sh)>>3)+1,0,cudaMemcpyHostToDevice);
    //init_board2<<<1,1>>>(c,sw,sh);
  }

  num_generations = 0;
  const unsigned int generation = (unsigned int) (num_generations & 0xFFFFFFFF);
  cudaMemcpyToSymbol(d_num_generations,&generation,sizeof(unsigned int),0,cudaMemcpyHostToDevice);
}
// Five 32-bit magic words at the start of every rule file: saveF() writes
// them and openF() rejects files whose first 20 bytes differ.
int id46[5] = {419725663,-332145988,850349736,469373861,890236017};

// Gathers bit number `bit` of every pixel in `row` into the global scratch
// `buffer`, one bit per cell (64 cells per word). Pixel x starts at bit
// offset x * png_bit_depth within the row.
static void pack_pixel_bit(png_bytep row,int bit) {
  const int depth = png_bit_depth;
  for (int w = 0;w < (h_board_width>>6);w++) {buffer[w] = 0;}
  for (int x = 0;x < h_board_width;x++) {
    const int bitOffset = x*depth;
    const unsigned long long v = (row[bitOffset >> 3] >> ((bitOffset+bit)&7)) & 1;
    buffer[x >> 6] = buffer[x >> 6] | (v << (x & 63));
  }
}

// Uploads one decoded PNG row into the device bit planes. Image row y is
// stored as board row (height-1)-y. Pixel bit k (k >= 1) goes to plane k+1
// and pixel bit 0 goes to the plane selected by the generation parity.
void read_from_row(png_bytep row,int y) {
  const int boardRow = (h_board_height-1)-y;
  const unsigned int wordsPerRow = (h_board_width>>5);
  const unsigned int rowBytes = (h_board_width >> 3);

  for (int bit = 1;bit < h_numStateBits;bit++) {
    pack_pixel_bit(row,bit);
    cudaMemcpy(&h_bitboard[bit+1][boardRow*wordsPerRow],buffer,rowBytes,cudaMemcpyHostToDevice);
  }

  pack_pixel_bit(row,0);
  const int current = num_generations & 1;
  cudaMemcpy(&h_bitboard[current][boardRow*wordsPerRow],buffer,rowBytes,cudaMemcpyHostToDevice);
}
// Releases the libpng read state and closes the file.
static void close_png_reader(png_structp png,png_infop info,FILE *fp) {
  png_destroy_read_struct(&png,&info,NULL);
  fclose(fp);
}

// Loads a board from a palette PNG whose width and height are powers of two.
// On success the board is (re)created with the image size, the image palette
// replaces the host palette and every row is uploaded via read_from_row().
//
// Returns false if the file cannot be opened, is not a valid palette PNG
// with power-of-two dimensions, or if board creation or an allocation fails.
// All resources are released on every path.
//
// Fixes over the previous version:
//  - a missing file now returns false instead of handing a NULL FILE* to
//    libpng (openF() relies on false to fall back to a fresh board);
//  - libpng errors are caught with setjmp instead of aborting the process;
//  - the palette returned by png_get_PLTE belongs to libpng and is no longer
//    passed to free();
//  - the file handle and the libpng structures are always released;
//  - the scratch buffer is rounded up to whole 64-bit words.
bool read_png_file(const char *filename) {

  FILE *fp = fopen(filename, "rb");
  if (fp == NULL) {return false;}

  png_structp png = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
  if(!png) {
    fclose(fp);
    return false;
  }

  png_infop info = png_create_info_struct(png);
  if(!info) {
    close_png_reader(png,NULL,fp);
    return false;
  }

  // volatile: modified after setjmp and read again in the error path.
  png_byte * volatile row = NULL;
  buffer = NULL;

  if (setjmp(png_jmpbuf(png))) {
    // libpng reported a decoding error somewhere below.
    free((void *) row);
    free(buffer);
    buffer = NULL;
    close_png_reader(png,info,fp);
    return false;
  }

  png_init_io(png, fp);
  png_read_info(png, info);

  int png_width      = png_get_image_width(png, info);
  int png_height     = png_get_image_height(png, info);
  int png_color_type = png_get_color_type(png, info);
  png_bit_depth  = png_get_bit_depth(png, info);

  const bool supported = (png_color_type == PNG_COLOR_TYPE_PALETTE)
                      && ((png_width & (png_width-1)) == 0)
                      && ((png_height & (png_height-1)) == 0);
  if (!supported) {
    close_png_reader(png,info,fp);
    return false;
  }

  h_board_width = png_width;
  h_board_height = png_height;
  if (create_board24() == false) {
    close_png_reader(png,info,fp);
    return false;
  }

  // The palette array stays owned by libpng; it is only read here.
  int np = 0;
  png_colorp pal_colorp = NULL;
  if ((png_get_PLTE(png, info, &pal_colorp,&np) == PNG_INFO_PLTE) && (pal_colorp != NULL)) {
    update_pal_color(pal_colorp,np);
  }

  const size_t words = (((size_t) h_board_width) + 63) >> 6;
  buffer = (unsigned long long *) calloc(words ? words : 1,sizeof(unsigned long long));
  row = (png_byte*) malloc(png_get_rowbytes(png,info));
  if ((buffer == NULL) || (row == NULL)) {
    free((void *) row);
    free(buffer);
    buffer = NULL;
    close_png_reader(png,info,fp);
    return false;
  }

  for(int y = 0; y < png_height; y++) {
    png_read_row(png,(png_bytep) row,NULL);
    read_from_row((png_bytep) row,y);
  }

  free(buffer);
  buffer = NULL;
  free((void *) row);
  close_png_reader(png,info,fp);
  return true;

}
// Loads a rule file (gzip stream) and, if present, the board image stored
// next to it as "<filename>.png".
//
// File layout as read here: five 32-bit magic words (id46), a 16-bit version
// that must be 0, six bytes of which the last is (state bits - 1), the 64-bit
// generation counter and the rule table of 1 << (state bits + 9) bytes.
//
// If the PNG cannot be loaded, a fresh board is created with
// create_board23() and initialised with init_cellauto(0).
// Returns false (after printing a message) on any error.
//
// Fixes over the previous version:
//  - the gz handle is closed on every path and the PNG file name is freed;
//  - short reads are detected;
//  - the state-bit count from the file is validated: values above 4 would
//    overflow the 5-entry plane table;
//  - the table read is limited to sizeof(h_ruletable), so a file declaring
//    4 state bits (8192 table bytes) no longer overruns the 4096-byte table.
bool openF(char *filename) {

  gzFile f = gzopen(filename,"rb");
  if (f == NULL) {
    printf("error: can not open \n");
    return false;
  }

  int id2 = 0;
  for (int i = 0;i < 5;i++) {
    if ((gzread(f,&id2,4) != 4) || (id46[i] != id2)) {
      printf("error: wrong id \n");
      gzclose(f);
      return false;
    }
  }

  unsigned short v = 0;
  if ((gzread(f,&v,2) != 2) || (v > 0)) {
    printf("error: wrong version \n");
    gzclose(f);
    return false;
  }

  // Six header bytes; only the last one (state bits - 1) is used.
  unsigned char ch = 0;
  for (int i = 0;i < 6;i++) {
    if (gzread(f,&ch,1) != 1) {
      printf("error: truncated file \n");
      gzclose(f);
      return false;
    }
  }
  const int stateBits = ch + 1;
  if (stateBits > 4) {
    printf("error: unsupported number of state bits \n");
    gzclose(f);
    return false;
  }
  h_numStateBits = stateBits;

  size_t tableBytes = ((size_t) 1) << (h_numStateBits+9);
  if (tableBytes > sizeof(h_ruletable)) {tableBytes = sizeof(h_ruletable);}

  const bool bodyOk = (gzread(f,&num_generations,8) == 8)
                   && (gzread(f,h_ruletable,(unsigned int) tableBytes) == (int) tableBytes);
  gzclose(f);
  if (!bodyOk) {
    printf("error: truncated file \n");
    return false;
  }

  char *filename2 = (char *) malloc(strlen(filename)+10);
  if (filename2 == NULL) {
    printf("error: out of memory\n");
    return false;
  }
  strcpy(filename2,filename);
  strcat(filename2,".png");
  const bool imageLoaded = read_png_file(filename2);
  free(filename2);

  if (!imageLoaded) {
    if (create_board23() == false) {
      printf("error: out of memory\n");
      return false; 
    }
    init_cellauto(0); 
  }
  return true;
}

// Writes the rule file (gzip stream, compression level 1) in the layout
// openF() reads: the magic words id46, a 16-bit version 0, five zero bytes,
// one byte holding (state bits - 1), the 64-bit generation counter and
// 1 << (state bits + 9) bytes of rule table.
//
// Fixes over the previous version:
//  - a failed gzopen is reported instead of being ignored;
//  - at most sizeof(h_ruletable) bytes are taken from the table and the rest
//    of the table section is zero-filled, so 4 state bits no longer read
//    4096 bytes past the end of h_ruletable. The file length is unchanged.
void saveF(char *filename) {

  gzFile f = gzopen(filename,"wb1");
  if (f == NULL) {
    printf("error: can not open \n");
    return;
  }

  gzwrite(f,id46,sizeof(id46));

  unsigned short v = 0;
  gzwrite(f,&v,2);

  char ch = 0;
  for (int i = 0;i < 5;i++) {
    gzwrite(f,&ch,1);
  }
  ch = (char) (h_numStateBits-1);
  gzwrite(f,&ch,1);

  gzwrite(f,&num_generations,8);

  const size_t sectionBytes = ((size_t) 1) << (h_numStateBits+9);
  const size_t tableBytes = (sectionBytes < sizeof(h_ruletable)) ? sectionBytes : sizeof(h_ruletable);
  gzwrite(f,h_ruletable,(unsigned int) tableBytes);

  // Pad the section to its nominal length with zeros.
  static const unsigned char zeros[256] = {0};
  size_t remaining = sectionBytes - tableBytes;
  while (remaining > 0) {
    const size_t chunk = (remaining < sizeof(zeros)) ? remaining : sizeof(zeros);
    gzwrite(f,zeros,(unsigned int) chunk);
    remaining -= chunk;
  }

  gzclose(f);

}


// Test pattern: fills dst (imageW x imageH pixels) with vertical palette
// stripes, 32 pixels wide, cycling through the 16 palette entries.
//
// Launch: 1-D blocks (only threadIdx.x / blockDim.x are used), any grid size.
// Work is handed out in chunks of blockDim.x pixels through the global
// blockCounter, which the host must reset to 0 before the launch.
//
// The chunk base is copied to a register between two barriers so thread 0
// cannot overwrite the shared slot for the next chunk while other threads
// are still reading it, and the loop exit is decided on the chunk base so
// every thread of the block reaches the same barriers.
__global__ void blank_display(uchar4 *dst,int imageW, int imageH) {
  __shared__ unsigned int blockIndex;
  const int numPixels = imageW*imageH;

  while(1) {
    if (threadIdx.x==0) {
      blockIndex = atomicAdd(&blockCounter, 1);
      blockIndex = blockIndex * blockDim.x;
    }
    __syncthreads();
    const int chunkBase = blockIndex;
    __syncthreads();
    if (chunkBase >= numPixels) break;          // same decision for the whole block

    const int pixel = chunkBase + threadIdx.x;
    if (pixel >= numPixels) continue;           // tail of the last chunk

    const int px = pixel % imageW;
    const int py = pixel / imageW;
    const int a = (px >> 5) & 15;

    uchar4 color;
    color.x = gpu_palette[(a*3)+0];
    color.y = gpu_palette[(a*3)+1];
    color.z = gpu_palette[(a*3)+2];
    color.w = 0;
    dst[(py*imageW)+px] = color;
  }
}

// Renders the board into dst (imageW x imageH pixels) at a magnification of
// 1 << psize pixels per cell (psize == 0 is 1:1). The view is centred on
// (offsetx, offsety) and wraps around the board edges.
//
// The palette index of a cell is built from planes d_numStateBits..2 (most
// significant first) followed by the current-generation plane as bit 0.
//
// Launch: 1-D blocks, any grid size; blockCounter must be 0 at launch.
//
// The chunk base is copied to a register between two barriers so thread 0
// cannot overwrite the shared slot for the next chunk while other threads
// are still reading it, and the loop exit is decided on the chunk base so
// every thread of the block reaches the same barriers.
__global__ void display_cellauto_zoom_in(uchar4 *dst,int imageW, int imageH,int psize) {
  __shared__ unsigned int blockIndex;
  const int numPixels = imageW*imageH;
  const int cx = (imageW >> psize) / 2;
  const int cy = (imageH >> psize) / 2;

  while(1) {
    if (threadIdx.x==0) {
      blockIndex = atomicAdd(&blockCounter, 1);
      blockIndex = blockIndex * blockDim.x;
    }
    __syncthreads();
    const int chunkBase = blockIndex;
    __syncthreads();
    if (chunkBase >= numPixels) break;          // same decision for the whole block

    const int pixel = chunkBase + threadIdx.x;
    if (pixel >= numPixels) continue;           // tail of the last chunk

    const int px = pixel % imageW;
    const int py = pixel / imageW;

    // Board cell shown at this pixel (masks wrap the coordinates).
    const int x = ((px >> psize)+(offsetx)-cx) & (BOARD_WIDTH-1);
    const int y = ((py >> psize)+(offsety)-cy) & (BOARD_HEIGHT-1);
    const int c = (d_num_generations&1);
    const int wordsPerRow = (BOARD_WIDTH>>5);
    const int i = (x >> 5) + (y*wordsPerRow);

    int a = 0;
    for (int j = d_numStateBits;j >= 2;j--) {
      a = (a * 2) + ((bitboard[j][i] >> (x & 31))&1);
    }
    a = (a * 2) + ((bitboard[c][i] >> (x & 31))&1);

    uchar4 color;
    color.x = gpu_palette[(a*3)+0];
    color.y = gpu_palette[(a*3)+1];
    color.z = gpu_palette[(a*3)+2];
    color.w = 0;
    dst[(py*imageW)+px] = color;
  }
}
// Renders the board into dst (imageW x imageH pixels) zoomed out: every
// pixel shows the average palette colour of a (1 << psize) x (1 << psize)
// square of cells. The view is centred on (offsetx, offsety) and wraps
// around the board edges.
//
// Launch: 1-D blocks, any grid size; blockCounter must be 0 at launch.
//
// The chunk base is copied to a register between two barriers so thread 0
// cannot overwrite the shared slot for the next chunk while other threads
// are still reading it, and the loop exit is decided on the chunk base so
// every thread of the block reaches the same barriers.
__global__ void display_cellauto_zoom_out(uchar4 *dst,int imageW, int imageH,int psize) {
  __shared__ unsigned int blockIndex;
  const int numPixels = imageW*imageH;
  const int cx = (imageW << psize) / 2;
  const int cy = (imageH << psize) / 2;
  const int s = 1 << psize;

  while(1) {
    if (threadIdx.x==0) {
      blockIndex = atomicAdd(&blockCounter, 1);
      blockIndex = blockIndex * blockDim.x;
    }
    __syncthreads();
    const int chunkBase = blockIndex;
    __syncthreads();
    if (chunkBase >= numPixels) break;          // same decision for the whole block

    const int pixel = chunkBase + threadIdx.x;
    if (pixel >= numPixels) continue;           // tail of the last chunk

    const int px = pixel % imageW;
    const int py = pixel / imageW;

    // Top-left cell of the square covered by this pixel.
    const int x = ((px << psize)+(offsetx)-cx) & (BOARD_WIDTH-1);
    const int y = ((py << psize)+(offsety)-cy) & (BOARD_HEIGHT-1);
    const int c = (d_num_generations&1);
    const int wordsPerRow = (BOARD_WIDTH>>5);

    int red = 0;
    int green = 0;
    int blue = 0;
    for (int x3 = 0;x3 < s;x3++) {
      for (int y3 = 0;y3 < s;y3++) {
        const int x2 = (x+x3) & (BOARD_WIDTH-1);
        const int y2 = (y+y3) & (BOARD_HEIGHT-1);
        const int i = (x2 >> 5) + (y2*wordsPerRow);

        // Palette index: planes d_numStateBits..2, then the current plane.
        int a = 0;
        for (int j = d_numStateBits;j >= 2;j--) {
          a = (a * 2) + ((bitboard[j][i] >> (x2 & 31))&1);
        }
        a = (a * 2) + ((bitboard[c][i] >> (x2 & 31))&1);

        red = red + gpu_palette[(a*3)+0];
        green = green + gpu_palette[(a*3)+1];
        blue = blue + gpu_palette[(a*3)+2];
      }
    }

    // Divide the sums by the s*s cells of the square.
    const int ps2 = psize*2;
    uchar4 color;
    color.x = red>>ps2;
    color.y = green>>ps2;
    color.z = blue>>ps2;
    color.w = 0;
    dst[(py*imageW)+px] = color;
  }
}
// Zeroes word 0..numBlocks-1 of every plane 0..d_numStateBits.
// gridWidth is the number of 32-cell words per board row; `time` is unused.
//
// Launch: 1-D blocks, any grid size; blockCounter must be 0 at launch.
//
// The chunk base is copied to a register between two barriers so thread 0
// cannot overwrite the shared slot for the next chunk while other threads
// are still reading it, and the loop exit is decided on the chunk base so
// every thread of the block reaches the same barriers.
__global__ void clear_board2(int time,int gridWidth,int numBlocks) {
  __shared__ unsigned int blockIndex;  

  while(1) {
    if (threadIdx.x==0) {
      blockIndex = atomicAdd(&blockCounter, 1);
      blockIndex = blockIndex * blockDim.x;
    }
    __syncthreads();
    const int chunkBase = blockIndex;
    __syncthreads();
    if (chunkBase >= numBlocks) break;          // same decision for the whole block

    const int word = chunkBase + threadIdx.x;
    if (word >= numBlocks) continue;            // tail of the last chunk

    const int bx = word % gridWidth;
    const int by = word / gridWidth;
    for (int i = 0;i <= d_numStateBits;i++) {
      bitboard[i][bx+(by*gridWidth)] = 0;
    }
  }
}

// One generation step for the 1-state-bit automaton.
//
// Each thread computes one 32-cell word of the next generation: it reads the
// current plane (index = generation parity) and writes the other one.
// For every cell a 9-bit index is formed from its 3x3 neighbourhood:
//   bits 0-2: row by+1, cells x-1, x, x+1
//   bits 3-5: row by
//   bits 6-8: row by-1
// and bit 0 of ruletable[index] is the new cell value. Rows and word columns
// wrap via masks, so gridWidth and BOARD_HEIGHT must be powers of two.
//
// gridWidth: 32-cell words per row; numBlocks: total number of words.
// Launch: 1-D blocks, any grid size; blockCounter must be 0 at launch.
//
// The chunk base is copied to a register between two barriers so thread 0
// cannot overwrite the shared slot for the next chunk while other threads
// are still reading it, and the loop exit is decided on the chunk base so
// every thread of the block reaches the same barriers.
__global__ void run_cellauto1(int gridWidth,int numBlocks) {
  __shared__ unsigned int blockIndex;  
  while(1) {
    if (threadIdx.x==0) {
      blockIndex = atomicAdd(&blockCounter, 1);
      blockIndex = blockIndex * blockDim.x;
    }
    __syncthreads();
    const int chunkBase = blockIndex;
    __syncthreads();
    if (chunkBase >= numBlocks) break;          // same decision for the whole block

    const int blockIndex2 = chunkBase + threadIdx.x;
    if (blockIndex2 >= numBlocks) continue;     // tail of the last chunk

    const int gw = gridWidth;
    const int bx = blockIndex2 % gridWidth;
    const int by = blockIndex2 / gridWidth;
    const int bh = BOARD_HEIGHT;
    const int c = (d_num_generations&1);

    // Row offsets and neighbouring word columns, wrapped.
    const int rowNext = ((by+1)&(bh-1))*gw;
    const int rowCur  = (by&(bh-1))*gw;
    const int rowPrev = ((by-1)&(bh-1))*gw;
    const int colLeft  = (bx-1)&(gw-1);
    const int colRight = (bx+1)&(gw-1);

    const unsigned int r2 = bitboard[c][bx+rowNext];
    const unsigned int r1 = bitboard[c][bx+rowCur];
    const unsigned int r0 = bitboard[c][bx+rowPrev];

    int i = 0;
    unsigned int yb = 0;
    unsigned int next = 0;

    // Cell 0: its left neighbour is bit 31 of the word to the left.
    i =     (r2&3);
    i = i | ((r1&3)<<3);
    i = i | ((r0&3)<<6);
    i = i << 1;
    i = i | (bitboard[c][colLeft+rowNext]>>31);
    i = i | ((bitboard[c][colLeft+rowCur]>>31)<<3);
    i = i | ((bitboard[c][colLeft+rowPrev]>>31)<<6);
    yb = ruletable[i] & 3;

    // Even cells 2..30: the two result bits land at positions j and j+1.
    for (int j = 2;j < 31;j = j + 2) {
      i =     ((r2 >> (j-1))&7);
      i = i | (((r1 >> (j-1))&7)<<3);
      i = i | (((r0 >> (j-1))&7)<<6);
      yb = yb | (((unsigned int) (ruletable[i]&3)) << j);
    }
    next = yb & 0x55555555;                     // keep result bit 0 of even cells

    // Cell 31: its right neighbour is bit 0 of the word to the right.
    i =     (r2>>30);
    i = i | ((r1>>30)<<3);
    i = i | ((r0>>30)<<6);
    i = i | ((bitboard[c][colRight+rowNext]&1)<<2);
    i = i | ((bitboard[c][colRight+rowCur]&1)<<5);
    i = i | ((bitboard[c][colRight+rowPrev]&1)<<8);
    yb = (((unsigned int) (ruletable[i]&3)) << 30);

    // Odd cells 1..29: result bits are stored one position low (j-1, j).
    for (int j = 1;j < 31;j = j + 2) {
      i =     ((r2 >> (j-1))&7);
      i = i | (((r1 >> (j-1))&7)<<3);
      i = i | (((r0 >> (j-1))&7)<<6);
      yb = yb | (((unsigned int) (ruletable[i]&3)) << (j-1));
    }
    next = next | ((yb & 0x55555555) << 1);     // shift result bit 0 up to the odd cell

    bitboard[c^1][bx+(by*gw)] = next;
  }
}
// One generation step for the 2-state-bit automaton.
//
// Each thread computes one 32-cell word. For every cell a 10-bit index is
// formed: bits 0-8 are the 3x3 neighbourhood in the current plane
//   bits 0-2: row by+1, cells x-1, x, x+1
//   bits 3-5: row by
//   bits 6-8: row by-1
// and bit 9 is the cell's own bit in plane 2. ruletable[index] bit 0 becomes
// the cell's value in the next-generation plane and bit 1 its new value in
// plane 2 (updated in place; each thread touches only its own word there).
// Rows and word columns wrap via masks, so gridWidth and BOARD_HEIGHT must be
// powers of two.
//
// gridWidth: 32-cell words per row; numBlocks: total number of words.
// Launch: 1-D blocks, any grid size; blockCounter must be 0 at launch.
//
// The chunk base is copied to a register between two barriers so thread 0
// cannot overwrite the shared slot for the next chunk while other threads
// are still reading it, and the loop exit is decided on the chunk base so
// every thread of the block reaches the same barriers.
__global__ void run_cellauto2(int gridWidth,int numBlocks) {
  __shared__ unsigned int blockIndex;  
  while(1) {
    if (threadIdx.x==0) {
      blockIndex = atomicAdd(&blockCounter, 1);
      blockIndex = blockIndex * blockDim.x;
    }
    __syncthreads();
    const int chunkBase = blockIndex;
    __syncthreads();
    if (chunkBase >= numBlocks) break;          // same decision for the whole block

    const int blockIndex2 = chunkBase + threadIdx.x;
    if (blockIndex2 >= numBlocks) continue;     // tail of the last chunk

    const int gw = gridWidth;
    const int bx = blockIndex2 % gridWidth;
    const int by = blockIndex2 / gridWidth;
    const int bh = BOARD_HEIGHT;
    const int c = (d_num_generations&1);

    // Row offsets and neighbouring word columns, wrapped.
    const int rowNext = ((by+1)&(bh-1))*gw;
    const int rowCur  = (by&(bh-1))*gw;
    const int rowPrev = ((by-1)&(bh-1))*gw;
    const int colLeft  = (bx-1)&(gw-1);
    const int colRight = (bx+1)&(gw-1);

    const unsigned int r2 = bitboard[c][bx+rowNext];
    const unsigned int r1 = bitboard[c][bx+rowCur];
    const unsigned int r0 = bitboard[c][bx+rowPrev];
    const unsigned int plane2 = bitboard[2][bx+rowCur];

    int i = 0;
    unsigned int yb = 0;
    unsigned int next0 = 0;                     // next-generation plane
    unsigned int next2 = 0;                     // new contents of plane 2

    // Cell 0: its left neighbour is bit 31 of the word to the left.
    i =     (r2&3);
    i = i | ((r1&3)<<3);
    i = i | ((r0&3)<<6);
    i = i << 1;
    i = i | (bitboard[c][colLeft+rowNext]>>31);
    i = i | ((bitboard[c][colLeft+rowCur]>>31)<<3);
    i = i | ((bitboard[c][colLeft+rowPrev]>>31)<<6);
    i = i | ((plane2&1)<<9);
    yb = ruletable[i] & 3;

    // Even cells 2..30: the two result bits land at positions j and j+1.
    for (int j = 2;j < 31;j = j + 2) {
      i =     ((r2 >> (j-1))&7);
      i = i | (((r1 >> (j-1))&7)<<3);
      i = i | (((r0 >> (j-1))&7)<<6);
      i = i | (((plane2 >> j)&1)<<9);
      yb = yb | (((unsigned int) (ruletable[i]&3)) << j);
    }
    next0 = yb & 0x55555555;
    next2 = (yb & 0xaaaaaaaa) >> 1;

    // Cell 31: its right neighbour is bit 0 of the word to the right.
    i =     (r2>>30);
    i = i | ((r1>>30)<<3);
    i = i | ((r0>>30)<<6);
    i = i | ((bitboard[c][colRight+rowNext]&1)<<2);
    i = i | ((bitboard[c][colRight+rowCur]&1)<<5);
    i = i | ((bitboard[c][colRight+rowPrev]&1)<<8);
    i = i | (((plane2 >> 31)&1)<<9);
    yb = (((unsigned int) (ruletable[i]&3)) << 30);

    // Odd cells 1..29: result bits are stored one position low (j-1, j).
    for (int j = 1;j < 31;j = j + 2) {
      i =     ((r2 >> (j-1))&7);
      i = i | (((r1 >> (j-1))&7)<<3);
      i = i | (((r0 >> (j-1))&7)<<6);
      i = i | (((plane2 >> j)&1)<<9);
      yb = yb | (((unsigned int) (ruletable[i]&3)) << (j-1));
    }
    next0 = next0 | ((yb & 0x55555555) << 1);
    next2 = next2 | (yb & 0xaaaaaaaa);

    bitboard[c^1][bx+(by*gw)] = next0;
    // The host launches this kernel only when there are exactly 2 state bits.
    if (d_numStateBits >= 2) {
      bitboard[2][bx+(by*gw)] = next2;
    }
  }

}
// One generation step for automata with 3 or 4 state bits.
//
// Each thread computes one 32-cell word. For every cell the table index is
//   bits 0-2: row by+1, cells x-1, x, x+1   (current plane)
//   bits 3-5: row by
//   bits 6-8: row by-1
//   bit a+7 : the cell's own bit in plane a, for a = 2..d_numStateBits
// The low four bits of ruletable[index] are the results: bit 0 goes to the
// next-generation plane, bits 1..3 to planes 2..4 (only planes up to
// d_numStateBits are written; they are updated in place and each thread
// touches only its own word there).
// Rows and word columns wrap via masks, so gridWidth and BOARD_HEIGHT must be
// powers of two.
//
// gridWidth: 32-cell words per row; numBlocks: total number of words.
// Launch: 1-D blocks, any grid size; blockCounter must be 0 at launch.
//
// Changes over the previous version:
//  - The chunk base is copied to a register between two barriers so thread 0
//    cannot overwrite the shared slot for the next chunk while other threads
//    are still reading it; the loop exit is decided on the chunk base so
//    every thread of the block reaches the same barriers.
//  - The upper result pair is masked to two bits; unmasked, table bits 4-7
//    would have been OR-ed into the results of neighbouring cells.
//  - The words of planes 2.. are read once instead of once per cell.
__global__ void run_cellauto3(int gridWidth,int numBlocks) {
  __shared__ unsigned int blockIndex;  
  while(1) {
    if (threadIdx.x==0) {
      blockIndex = atomicAdd(&blockCounter, 1);
      blockIndex = blockIndex * blockDim.x;
    }
    __syncthreads();
    const int chunkBase = blockIndex;
    __syncthreads();
    if (chunkBase >= numBlocks) break;          // same decision for the whole block

    const int blockIndex2 = chunkBase + threadIdx.x;
    if (blockIndex2 >= numBlocks) continue;     // tail of the last chunk

    const int gw = gridWidth;
    const int bx = blockIndex2 % gridWidth;
    const int by = blockIndex2 / gridWidth;
    const int bh = BOARD_HEIGHT;
    const int c = (d_num_generations&1);

    // Row offsets and neighbouring word columns, wrapped.
    const int rowNext = ((by+1)&(bh-1))*gw;
    const int rowCur  = (by&(bh-1))*gw;
    const int rowPrev = ((by-1)&(bh-1))*gw;
    const int colLeft  = (bx-1)&(gw-1);
    const int colRight = (bx+1)&(gw-1);

    const unsigned int r2 = bitboard[c][bx+rowNext];
    const unsigned int r1 = bitboard[c][bx+rowCur];
    const unsigned int r0 = bitboard[c][bx+rowPrev];

    // This thread's words of the extra planes (the table has 5 slots, so
    // plane 4 is the highest that can exist).
    const int topPlane = (d_numStateBits < 4) ? d_numStateBits : 4;
    unsigned int extra[5] = {0,0,0,0,0};
    for (int a = 2;a <= topPlane;a++) {
      extra[a] = bitboard[a][bx+rowCur];
    }

    unsigned int b[4] = {0,0,0,0};
    unsigned int yb = 0;                        // result bits 0-1
    unsigned int rg = 0;                        // result bits 2-3
    unsigned int t = 0;
    int i = 0;

    // Cell 0: its left neighbour is bit 31 of the word to the left.
    i =     (r2&3);
    i = i | ((r1&3)<<3);
    i = i | ((r0&3)<<6);
    i = i << 1;
    i = i | (bitboard[c][colLeft+rowNext]>>31);
    i = i | ((bitboard[c][colLeft+rowCur]>>31)<<3);
    i = i | ((bitboard[c][colLeft+rowPrev]>>31)<<6);
    for (int a = 2;a <= topPlane;a++) {
      i = i | ((extra[a]&1)<<(a+7));
    }
    t = ruletable[i];
    yb = t & 3;
    rg = (t >> 2) & 3;

    // Even cells 2..30: each result pair lands at positions j and j+1.
    for (int j = 2;j < 31;j = j + 2) {
      i =     ((r2 >> (j-1))&7);
      i = i | (((r1 >> (j-1))&7)<<3);
      i = i | (((r0 >> (j-1))&7)<<6);
      for (int a = 2;a <= topPlane;a++) {
        i = i | (((extra[a] >> j)&1)<<(a+7));
      }
      t = ruletable[i];
      yb = yb | ((t & 3) << j);
      rg = rg | (((t >> 2) & 3) << j);
    }
    b[0] = yb & 0x55555555;
    b[1] = (yb & 0xaaaaaaaa) >> 1;
    b[2] = rg & 0x55555555;
    b[3] = (rg & 0xaaaaaaaa) >> 1;

    // Cell 31: its right neighbour is bit 0 of the word to the right.
    i =     (r2>>30);
    i = i | ((r1>>30)<<3);
    i = i | ((r0>>30)<<6);
    i = i | ((bitboard[c][colRight+rowNext]&1)<<2);
    i = i | ((bitboard[c][colRight+rowCur]&1)<<5);
    i = i | ((bitboard[c][colRight+rowPrev]&1)<<8);
    for (int a = 2;a <= topPlane;a++) {
      i = i | (((extra[a] >> 31)&1)<<(a+7));
    }
    t = ruletable[i];
    yb = ((t & 3) << 30);
    rg = (((t >> 2) & 3) << 30);

    // Odd cells 1..29: result pairs are stored one position low (j-1, j).
    for (int j = 1;j < 31;j = j + 2) {
      i =     ((r2 >> (j-1))&7);
      i = i | (((r1 >> (j-1))&7)<<3);
      i = i | (((r0 >> (j-1))&7)<<6);
      for (int a = 2;a <= topPlane;a++) {
        i = i | (((extra[a] >> j)&1)<<(a+7));
      }
      t = ruletable[i];
      yb = yb | ((t & 3) << (j-1));
      rg = rg | (((t >> 2) & 3) << (j-1));
    }
    b[0] = b[0] | ((yb & 0x55555555) << 1);
    b[1] = b[1] |  (yb & 0xaaaaaaaa);
    b[2] = b[2] | ((rg & 0x55555555) << 1);
    b[3] = b[3] |  (rg & 0xaaaaaaaa);

    bitboard[c^1][bx+(by*gw)] = b[0];
    for (int a = 2;a <= topPlane;a++) {
      bitboard[a][bx+(by*gw)] = b[a-1];
    }
  }
}
// Advances the simulation by one generation: uploads the rule table, resets
// the device work counter, launches the kernel that matches h_numStateBits
// (nothing is launched for values outside 1..4), reports a launch error if
// there is one, then increments the generation counter and mirrors its low
// 32 bits to the device.
void run_cellauto(int numSMs) {
  unsigned int hBlockCounter = 0;
  cudaMemcpyToSymbol(ruletable,&h_ruletable,4096,0,cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(blockCounter, &hBlockCounter, sizeof(unsigned int), 0,   cudaMemcpyHostToDevice );

  const int wordsPerRow = h_board_width>>5;
  const int totalWords = (h_board_width*h_board_height)>>5;

  switch (h_numStateBits) {
    case 1: {
      dim3 threads(0x380);
      run_cellauto1<<<numSMs,threads>>>(wordsPerRow,totalWords);
      break;
    }
    case 2: {
      dim3 threads(0x380);
      run_cellauto2<<<numSMs,threads>>>(wordsPerRow,totalWords);
      break;
    }
    case 3:
    case 4: {
      dim3 threads(0x400);
      run_cellauto3<<<numSMs,threads>>>(wordsPerRow,totalWords);
      break;
    }
    default:
      break;
  }

  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(err));

  num_generations = num_generations + 1;
  int time = (int) (num_generations & 0xFFFFFFFF);
  cudaMemcpyToSymbol(d_num_generations,&time, sizeof(unsigned int), 0,   cudaMemcpyHostToDevice );
}
// Downloads board row `boardRow` of device plane `plane` into the global
// scratch `buffer` and appends its bits to the pixels of `row`: every pixel
// is shifted left by one and the cell's bit becomes the new bit 0.
static void append_plane_bit(png_bytep row,int plane,int boardRow) {
  cudaMemcpy(buffer,&h_bitboard[plane][boardRow*(h_board_width>>5)],(h_board_width >> 3),cudaMemcpyDeviceToHost);
  for (int x = 0;x < h_board_width;x++) {
    const unsigned long long bit = (buffer[x >> 6] >> (x & 63)) & 1;
    row[x] = (row[x]<<1) | bit;
  }
}

// Builds one 8-bit palette-index row for image row y (board row
// (height-1)-y). Planes h_numStateBits..2 supply the high bits, most
// significant first, and the current-generation plane supplies bit 0.
void write_to_row(png_bytep row,int y) {
  const int boardRow = (h_board_height-1)-y;
  const int current = (int) (num_generations & 1);

  for (int x = 0;x < h_board_width;x++) {row[x] = 0;}

  for (int plane = h_numStateBits;plane >= 2;plane--) {
    append_plane_bit(row,plane,boardRow);
  }
  append_plane_bit(row,current,boardRow);
}
// Writes the board to `filename` as an 8-bit palette PNG of
// h_board_width x h_board_height pixels using the host palette pal_color.
// Rows are produced by write_to_row().
//
// Errors (file cannot be opened, libpng failure, out of memory) are reported
// on stdout and the function returns; all resources are released on every
// path.
//
// Fixes over the previous version:
//  - fopen and the libpng struct creation are checked;
//  - libpng errors are caught with setjmp instead of aborting the process;
//  - the libpng write structures are destroyed (they were leaked);
//  - IHDR is set before PLTE, the order the libpng manual uses;
//  - the row buffer is one byte per pixel (it was 32 times larger) and the
//    scratch buffer is rounded up to whole 64-bit words.
void saveImg(char *filename) {
  FILE *fp = fopen(filename, "wb");
  if (fp == NULL) {
    printf("error: can not open \n");
    return;
  }

  png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
  if (!png_ptr) {
    fclose(fp);
    return;
  }
  png_infop info_ptr = png_create_info_struct(png_ptr);
  if (!info_ptr) {
    png_destroy_write_struct(&png_ptr, NULL);
    fclose(fp);
    return;
  }

  // volatile: modified after setjmp and read again in the cleanup below.
  png_byte * volatile row = NULL;
  buffer = NULL;

  if (setjmp(png_jmpbuf(png_ptr))) {
    // libpng reported an error; fall through to the cleanup.
    printf("error: can not write png \n");
  } else {
    png_init_io(png_ptr, fp);
    png_set_IHDR(png_ptr, info_ptr, h_board_width, h_board_height,8, PNG_COLOR_TYPE_PALETTE,PNG_INTERLACE_NONE,PNG_COMPRESSION_TYPE_BASE, PNG_FILTER_TYPE_BASE);
    png_set_PLTE(png_ptr, info_ptr, pal_color,PAL_SIZE);
    png_write_info(png_ptr, info_ptr);

    const size_t words = (((size_t) h_board_width) + 63) >> 6;
    buffer = (unsigned long long *) calloc(words ? words : 1,sizeof(unsigned long long));
    row = (png_byte*) malloc(h_board_width ? h_board_width : 1);

    if ((buffer != NULL) && (row != NULL)) {
      for (unsigned int y = 0; y < h_board_height; y++) {
        write_to_row((png_bytep) row,(int) y);
        png_write_row(png_ptr,(png_bytep) row);
      }
      png_write_end(png_ptr, NULL);
    } else {
      printf("error: out of memory\n");
    }
  }

  free((void *) row);
  free(buffer);
  buffer = NULL;
  png_destroy_write_struct(&png_ptr,&info_ptr);
  fclose(fp);
}
// Renders the current board into the device image dst (imageW x imageH).
// Uploads the palette and view offset, resets the device work counter and
// launches the zoom-out kernel for negative zoom, the zoom-in kernel
// otherwise, with numSMs blocks of 256 threads.
void display_cellauto(uchar4 *dst,int imageW, int imageH,int numSMs) {
  const dim3 threads(256);
  unsigned int hBlockCounter = 0;

  cudaMemcpyToSymbol(gpu_palette,&pal_color_byte, PAL_SIZE*3,0,cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(blockCounter, &hBlockCounter, sizeof(unsigned int), 0,   cudaMemcpyHostToDevice );
  cudaMemcpyToSymbol(offsetx,&h_offsetx,sizeof(int),0,cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(offsety,&h_offsety,sizeof(int),0,cudaMemcpyHostToDevice);

  const bool zoomedOut = (zoom < 0);
  if (!zoomedOut) {
    display_cellauto_zoom_in<<<numSMs,threads>>>(dst,imageW,imageH,zoom);
  } else {
    // The kernel expects the zoom exponent as a positive shift count.
    display_cellauto_zoom_out<<<numSMs,threads>>>(dst,imageW,imageH,-zoom);
  }
}

// Zeroes every bit plane on the device: resets the device work counter and
// launches clear_board2 with numSMs blocks of 256 threads over all 32-cell
// words of the board. `time` is passed through to the kernel.
void clear_board(int time,int numSMs) {
  const dim3 threads(256);
  unsigned int hBlockCounter = 0;
  cudaMemcpyToSymbol(blockCounter, &hBlockCounter, sizeof(unsigned int), 0,   cudaMemcpyHostToDevice );

  const int wordsPerRow = h_board_width>>5;
  const int totalWords = (h_board_width*h_board_height)>>5;
  clear_board2<<<numSMs,threads>>>(time,wordsPerRow,totalWords);
}

