#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <zlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdlib.h>
#include <png.h>
#include <math.h>
//#include "cutil_inline.h"

// Host-side board dimensions in cells. Set by create_board23() (from the
// p_board_* exponents) or by read_png_file() (from the image size).
unsigned int h_board_width = 0;
unsigned int h_board_height = 0;
// log2 of the default board dimensions; create_board23() computes 1 << p_board_*.
char p_board_width = 12;
char p_board_height = 12;
// Host scratch buffer holding one bit-packed board row. Allocated and freed
// by read_png_file() and saveImg(); used by read_from_row() / write_to_row().
unsigned long long * buffer;
// Bit depth of the PNG being read; set in read_png_file(), used by read_from_row().
int png_bit_depth;
// Device copies of the board dimensions, uploaded in create_board24().
__device__ unsigned int BOARD_WIDTH;
__device__ unsigned int BOARD_HEIGHT;

// Device copy of the rule table; uploaded from h_ruletable in run_cellauto().
__device__ unsigned char ruletable[8];
// Host rule table, defined in another translation unit.
extern unsigned char h_ruletable[8];

extern char p_board_width;
extern char p_board_height;

// View offset and zoom level, defined in another translation unit.
extern int h_offsetx;
extern int h_offsety;
extern int zoom;
int color_phase;
// C-linkage entry points implemented in this file.
extern "C" void init_cellauto(int seed);
extern "C" void run_cellauto(int numSMs);
extern "C" void display_cellauto(uchar4 *dst,int imageW, int imageH,int numSMs);
extern "C" void paint_cell(int x,int y,int c);

extern "C" void saveF(char *filename);
extern "C" void saveImg(char *filename);
extern "C" void init_color_palette();

extern "C" bool openF(char *filename);
extern "C" bool create_board23();
extern "C" int get_red(int i);
extern "C" int get_green(int i);
extern "C" int get_blue(int i);


// Host-held device pointers to the bit planes (one bit per cell, 32 cells per
// word). Planes 0..h_numStateBits are allocated by create_board24().
unsigned int *h_bitboard[5] = {NULL,NULL,NULL,NULL,NULL};
// Generation counter; its low bit selects which of planes 0/1 is "current".
unsigned long long num_generations = 0;
// Device copy of the low 32 bits of num_generations.
__device__ unsigned int d_num_generations = 0;

// Device-side table of the plane pointers held in h_bitboard.
__device__ unsigned int *bitboard[5];

// Highest plane index in use (planes 0..h_numStateBits exist).
int h_numStateBits = 1;
__device__ int d_numStateBits = 0;

// Work counter the kernels advance with atomicAdd to claim chunks of work.
// The host resets it to 0 before each launch that uses it.
__device__ int blockCounter = 0;
// Device copies of h_offsetx / h_offsety, uploaded in display_cellauto().
__device__ int offsetx;
__device__ int offsety;

// Random seed bits consumed by init_board2 (device copy).
__device__ unsigned int d_rand_bits[10240];

// Host staging array for the random seed bits.
unsigned int h_rand_bits[10240];

#define PAL_SIZE 4
// Default palette. init_color_palette() takes the low byte as red, the next
// byte as green and the third byte as blue.
unsigned int my_palette[PAL_SIZE] = {0x600000,0xFF00FF,0x0000FF,0xFFFF00};
// Device palette as packed R,G,B bytes; uploaded in display_cellauto().
__device__ unsigned char gpu_palette[PAL_SIZE*3];
// Host palette in libpng form (used by saveImg and the get_* accessors).
png_color pal_color[PAL_SIZE];
// Host palette as packed R,G,B bytes (source for gpu_palette).
unsigned char pal_color_byte[PAL_SIZE*3];
void clear_board(int time,int numSMs);
bool create_board23();
// Stores one palette entry in both host palette representations: the libpng
// pal_color table and the packed R,G,B byte array pal_color_byte.
// i is not range-checked against PAL_SIZE.
void set_color(int i,int red,int green,int blue) {
  png_color &entry = pal_color[i];
  entry.red = red;
  entry.green = green;
  entry.blue = blue;

  unsigned char *packed = &pal_color_byte[i*3];
  packed[0] = red;
  packed[1] = green;
  packed[2] = blue;
}
// Floating-point variant of set_color: each component is multiplied by 255
// and truncated to int before being stored.
void set_colorf(int i,double red,double green,double blue) {
  const int r = (int) (red*255.0);
  const int g = (int) (green*255.0);
  const int b = (int) (blue*255.0);
  set_color(i,r,g,b);
}
// Returns the red component of palette entry i (i is not range-checked).
int get_red(int i) {
  const png_color &entry = pal_color[i];
  return entry.red;
}
// Returns the green component of palette entry i (i is not range-checked).
int get_green(int i) {
  const png_color &entry = pal_color[i];
  return entry.green;
}
// Returns the blue component of palette entry i (i is not range-checked).
int get_blue(int i) {
  const png_color &entry = pal_color[i];
  return entry.blue;
}
// Copies up to PAL_SIZE entries from a libpng palette into the host palette
// tables via set_color().
//   p  - source palette entries
//   ps - number of entries in p; values above PAL_SIZE are clamped
void update_pal_color(png_colorp p,int ps) {
  const int count = (ps > PAL_SIZE) ? PAL_SIZE : ps;
  for (int idx = 0;idx < count;++idx) {
    const png_color &src = p[idx];
    set_color(idx,src.red,src.green,src.blue);
  }
}
// Writes the state c of a single cell into every bit plane.
// Launched by paint_cell() as <<<1,1>>>; the read-modify-write of the board
// word is not atomic.
//   x, y - cell coordinates, wrapped with BOARD_WIDTH-1 / BOARD_HEIGHT-1 masks
//   c    - cell state: bit 0 goes to the plane selected by the generation
//          parity, bit d_numStateBits to the other of planes 0/1, and bit
//          (plane-1) to each plane >= 2
__global__ void paint_cell2(int x,int y,int c) {
  int wordsPerRow = (BOARD_WIDTH>>5);
  x = x & (BOARD_WIDTH-1);
  y = y & (BOARD_HEIGHT-1);
  int cur = d_num_generations & 1;
  int word = (wordsPerRow*y)+(x >> 5);
  int shift = x&31;
  for (int plane = 0;plane <= d_numStateBits;plane++) {
    int bit;
    if (plane >= 2) {
      bit = (c >> (plane-1)) & 1;
    } else if (plane == (cur^1)) {
      bit = (c >> d_numStateBits) & 1;
    } else {
      bit = c & 1;
    }
    int cleared = bitboard[plane][word] & ~(1 << shift);
    bitboard[plane][word] = cleared | (bit << shift);
  }
}
// Paints state c at the board cell under view coordinate (x, y).
// The view coordinate is scaled by the current zoom (right shift when zoom is
// positive, left shift by -zoom otherwise), offset by h_offsetx / h_offsety,
// and handed to a single-thread paint_cell2 launch.
void paint_cell(int x,int y,int c) {
  int dx;
  int dy;
  if (zoom > 0) {
    dx = x>>zoom;
    dy = y>>zoom;
  } else {
    dx = x<<-zoom;
    dy = y<<-zoom;
  }
  paint_cell2<<<1,1>>>(h_offsetx + dx,h_offsety + dy,c);
}

// Seeds the top-left w x h rectangle of plane c from the bits in d_rand_bits.
// Launched by init_cellauto() as <<<1,1>>>; one thread walks the rectangle.
//   c    - index of the plane to seed
//   w, h - size of the seeded rectangle in cells; bit (y*w + x) of d_rand_bits
//          is OR-ed into cell (x, y)
// Cells whose bit index falls beyond the 32*10240 bits held by d_rand_bits are
// left untouched.
__global__ void init_board2(int c,int w,int h) {
  // Total number of bits available in d_rand_bits[10240].
  const int randBitCount = 32*10240;
  int bw = (BOARD_WIDTH>>5);
  for (int y = 0;y < h;y++) {
    for (int x = 0;x < w;x++) {
      int i = (y*w)+x;
      // Fixed off-by-one: bit index randBitCount itself maps to
      // d_rand_bits[10240], one element past the end of the array.
      if (i >= randBitCount) {break;}
      int b = ((d_rand_bits[i >> 5] >> (i & 31))&1);
      int m = bitboard[c][(bw*y)+(x >> 5)];
      bitboard[c][(bw*y)+(x >> 5)] = m | (b << (x&31));
    }
  }
}

// Loads the default palette from my_palette into the host palette tables.
// Each packed value is split as: low byte = red, next byte = green,
// third byte = blue.
void init_color_palette() {
  int i = 0;
  while (i < PAL_SIZE) {
    int packed = my_palette[i];
    int r = packed & 255;
    int g = (packed >> 8) & 255;
    int b = (packed >> 16) & 255;
    set_color(i,r,g,b);
    i++;
  }
}

// Allocates and zeroes the device bit planes for a board of
// h_board_width x h_board_height cells and publishes the board parameters
// to the device symbols (BOARD_WIDTH, BOARD_HEIGHT, d_numStateBits, bitboard).
// Also resets the host palette to the defaults via init_color_palette().
//
// Planes left over from a previous call are released first, so the function
// can be called repeatedly without leaking device memory.
//
// Returns true on success. Returns false if h_numStateBits does not fit the
// five-entry plane table or if any plane cannot be allocated; in that case
// every plane allocated by this call is freed again and h_bitboard[] is NULL.
bool create_board24() {
  printf("h_board_width: %d \n",h_board_width);
  printf("h_board_height: %d \n",h_board_height);

  const int maxPlanes = (int) (sizeof(h_bitboard)/sizeof(h_bitboard[0]));

  init_color_palette();
  cudaMemcpyToSymbol(BOARD_WIDTH,&h_board_width,sizeof(h_board_width),0,cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(BOARD_HEIGHT,&h_board_height,sizeof(h_board_height),0,cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(d_numStateBits,&h_numStateBits,sizeof(h_numStateBits),0,cudaMemcpyHostToDevice);

  // Release planes from an earlier board so re-creation does not leak.
  for (int i = 0;i < maxPlanes;i++) {
    if (h_bitboard[i] != NULL) {
      cudaFree(h_bitboard[i]);
      h_bitboard[i] = NULL;
    }
  }

  // Planes 0..h_numStateBits are used; refuse counts that would overrun the table.
  if (h_numStateBits < 0 || h_numStateBits >= maxPlanes) {return false;}

  // One bit per cell; computed in 64 bits so large boards do not overflow.
  const size_t planeBytes = (size_t) ((((unsigned long long) h_board_width)*h_board_height)>>3);

  for (int i = 0;i <= h_numStateBits;i++) {
    printf("%d MB \n",(int) (planeBytes>>20));
    cudaError_t e = cudaMalloc(&h_bitboard[i],planeBytes);
    if (e != cudaSuccess) {
      // Any allocation failure is fatal for the board; undo partial work.
      h_bitboard[i] = NULL;
      for (int j = 0;j < i;j++) {
        cudaFree(h_bitboard[j]);
        h_bitboard[j] = NULL;
      }
      return false;
    }
    cudaMemset(h_bitboard[i],0,planeBytes);
    // Publish the plane pointer into slot i of the device-side table.
    cudaMemcpyToSymbol(bitboard,&h_bitboard[i],sizeof(h_bitboard[i]),i*sizeof(h_bitboard[i]),cudaMemcpyHostToDevice);
  }
  return true;
}
// Creates a board whose dimensions are the powers of two given by the
// p_board_width / p_board_height exponents, then delegates the allocation
// to create_board24(). Returns create_board24()'s result.
bool create_board23() {
  printf("p_board_width: %d \n",p_board_width);
  printf("p_board_height: %d \n",p_board_height);
  const int widthCells = 1 << p_board_width;
  const int heightCells = 1 << p_board_height;
  h_board_width = widthCells;
  h_board_height = heightCells;
  return create_board24();
}

// Resets the board: clears every plane, seeds a rectangle in the top-left
// corner of each plane with pseudo-random bits, resets the view offset to
// (-8, -8) and the generation counter to 0.
//   seed - value passed to srand(); the same seed reproduces the same board
// The seeded rectangle is 128 x 128 cells, clamped to the board size so that
// small boards are not written out of bounds. Planes that are not allocated
// are skipped.
void init_cellauto(int seed) {
  printf("init_cellauto %d \n",RAND_MAX);
  srand(seed);

  int sw = 8*8*2;
  int sh = 8*8*2;
  if ((unsigned int) sw > h_board_width) {sw = (int) h_board_width;}
  if ((unsigned int) sh > h_board_height) {sh = (int) h_board_height;}

  h_offsetx = -8;
  h_offsety = -8;

  // 64-bit size, matching create_board24(); a 32-bit product overflows for
  // large boards.
  const size_t planeBytes = (size_t) ((((unsigned long long) h_board_width)*h_board_height)>>3);
  for (int i = 0;i <= h_numStateBits;i++) {
    if (h_bitboard[i] != NULL) {cudaMemset(h_bitboard[i],0,planeBytes);}
  }

  // Number of whole 32-bit words covering sw*sh seed bits (plus one spare).
  const int seedWords = ((sw*sh)>>5)+1;
  for (int c = 0;c <= h_numStateBits;c++) {
    // NOTE: rand() only yields bits up to RAND_MAX, so the top bit(s) of each
    // word are always zero.
    for (int i = 0;i < seedWords;i++) {
      h_rand_bits[i] = rand();
    }
    if (h_bitboard[c] == NULL) {continue;}

    cudaMemcpyToSymbol(d_rand_bits,h_rand_bits,seedWords*sizeof(unsigned int),0,cudaMemcpyHostToDevice);

    init_board2<<<1,1>>>(c,sw,sh);
  }

  num_generations = 0;
  unsigned int generation32 = (unsigned int) (num_generations & 0xFFFFFFFF);
  cudaMemcpyToSymbol(d_num_generations,&generation32,sizeof(unsigned int),0,cudaMemcpyHostToDevice);
}
// File signature: five 32-bit words that saveF() writes at the start of a
// save file and openF() checks word by word before reading anything else.
int id46[5] = {-345784765,-751255234,180337918,-1244203445,-756621646};
// Helper for read_from_row(): takes bit `bit` of every pixel in `row`
// (pixels are `depth` bits apart), packs the bits 64 per word into the global
// scratch `buffer`, and copies the packed row to row y2 of device plane `plane`.
static void upload_row_plane(png_bytep row,int depth,int bit,unsigned int *plane,int y2) {
  for (int w = 0;w < (h_board_width>>6);w++) {buffer[w] = 0;}
  for (int x = 0;x < h_board_width;x++) {
    int pos = x*depth;
    unsigned long long v = (row[pos >> 3] >> ((pos+bit)&7)) & 1;
    buffer[x >> 6] = buffer[x >> 6] | (v << (x & 63));
  }
  cudaMemcpy(&plane[y2*(h_board_width>>5)],buffer,(h_board_width >> 3),cudaMemcpyHostToDevice);
}
// Splits one decoded PNG row into bit planes and uploads them to the device.
//   row - pixel data for image row y (bit depth taken from png_bit_depth)
//   y   - image row index; stored vertically flipped (board row height-1-y)
// Pixel bit 0 goes to the plane selected by the generation parity, pixel bit
// h_numStateBits to the other of planes 0/1, and pixel bits
// 1..h_numStateBits-1 to planes 2..h_numStateBits.
void read_from_row(png_bytep row,int y) {
  int y2 = (h_board_height-1)-y;
  int depth = png_bit_depth;

  for (int bit = 1;bit < h_numStateBits;bit++) {
    upload_row_plane(row,depth,bit,h_bitboard[bit+1],y2);
  }

  int g = num_generations & 1;
  upload_row_plane(row,depth,0,h_bitboard[g],y2);
  upload_row_plane(row,depth,h_numStateBits,h_bitboard[g^1],y2);
}
// Loads a board from a palette PNG.
// The image must be PNG_COLOR_TYPE_PALETTE with power-of-two width and
// height. On success the board is re-created at the image size
// (create_board24), the palette is taken from the file's PLTE chunk, and
// every row is uploaded through read_from_row().
//
// Returns true on success; false if the file cannot be opened, is not an
// accepted image, or a host/device allocation fails. All resources (FILE,
// libpng structs, scratch buffers) are released on every path.
//
// NOTE(review): no setjmp(png_jmpbuf(...)) handler is installed, so a libpng
// decoding error still goes through libpng's default error handling rather
// than returning false here.
bool read_png_file(const char *filename) {

  FILE *fp = fopen(filename, "rb");
  // openF() relies on a false return to fall back to a fresh board when the
  // companion PNG does not exist, so a missing file must not reach libpng.
  if (fp == NULL) {return false;}

  png_structp png = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
  if(!png) {
    fclose(fp);
    return false;
  }

  png_infop info = png_create_info_struct(png);
  if(!info) {
    png_destroy_read_struct(&png, NULL, NULL);
    fclose(fp);
    return false;
  }

  bool ok = false;
  png_byte *row = NULL;
  buffer = NULL;

  // Single-pass block: `break` jumps to the shared cleanup below.
  do {
    png_init_io(png, fp);
    png_read_info(png, info);

    int png_width      = png_get_image_width(png, info);
    int png_height     = png_get_image_height(png, info);
    int png_color_type = png_get_color_type(png, info);
    png_bit_depth  = png_get_bit_depth(png, info);

    if (png_color_type != PNG_COLOR_TYPE_PALETTE) {break;}
    // Dimensions must be positive powers of two (the kernels wrap with masks).
    if (png_width <= 0 || (png_width & (png_width-1)) != 0) {break;}
    if (png_height <= 0 || (png_height & (png_height-1)) != 0) {break;}

    h_board_width = png_width;
    h_board_height = png_height;
    if (create_board24() == false) {break;}

    // The palette array returned by png_get_PLTE belongs to libpng; it is
    // released by png_destroy_read_struct and must not be passed to free().
    int np = 0;
    png_colorp pal_colorp = NULL;
    if (png_get_PLTE(png, info, &pal_colorp,&np) != 0 && pal_colorp != NULL) {
      update_pal_color(pal_colorp,np);
    }

    // read_from_row() writes whole 64-bit words, so round the row buffer up
    // to a multiple of 8 bytes (at least one word).
    size_t bufferWords = (((size_t) h_board_width)+63)>>6;
    if (bufferWords == 0) {bufferWords = 1;}
    buffer = (unsigned long long *) malloc(bufferWords*sizeof(unsigned long long));
    row = (png_byte*) malloc(png_get_rowbytes(png,info));
    if (buffer == NULL || row == NULL) {break;}

    for(int y = 0; y < png_height; y++) {
      png_read_row(png,row,NULL);
      read_from_row(row,y);
    }
    ok = true;
  } while (0);

  free(row);
  free(buffer);
  buffer = NULL;
  png_destroy_read_struct(&png, &info, NULL);
  fclose(fp);
  return ok;

}

// Opens a save file written by saveF() and restores the simulation state.
// File layout: 5 x 4-byte signature (id46), 2-byte version (must be 0),
// 8 reserved bytes, 8-byte generation counter, 8-byte rule table.
// The board contents are then loaded from "<filename>.png"; if that fails a
// default-size board is created and randomly seeded with init_cellauto(0).
//
// Returns true on success. Returns false (after printing an error) when the
// file cannot be opened, the signature or version is wrong, the header is
// truncated, or the fallback board cannot be allocated. num_generations and
// h_ruletable are only updated once the whole header has been read.
bool openF(char *filename) {

  FILE * f = fopen(filename,"rb");
  if (f == NULL) {
    printf("error: can not open \n");
    return false;
  }
  for (int i = 0;i < 5;i++) {
    int id2 = 0;
    if (fread(&id2,4,1,f) != 1 || id46[i] != id2) {
      printf("error: wrong id \n");
      fclose(f);
      return false;
    }
  }
  unsigned short v = 0;
  if (fread(&v,2,1,f) != 1 || v > 0) {
    printf("error: wrong version \n");
    fclose(f);
    return false;
  }

  unsigned char reserved[8];
  unsigned long long generations = 0;
  unsigned char rules[8];
  bool headerOk = fread(reserved,1,8,f) == 8;
  headerOk = headerOk && fread(&generations,8,1,f) == 1;
  headerOk = headerOk && fread(rules,1,8,f) == 8;
  // Nothing else is read from this file; close it before the PNG work.
  fclose(f);
  if (!headerOk) {
    printf("error: truncated file \n");
    return false;
  }
  num_generations = generations;
  for (int i = 0;i < 8;i++) {h_ruletable[i] = rules[i];}

  // Board contents live in a companion image named "<filename>.png".
  bool loaded = false;
  char *filename2 = (char *) malloc(strlen(filename)+10);
  if (filename2 != NULL) {
    strcpy(filename2,filename);
    strcat(filename2,".png");
    loaded = read_png_file(filename2);
    free(filename2);
  }
  if (loaded == false) {
    if (create_board23() == false) {
      printf("error: out of memory\n");
      return false; 
    }
    init_cellauto(0); 
  }
  return true;
}

// Writes the save-file header read back by openF():
// 5 x 4-byte signature (id46), 2-byte version 0, 8 zero bytes,
// 8-byte generation counter, 8-byte rule table.
// Prints an error and returns without writing if the file cannot be opened.
void saveF(char *filename) {

  FILE * f = fopen(filename,"wb");
  if (f == NULL) {
    printf("error: can not open \n");
    return;
  }
  fwrite(id46,4,5,f);
  unsigned short v = 0;
  fwrite(&v,2,1,f);
  // Eight reserved bytes, written as zeros.
  const char reserved[8] = {0,0,0,0,0,0,0,0};
  fwrite(reserved,1,8,f);
  fwrite(&num_generations,8,1,f);
  fwrite(&h_ruletable,1,8,f);
  fclose(f);

}

// Renders the board into an imageW x imageH pixel buffer at zoom-in level
// psize: each cell covers 2^psize x 2^psize pixels.
//
// Launch: 1-D blocks, any grid size. Work is handed out dynamically: thread 0
// of each block claims a chunk of blockDim.x consecutive pixels by
// atomicAdd on blockCounter, which the host must reset to 0 before launch.
// Uses 4 bytes of static shared memory.
//
//   dst            - output pixels, row-major, imageW per row
//   imageW, imageH - output size in pixels
//   psize          - zoom exponent (>= 0)
// The pixel colour is gpu_palette[a], where a = (other plane bit << 1) |
// current plane bit, "current" being selected by the generation parity.
__global__ void display_cellauto_zoom_in(uchar4 *dst,int imageW, int imageH,int psize) {
  __shared__ unsigned int blockIndex;
  int gridWidth = imageW;
  int numBlocks = imageW*imageH;
  int cx = (imageW >> psize) / 2;
  int cy = (imageH >> psize) / 2;

  while(1) {
    if (threadIdx.x==0) {
      blockIndex = atomicAdd(&blockCounter, 1);
      blockIndex = blockIndex * blockDim.x;
    }
    __syncthreads();
    int chunkBase = blockIndex;
    // Second barrier: every thread must have copied blockIndex before
    // thread 0 is allowed to overwrite it on the next iteration.
    __syncthreads();
    // chunkBase is identical for the whole block, so the exit is uniform and
    // no thread is left waiting at a barrier.
    if (chunkBase >= numBlocks) break;
    int blockIndex2 = chunkBase + threadIdx.x;
    // Tail of the last chunk: nothing to draw, but keep taking part in the loop.
    if (blockIndex2 >= numBlocks) continue;
    int blockX = blockIndex2 % (gridWidth);
    int blockY = blockIndex2 / (gridWidth);
    int x = ((blockX >> psize)+(offsetx)-cx) & (BOARD_WIDTH-1);
    int y = ((blockY >> psize)+(offsety)-cy) & (BOARD_HEIGHT-1);
    int c = (d_num_generations&1);
    int bw6 = (BOARD_WIDTH>>5);
    int i = (x >> 5) + (y*bw6);
    int a = ((bitboard[c^1][i] >> (x & 31))&1);
    a = (a << 1) | ((bitboard[c][i] >> (x & 31))&1);
    uchar4 color;
    color.x = gpu_palette[(a*3)+0];
    color.y = gpu_palette[(a*3)+1];
    color.z = gpu_palette[(a*3)+2];
    color.w = 0;
    dst[(blockY*imageW)+blockX] = color;
  }
}
// Renders the board into an imageW x imageH pixel buffer at zoom-out level
// psize: each pixel shows the average palette colour of a
// 2^psize x 2^psize square of cells.
//
// Launch: 1-D blocks, any grid size. Work is handed out dynamically: thread 0
// of each block claims a chunk of blockDim.x consecutive pixels by
// atomicAdd on blockCounter, which the host must reset to 0 before launch.
// Uses 4 bytes of static shared memory.
//
//   dst            - output pixels, row-major, imageW per row
//   imageW, imageH - output size in pixels
//   psize          - zoom exponent (>= 0)
__global__ void display_cellauto_zoom_out(uchar4 *dst,int imageW, int imageH,int psize) {
  __shared__ unsigned int blockIndex;
  int gridWidth = imageW;
  int numBlocks = imageW*imageH;
  int cx = (imageW << psize) / 2;
  int cy = (imageH << psize) / 2;
  int s = 1 << psize;
  while(1) {
    if (threadIdx.x==0) {
      blockIndex = atomicAdd(&blockCounter, 1);
      blockIndex = blockIndex * blockDim.x;
    }
    __syncthreads();
    int chunkBase = blockIndex;
    // Second barrier: every thread must have copied blockIndex before
    // thread 0 is allowed to overwrite it on the next iteration.
    __syncthreads();
    // chunkBase is identical for the whole block, so the exit is uniform.
    if (chunkBase >= numBlocks) break;
    int blockIndex2 = chunkBase + threadIdx.x;
    // Tail of the last chunk: nothing to draw, but keep taking part in the loop.
    if (blockIndex2 >= numBlocks) continue;
    int blockX = blockIndex2 % (gridWidth);
    int blockY = blockIndex2 / (gridWidth);
    int x = ((blockX << psize)+(offsetx)-cx) & (BOARD_WIDTH-1);
    int y = ((blockY << psize)+(offsety)-cy) & (BOARD_HEIGHT-1);
    int c = (d_num_generations&1);
    int red = 0; 
    int green = 0; 
    int blue = 0; 
    int bw6 = (BOARD_WIDTH>>5);
    // Sum the palette colour of every cell in the s x s square.
    for (int x3 = 0;x3 < s;x3++) {
      for (int y3 = 0;y3 < s;y3++) {
        int x2 = (x+x3) & (BOARD_WIDTH-1);
        int y2 = (y+y3) & (BOARD_HEIGHT-1);

        int i = (x2 >> 5) + (y2*bw6);
        int a = ((bitboard[c^1][i] >> (x2 & 31))&1);
        a = (a << 1) | ((bitboard[c][i] >> (x2 & 31))&1);
        red = red + gpu_palette[(a*3)+0];
        green = green + gpu_palette[(a*3)+1];
        blue = blue + gpu_palette[(a*3)+2];
      }
    }
    // Divide by the s*s = 2^(2*psize) cells to get the average.
    int ps2 = psize*2;
    uchar4 color;
    color.x = red>>ps2;color.y = green>>ps2;
    color.z = blue>>ps2;color.w = 0;
    dst[(blockY*imageW)+blockX] = color;
  }
}
// Zeroes planes 0..d_numStateBits of the board, one 32-bit word per thread
// per step.
//
// Launch: 1-D blocks, any grid size. Thread 0 of each block claims a chunk
// of blockDim.x consecutive words by atomicAdd on blockCounter, which the
// host must reset to 0 before launch. Uses 4 bytes of static shared memory.
//
//   time      - unused
//   gridWidth - board width in 32-bit words
//   numBlocks - total number of words per plane
__global__ void clear_board2(int time,int gridWidth,int numBlocks) {
  __shared__ unsigned int blockIndex;  

  while(1) {
    if (threadIdx.x==0) {
      blockIndex = atomicAdd(&blockCounter, 1);
      blockIndex = blockIndex * blockDim.x;
    }
    __syncthreads();
    int chunkBase = blockIndex;
    // Second barrier: every thread must have copied blockIndex before
    // thread 0 is allowed to overwrite it on the next iteration.
    __syncthreads();
    // chunkBase is identical for the whole block, so the exit is uniform.
    if (chunkBase >= numBlocks) break;
    int blockIndex2 = chunkBase + threadIdx.x;
    // Tail of the last chunk: nothing to clear, but keep taking part in the loop.
    if (blockIndex2 >= numBlocks) continue;
    int blockX = blockIndex2 % gridWidth;
    int blockY = blockIndex2 / gridWidth;
    int gw = gridWidth;
    int bx = blockX;
    int by = blockY;
    for (int i = 0;i <= d_numStateBits;i++) {
      bitboard[i][bx+(by*gw)] = 0;
    }
  }
}

// Advances the automaton by one generation, 32 cells (one word) per thread
// per step.
//
// Launch: 1-D blocks, any grid size. Thread 0 of each block claims a chunk
// of blockDim.x consecutive words by atomicAdd on blockCounter, which the
// host must reset to 0 before launch. Uses 4 bytes of static shared memory.
//
//   gridWidth - board width in 32-bit words (wrapping uses gridWidth-1 as a
//               mask, so it is expected to be a power of two)
//   numBlocks - total number of words per plane
//
// With g = generation parity, plane g holds the current bit c and plane g^1
// the other state bit d. Six neighbours are read from plane g:
// (x, y-1), (x+1, y-1), (x+1, y), (x, y+1), (x-1, y+1), (x-1, y), with
// wrap-around. Their count 0..6 is accumulated bit-sliced in cb1/cb2/cb3,
// and the new bit is bit (2*d + c) of ruletable[count]. The result is
// written to plane g^1.
__global__ void run_cellauto1(int gridWidth,int numBlocks) {
  __shared__ unsigned int blockIndex;  
  while(1) {
    if (threadIdx.x==0) {
      blockIndex = atomicAdd(&blockCounter, 1);
      blockIndex = blockIndex * blockDim.x;
    }
    __syncthreads();
    int chunkBase = blockIndex;
    // Second barrier: every thread must have copied blockIndex before
    // thread 0 is allowed to overwrite it on the next iteration.
    __syncthreads();
    // chunkBase is identical for the whole block, so the exit is uniform.
    if (chunkBase >= numBlocks) break;
    int blockIndex2 = chunkBase + threadIdx.x;
    // Tail of the last chunk: nothing to compute, but keep taking part in the loop.
    if (blockIndex2 >= numBlocks) continue;
    int blockX = blockIndex2 % gridWidth;
    int blockY = blockIndex2 / gridWidth;
    int gw = gridWidth;
    int bx = blockX;
    int by = blockY;
    int bh = BOARD_HEIGHT;
    int g = (d_num_generations&1);

    unsigned int c = bitboard[g][bx+((by&(bh-1))*gw)];
    unsigned int d = bitboard[g^1][bx+((by&(bh-1))*gw)];
    unsigned int n1 = bitboard[g][bx+(((by-1)&(bh-1))*gw)];
    unsigned int n2 = (n1 >> 1) | (bitboard[g][((bx+1)&(gw-1))+(((by-1)&(bh-1))*gw)]<<31);
    unsigned int n3 = (c >> 1) | (bitboard[g][((bx+1)&(gw-1))+((by&(bh-1))*gw)]<<31);
    unsigned int n4 = bitboard[g][bx+(((by+1)&(bh-1))*gw)];
    unsigned int n5 = (n4 << 1) | (bitboard[g][((bx-1)&(gw-1))+(((by+1)&(bh-1))*gw)]>>31);
    unsigned int n6 = (c << 1) | (bitboard[g][((bx-1)&(gw-1))+((by&(bh-1))*gw)] >> 31);

    // Bit-sliced neighbour count: cb1 = bit 0, cb2 = bit 1, cb3 = bit 2.
    int cb1 = n1;
    int cb2 = n1 & n2;
    int cb3 = n1 & n2 & n3 & n4; 
    cb1 = cb1 ^ n2;
    cb2 = cb2 ^ (cb1 & n3);
    cb1 = cb1 ^ n3;
    cb2 = cb2 ^ (cb1 & n4);
    cb1 = cb1 ^ n4;
    cb3 = cb3 ^ (cb2 & cb1 & n5);
    cb2 = cb2 ^ (cb1 & n5);
    cb1 = cb1 ^ n5;
    cb3 = cb3 ^ (cb2 & cb1 & n6);
    cb2 = cb2 ^ (cb1 & n6);
    cb1 = cb1 ^ n6;
    // For each count 0..6: b selects the lanes with that count, and
    // -(bit) expands one rule-table bit to an all-ones / all-zeros mask.
    int b = ~cb3 & ~cb2 & ~cb1;
    int r = ruletable[0];
    int a = 0;
    a = a | (b & ~d & ~c & -(r & 1)) | (b & ~d &  c & -((r >> 1) & 1));
    a = a | (b &  d & ~c & -((r >> 2) & 1)) | (b &  d &  c & -((r >> 3) & 1));
    b = ~cb3 & ~cb2 & cb1;r = ruletable[1];
    a = a | (b & ~d & ~c & -(r & 1)) | (b & ~d &  c & -((r >> 1) & 1));
    a = a | (b &  d & ~c & -((r >> 2) & 1)) | (b &  d &  c & -((r >> 3) & 1));
    b = ~cb3 & cb2 & ~cb1;r = ruletable[2];
    a = a | (b & ~d & ~c & -(r & 1)) | (b & ~d &  c & -((r >> 1) & 1));
    a = a | (b &  d & ~c & -((r >> 2) & 1)) | (b &  d &  c & -((r >> 3) & 1));
    b = ~cb3 & cb2 & cb1;r = ruletable[3];
    a = a | (b & ~d & ~c & -(r & 1)) | (b & ~d &  c & -((r >> 1) & 1));
    a = a | (b &  d & ~c & -((r >> 2) & 1)) | (b &  d &  c & -((r >> 3) & 1));
    b = cb3 & ~cb2 & ~cb1;r = ruletable[4];
    a = a | (b & ~d & ~c & -(r & 1)) | (b & ~d &  c & -((r >> 1) & 1));
    a = a | (b &  d & ~c & -((r >> 2) & 1)) | (b &  d &  c & -((r >> 3) & 1));
    b = cb3 & ~cb2 & cb1;r = ruletable[5];
    a = a | (b & ~d & ~c & -(r & 1)) | (b & ~d &  c & -((r >> 1) & 1));
    a = a | (b &  d & ~c & -((r >> 2) & 1)) | (b &  d &  c & -((r >> 3) & 1));
    b = cb3 & cb2 & ~cb1;r = ruletable[6];
    a = a | (b & ~d & ~c & -(r & 1)) | (b & ~d &  c & -((r >> 1) & 1));
    a = a | (b &  d & ~c & -((r >> 2) & 1)) | (b &  d &  c & -((r >> 3) & 1));
    bitboard[g^1][bx+(by*gw)] = a;
  }
}
// Runs one generation of the automaton on the device.
// Uploads the rule table, resets the shared work counter, launches
// run_cellauto1 with numSMs blocks of 0x300 threads, reports any launch
// error, then increments num_generations and uploads its low 32 bits.
void run_cellauto(int numSMs) {
  unsigned int zeroCounter = 0;
  cudaMemcpyToSymbol(ruletable,&h_ruletable,8,0,cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(blockCounter,&zeroCounter,sizeof(unsigned int),0,cudaMemcpyHostToDevice);

  const int wordsPerRow = h_board_width>>5;
  const int totalWords = (h_board_width*h_board_height)>>5;
  dim3 threads(0x300);
  run_cellauto1<<<numSMs,threads>>>(wordsPerRow,totalWords);

  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    printf("Error: %s\n", cudaGetErrorString(err));
  }

  num_generations += 1;
  int generation32 = (int) (num_generations & 0xFFFFFFFF);
  cudaMemcpyToSymbol(d_num_generations,&generation32,sizeof(unsigned int),0,cudaMemcpyHostToDevice);
}
// Helper for write_to_row(): downloads row y2 of device plane `plane` into the
// global scratch `buffer`, then shifts every pixel in `row` left by one and
// puts the plane's bit for that cell into bit 0.
static void append_plane_bit(png_bytep row,unsigned int *plane,int y2) {
  cudaMemcpy(buffer,&plane[y2*(h_board_width>>5)],(h_board_width >> 3),cudaMemcpyDeviceToHost);
  for (int x = 0;x < h_board_width;x++) {
    row[x] = (row[x]<<1) | ((buffer[x >> 6] >> (x & 63)) & 1);
  }
}
// Builds one 8-bit-per-pixel image row from the device bit planes.
//   row - output, one byte per cell (h_board_width bytes)
//   y   - image row index; read from board row height-1-y (vertical flip)
// Bits are appended most significant first: the non-current one of planes
// 0/1, then planes h_numStateBits down to 2, then the current plane (as
// selected by the generation parity) in bit 0.
void write_to_row(png_bytep row,int y) {
  int y2 = (h_board_height-1)-y;
  int g1 = (int) (num_generations & 1);

  for (int x = 0;x < h_board_width;x++) {row[x] = 0;}

  append_plane_bit(row,h_bitboard[g1^1],y2);
  for (int plane = h_numStateBits;plane >= 2;plane--) {
    append_plane_bit(row,h_bitboard[plane],y2);
  }
  append_plane_bit(row,h_bitboard[g1],y2);
}
// Saves the board as an 8-bit palette PNG of h_board_width x h_board_height
// pixels, one pixel per cell, using the host palette pal_color. Rows are
// produced by write_to_row().
// Prints an error and writes nothing if the file cannot be opened or a
// libpng/host allocation fails. All resources are released on every path.
//
// NOTE(review): no setjmp(png_jmpbuf(...)) handler is installed, so a libpng
// write error still goes through libpng's default error handling.
void saveImg(char *filename) {
  FILE *fp = fopen(filename, "wb");
  if (fp == NULL) {
    printf("error: can not open \n");
    return;
  }

  png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
  png_infop info_ptr = NULL;
  if (png_ptr != NULL) {info_ptr = png_create_info_struct(png_ptr);}

  // write_to_row() reads whole 64-bit words from `buffer`, so round it up to
  // a multiple of 8 bytes (at least one word).
  size_t bufferWords = (((size_t) h_board_width)+63)>>6;
  if (bufferWords == 0) {bufferWords = 1;}
  buffer = (unsigned long long *) malloc(bufferWords*sizeof(unsigned long long));
  // One byte per pixel at bit depth 8.
  png_byte* row = (png_byte*) malloc(h_board_width > 0 ? h_board_width : 1);

  if (png_ptr != NULL && info_ptr != NULL && buffer != NULL && row != NULL) {
    png_init_io(png_ptr, fp);
    png_set_IHDR(png_ptr, info_ptr, h_board_width, h_board_height,8, PNG_COLOR_TYPE_PALETTE,PNG_INTERLACE_NONE,PNG_COMPRESSION_TYPE_BASE, PNG_FILTER_TYPE_BASE);
    png_set_PLTE(png_ptr, info_ptr, pal_color,PAL_SIZE);
    png_write_info(png_ptr, info_ptr);
    for (unsigned int y = 0; y < h_board_height; y++) {
      write_to_row(row,(int) y);
      png_write_row(png_ptr,row);
    }
    png_write_end(png_ptr, NULL);
  } else {
    printf("error: out of memory\n");
  }

  free(row);
  free(buffer);
  buffer = NULL;
  if (png_ptr != NULL) {png_destroy_write_struct(&png_ptr, &info_ptr);}
  fclose(fp);
}
// Draws the current board into dst (imageW x imageH pixels).
// Uploads the palette, resets the shared work counter, uploads the view
// offset, then launches the zoom-out kernel when zoom is negative (with
// exponent -zoom) and the zoom-in kernel otherwise. Both use numSMs blocks
// of 256 threads.
void display_cellauto(uchar4 *dst,int imageW, int imageH,int numSMs) {
  unsigned int zeroCounter = 0;

  cudaMemcpyToSymbol(gpu_palette,&pal_color_byte,PAL_SIZE*3,0,cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(blockCounter,&zeroCounter,sizeof(unsigned int),0,cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(offsetx,&h_offsetx,sizeof(int),0,cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(offsety,&h_offsety,sizeof(int),0,cudaMemcpyHostToDevice);

  dim3 threads(256);
  const bool zoomedOut = (zoom < 0);
  if (!zoomedOut) {
    display_cellauto_zoom_in<<<numSMs,threads>>>(dst,imageW,imageH,zoom);
  } else {
    display_cellauto_zoom_out<<<numSMs,threads>>>(dst,imageW,imageH,-zoom);
  }
}

// Clears every plane of the board on the device.
// Resets the shared work counter and launches clear_board2 with numSMs blocks
// of 256 threads over all board words. `time` is forwarded to the kernel.
void clear_board(int time,int numSMs) {
  unsigned int zeroCounter = 0;
  cudaMemcpyToSymbol(blockCounter,&zeroCounter,sizeof(unsigned int),0,cudaMemcpyHostToDevice);

  const int wordsPerRow = h_board_width>>5;
  const int totalWords = (h_board_width*h_board_height)>>5;
  dim3 threads(256);
  clear_board2<<<numSMs,threads>>>(time,wordsPerRow,totalWords);
}

