#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>

void main_cal(int,int);
void mul_fft_fuku(double *,double *,int);
void mul_fft_dataset(short int *,int,double *,int);
void mul_fft_dataput_type3(short int *,int,int,double *,int);
void mul_fft_dataput_type2(short int *,int,int,double *,int);
void lmulmt_herf(short int *,short int *,short int *,int,int,int,double *,int);
void mul_harf_hosei(short int *,short int *,short int *,short int *,short int *,int,int,int,int,int);
void gyaku_herf(short int *,short int *,int,double *);
void gmul_xa(short int *,short int *,int,double *);
void gmul_precalc(short int *,short int *,int,double *);
void rdft(int, int, double *);

void main(void)
{
	main_cal(1024,1);
}

void main_cal(int bx,int seed)
{
	short int *stsu,*st1,*su1;
	int i,mr;
	double *cp;
	long t_1, t_2;
	
	stsu=(short int *)malloc((bx/4+4+2)*sizeof(short int)*2+10);
	cp=(double *)malloc((bx/4+4)*sizeof(double)*2+(bx/4+2)*sizeof(short int));
	if(cp == NULL){
		printf("Memory Allocation Failure!\n");
		exit(1);
	}
	printf("use memory = %d\n",(bx/4+4)*sizeof(short int)*2+2+(bx/4+4)*sizeof(double)*2+(bx/4+2)*sizeof(short int));
	
	for(i=0;i<(bx/4+4)*2;i++) stsu[i]=rand()%10000;
	srand(seed);
	
	mr=bx/4-1;
	st1=stsu;
	su1=st1+mr+2+3;
	
	*((int *)st1)=mr;
	for(i=2;i<2+mr+3;i++){
		st1[i]=rand()%10000;
	}
	if(st1[2]==0) st1[2]=rand()%10000;
	
	
t_1=clock();
	st1[2+mr+0]=0;
	st1[2+mr+1]=0;
	st1[2+mr+2]=0;
	gyaku_herf(st1,su1,mr,cp);
t_2=clock();
	lmulmt_herf(st1,su1,st1,mr,mr,mr,cp,mr);
	
	for(i=0;i<mr+2;i++) printf("%5d",st1[i]); printf("\n");
	if( st1[mr+2-2]!=9999 && st1[mr+2-2]!=0 ){printf("error seed=%d\n",seed);exit(1);}
	
	printf("\n%d msec.\n",t_2-t_1);
	
	free(stsu);
	free(cp);
}

void gyaku_herf(short int *a,short int *x,int kt,double *cp)
{
	int i,bp;
	double gy=0.0;
	
	for(i=2;i<2+4+2;i++){
		gy+=a[i];
		gy*=10000.0;
	}
	gy/=10000.0;
	gy=1.0/gy;
	
	*((int *)x)=4+2-*((int *)a);
	while((int)(gy*=10000.0)==0){
		(*((int *)x))--;
	}
	
	for(i=2;i<2+4+2;i++){
		x[i]=gy;
		gy-=x[i];
		gy*=10000.0;
	}
	
	for(i=2+6;i<2+15;i++) x[i]=0;
	gmul_precalc(a+2,x+2,15,cp);
	gmul_precalc(a+2,x+2,15,cp);
	
	bp=8;
	while(bp<(kt+1)){
		//x=2*x-A*x*x
		gmul_xa(a+2,x+2,bp-1,cp);
		bp*=2;
	}
	
}

void gmul_precalc(short int *a,short int *b,int kt,double *acp)
{
	int i,shr;
	double *bcp;
	
	shr=(kt+1)*2;
	bcp=acp+shr+4;
	
	mul_fft_dataset(a,kt,acp,shr);
	rdft(shr, 1, acp);
	mul_fft_dataset(b,kt,bcp,shr);
	rdft(shr, 1, bcp);
	mul_fft_fuku(acp,bcp,shr);
	rdft(shr,-1, acp);
	mul_fft_dataput_type3(b,kt+kt,kt,acp,shr);
	
	if(b[0]==1) b[0]=0; else b[0]=1;
	for(i=kt-1;i>0;i--){
		b[i]=9999-b[i];
	}
	
	mul_fft_dataset(b,kt,acp,shr);
	rdft(shr, 1, acp);
	mul_fft_fuku(acp,bcp,shr);
	rdft(shr,-1, acp);
	mul_fft_dataput_type3(b,kt+kt-1,kt,acp,shr);
}

void gmul_xa(short int *a,short int *b,int kt,double *acp)
{
	int i,shr,b1,b2,b3,b4,be1,be2,be3,over_chk;
	double *bcp,und1,und2,und3;
	
	shr=(kt+1)*2;
	bcp=acp+shr+4;
	
	b1=b[0];
	b2=b[1];
	b3=b[2];
	b4=b[3];
	be1=b[kt+0];
	be2=b[kt+1];
	be3=b[kt+2];
	
	mul_fft_dataset(a,kt,acp,shr);
	rdft(shr, 1, acp);
	mul_fft_dataset(b,kt,bcp,shr);
	rdft(shr, 1, bcp);
	mul_fft_fuku(acp,bcp,shr);
	rdft(shr,-1, acp);
	
	mul_fft_dataput_type3(b+kt,kt,kt,acp,shr);
	
	mul_fft_dataset(a+kt,kt,acp,shr);
	rdft(shr, 1, acp);
	mul_fft_fuku(acp,bcp,shr);
	rdft(shr,-1, acp);
	
	und1=(double)be1*(shr/2.0);
	und2=(double)be2*(shr/2.0);
	und3=(double)be3*(shr/2.0);
	for(i=0;i<kt+4;i++){
		acp[i+2]+=a[i]*und1;
		acp[i+3]+=a[i]*und2;
		acp[i+4]+=a[i]*und3;
	}
	
	acp[kt+2]+=a[kt*2  ]*b1*(shr/2.0);
	acp[kt+3]+=a[kt*2  ]*b2*(shr/2.0);
	acp[kt+4]+=a[kt*2  ]*b3*(shr/2.0);
	acp[kt+5]+=a[kt*2  ]*b4*(shr/2.0);
	
	acp[kt+3]+=a[kt*2+1]*b1*(shr/2.0);
	acp[kt+4]+=a[kt*2+1]*b2*(shr/2.0);
	acp[kt+5]+=a[kt*2+1]*b3*(shr/2.0);
	acp[kt+6]+=a[kt*2+1]*b4*(shr/2.0);
	
	acp[kt+4]+=a[kt*2+2]*b1*(shr/2.0);
	acp[kt+5]+=a[kt*2+2]*b2*(shr/2.0);
	acp[kt+6]+=a[kt*2+2]*b3*(shr/2.0);
	acp[kt+7]+=a[kt*2+2]*b4*(shr/2.0);
	
	acp[kt+5]+=a[kt*2+3]*b1*(shr/2.0);
	acp[kt+6]+=a[kt*2+3]*b2*(shr/2.0);
	acp[kt+7]+=a[kt*2+3]*b3*(shr/2.0);
	acp[kt+8]+=a[kt*2+3]*b4*(shr/2.0);
	
	b[kt*2+0]=0;
	b[kt*2+1]=0;
	b[kt*2+2]=0;
	b[kt*2+3]=0;
	mul_fft_dataput_type2(b+kt,kt+kt,kt+4,acp,shr);
	
	over_chk=0;
	if(b[kt]==10000){
		over_chk=1;
	}
	else{
		for(i=kt*2+3;i>=kt;i--){
			b[i]=9999-b[i];
		}
	}
	
	mul_fft_dataset(b+kt+2,kt,acp,shr);
	rdft(shr, 1, acp);
	mul_fft_fuku(acp,bcp,shr);
	rdft(shr,-1, acp);
	
	acp[kt+2]+=b[kt+2]*be1*(shr/2.0);
	acp[kt+3]+=b[kt+2]*be2*(shr/2.0);
	acp[kt+4]+=b[kt+2]*be3*(shr/2.0);
	
	acp[kt+3]+=b[kt+3]*be1*(shr/2.0);
	acp[kt+4]+=b[kt+3]*be2*(shr/2.0);
	acp[kt+5]+=b[kt+3]*be3*(shr/2.0);
	
	acp[kt+4]+=b[kt+4]*be1*(shr/2.0);
	acp[kt+5]+=b[kt+4]*be2*(shr/2.0);
	acp[kt+6]+=b[kt+4]*be3*(shr/2.0);
	
	
	acp[kt+2]+=b[kt*2+2]*b1*(shr/2.0);
	acp[kt+3]+=b[kt*2+2]*b2*(shr/2.0);
	acp[kt+4]+=b[kt*2+2]*b3*(shr/2.0);
	
	acp[kt+3]+=b[kt*2+3]*b1*(shr/2.0);
	acp[kt+4]+=b[kt*2+3]*b2*(shr/2.0);
	acp[kt+5]+=b[kt*2+3]*b3*(shr/2.0);
	
	mul_fft_dataput_type3(b+kt+1,kt+kt,kt+3,acp,shr);
	
	if(over_chk==1){
		for(i=kt*2+3;i>=kt;i--){
			b[i]=9999-b[i];
		}
	}
	
	b[kt+2]+=be3;
	b[kt+1]+=be2;
	b[kt+0]+=be1;
	
	if(b[kt+2]>9999){b[kt+2]-=10000; b[kt+1]++;}
	if(b[kt+1]>9999){b[kt+1]-=10000; b[kt+0]++;}
	if(b[kt+0]>9999){b[kt+0]-=10000; }
}

void lmulmt_herf(short int *a,short int *b,short int *out,int kta,int ktb,int rkt,double *acp,int limit)
{
	int i,hlim,d,cy,shr,sis,ktachk,ktbchk,new_rkt,new_kta,new_ktb;
	short int *a1,*a2,*b1,*b2,*tmp,*t1,*t2;
	double *bcp;
	
	hlim=limit/2;
	
	shr=2;
	while(hlim*2+1>shr){
		shr*=2;
	}
	
	bcp=acp+shr+4;
	tmp=(short int *)(bcp+shr+4);
	a1=a+2;
	a2=a1+hlim;
	b1=b+2;
	b2=b1+hlim;
	t1=tmp;
	t2=tmp+hlim;
	
	ktachk=0;
	ktbchk=0;
	new_rkt=hlim*2;
	if(kta>new_rkt) { new_kta=hlim; ktachk=1;} else new_kta=kta-hlim;
	if(ktb>new_rkt) { new_ktb=hlim; ktbchk=1;} else new_ktb=ktb-hlim;
	
	for(i=0;i<new_rkt+3;i++) tmp[i]=0;
	
	mul_harf_hosei( a1, a2, b1, b2, tmp, new_kta, new_ktb, new_rkt, ktachk, ktbchk);
	
	mul_fft_dataset(b2,new_ktb,acp,shr);
	rdft(shr, 1, acp);
	mul_fft_dataset(a1,hlim,bcp,shr);
	rdft(shr, 1, bcp);
	mul_fft_fuku(acp,bcp,shr);
	rdft(shr,-1, acp);
	mul_fft_dataput_type2(t2,new_ktb+hlim,hlim+3,acp,shr);
	
	mul_fft_dataset(b1,hlim,acp,shr);
	rdft(shr, 1, acp);
	mul_fft_fuku(bcp,acp,shr);
	rdft(shr,-1, bcp);
	mul_fft_dataput_type2(t1,hlim+hlim,new_rkt,bcp,shr);
	
	mul_fft_dataset(a2,new_kta,bcp,shr);
	rdft(shr, 1, bcp);
	mul_fft_fuku(acp,bcp,shr);
	rdft(shr,-1, acp);
	mul_fft_dataput_type2(t2,hlim+new_kta,hlim+3,acp,shr);
	
	cy=0;
	for(i=hlim;i>=0;i--){
		d=tmp[i]+cy;
		cy=d*0.0001;
		tmp[i]=d-cy*10000;
		if(cy==0) break;
	}
	
	sis=*((int *)a)+*((int *)b);
	if(tmp[0]!=0){
		for(i=rkt-1;i>=0;i--){
			out[i+2]=tmp[i];
		}
	}
	else{
		for(i=rkt-1;i>=0;i--){
			out[i+2]=tmp[i+1];
		}
		sis--;
	}
	*((int *)out)=sis;
}

void mul_harf_hosei(short int *a1,short int *a2,short int *b1,short int *b2,short int *tmp,int kta,int ktb,int rkt,int ktachk,int ktbchk)
{
	int tmp1,tmp2,tmp3;
	long double gs;
	
	if(kta<2 && ktb<2){
		gs=(long double)(a2[0]*10000)*(b2[0]*10000);
	}
	else if(kta<2){
		gs=(long double)(a2[0]*10000)*(b2[0]*10000+b2[1]);
	}
	else if(ktb<2){
		gs=(long double)(a2[0]*10000+a2[1])*(b2[0]*10000);
	}
	else{
		gs=(long double)(a2[0]*10000+a2[1])*(b2[0]*10000+b2[1]);
	}
	tmp1=gs*0.000000000001;
	tmp2=(gs-(long double)tmp1*1000000000000)*0.00000001;
	tmp3=(gs-(long double)tmp1*1000000000000-(long double)tmp2*100000000)*0.0001;
	tmp[rkt  ]+=tmp1;
	tmp[rkt+1]+=tmp2;
	tmp[rkt+2]+=tmp3;
	
	gs=0.0;
	if(ktbchk==1) gs+=(long double)(a1[0]*100000000.0+a1[1]*10000.0+a2[2])*b1[rkt];
	if(ktachk==1) gs+=(long double)(b1[0]*100000000.0+b1[1]*10000.0+b2[2])*a1[rkt];
	tmp1=gs*0.000000000001;
	tmp2=(gs-(long double)tmp1*1000000000000.0)*0.00000001;
	tmp3=(gs-(long double)tmp1*1000000000000.0-(long double)tmp2*100000000.0)*0.0001;
	tmp[rkt  ]+=tmp1;
	tmp[rkt+1]+=tmp2;
	tmp[rkt+2]+=tmp3;
	
	tmp[rkt+1]+=tmp[rkt+2]/10000;
	tmp[rkt+2]%=10000;
	tmp[rkt  ]+=tmp[rkt+1]/10000;
	tmp[rkt+1]%=10000;
	tmp[rkt-1]+=tmp[rkt  ]/10000;
	tmp[rkt  ]%=10000;
}

void mul_fft_dataput_type3(short int *c,int loc,int out_len,double *acp,int shr)
{
	int i,start,deep;
	double scale,rgs;
	__int64 cy;
	
	scale=2.0/shr;
	start=shr-(loc+1);
	
	deep=3;
	while( deep > -1 ){
		if( start + out_len + deep < shr+4 ) break;
		deep--;
	}
	
	cy=0;
	for(i=out_len+deep;i>out_len-1;i--){
		rgs=acp[i+start]*scale+0.5+cy;
		cy=rgs*0.0001;
	}
	for(i=out_len-1;i>=0;i--){
		rgs=acp[i+start]*scale+0.5+cy;
		cy=rgs*0.0001;
		c[i]=rgs-(double)cy*10000.0;
	}
}

void mul_fft_dataput_type2(short int *c,int loc,int out_len,double *acp,int shr)
{
	int i,start,deep;
	double scale,rgs;
	__int64 cy;
	
	scale=2.0/shr;
	start=shr-(loc+1);
	
	deep=3;
	while( deep > -1 ){
		if( start + out_len + deep < shr+4 ) break;
		deep--;
	}
	
	cy=0;
	for(i=out_len+deep;i>out_len-1;i--){
		rgs=acp[i+start]*scale+0.5+cy;
		cy=rgs*0.0001;
	}
	for(i=out_len-1;i>=0;i--){
		rgs=acp[i+start]*scale+0.5+cy+c[i];
		cy=rgs*0.0001;
		c[i]=rgs-(double)cy*10000.0;
	}
	if((int)cy==1) c[0]+=10000;
}

void mul_fft_dataset(short int *a,int kta,double *acp,int shr)
{
	int i,start;
	
	start=shr-kta;
	for(i=0;i<start;i++) acp[i]=0.0;
	for(i=0;i<kta;i++) acp[start+i]=a[i];
	acp[shr]=0.0;acp[shr+1]=0.0;acp[shr+2]=0.0;acp[shr+3]=0.0;
}

void mul_fft_fuku(double *acp,double *bcp,int shr)
{

	int i;
	double ra,rb,rc,rd;
	
	acp[0]=acp[0]*bcp[0];
	acp[1]=acp[1]*bcp[1];
	for(i=2;i<shr;i+=2){
		ra=acp[i  ];
		rb=acp[i+1];
		rc=bcp[i  ];
		rd=bcp[i+1];
		acp[i  ]=ra*rc-rb*rd;
		acp[i+1]=rb*rc+ra*rd;
	}
}

–ß‚é