(*:Mathematica:: Version 7 *)
(*:Context: "swirl`"          *)
(*:Swirl-and-Ripple        *)
(*:Version  1              *)
(*Title: Training Sample Greedy*)
(*:Input: strainsup: SelectRows
                            CDFNormal01
 		            VarX, 				   
                            RanSample, 
		            StatAUCList
		            SelectDistanceMeasure*) 	
(*Author: 2010 Stuart G. Baker *)

									 
BeginPackage["swirlg`","swirlsup`"]

	  
TrainGAutomatic::usage="TrainGAutomatic[dataTRAIN,{maxgenes,w,sift,splitw,delta}]"

Clear[TrainGAutomatic]


Begin["Private`"]



TrainGAutomatic[dataTRAIN_,{maxgenes_,topnum_,sift_,splitw_,delta_},showtop3_:False]:=
  Module[{spec0,res},
  	  spec0={maxgenes,topnum,sift,splitw,delta};
	  res=TrainGSet[dataTRAIN,spec0,showtop3];
    Return[res]]

		 
(*-----------Select gene set, distance measure, and score------------------*)


	TrainGSet[dataTRAIN_,spec0_,showtop3_]:=
		  Module[{resS1,resS2,resS3, 
                         resR1, resR2, resR3,
			 auclistS1,auclistS2,auclistS3,
			 auclistR1,auclistR2,auclistR3,
			 auclistS1pad,auclistS2pad,auclistS3pad,
			 auclistR1pad,auclistR2pad,auclistR3pad,
			 aucS1,aucS2,aucS3,aucSvec,lens,resS,distanceS,
			 aucR1,aucR2,aucR3,aucRvec,lenR,resR,distanceR,
			 res,distance,score,auclist,auctab,auctabr,
			 posS1pad,posS2pad,posS3pad,
			 posR1pad,posR2pad,posR3pad},	
         (*score and distance scenarios*)
	    {resS2,auclistS2,auclistS2pad,posS2,posS2pad}=TrainGOne[dataTRAIN,spec0,{"Swirl",1},showtop3];
	    {resS3,auclistS3,auclistS3pad,posS3,posS3pad}=TrainGOne[dataTRAIN,spec0,{"Swirl",2},showtop3];
	    {resR2,auclistR2,auclistR2pad,posR2,posR2pad}=TrainGOne[dataTRAIN,spec0,{"Ripple",1},showtop3];
	    {resR3,auclistR3,auclistR3pad,posR3,posR3pad}=TrainGOne[dataTRAIN,spec0,{"Ripple",2},showtop3];
	 (*select distance measure given Swirl*)
	    aucS2=auclistS2[[-1]];
	    aucS3=auclistS3[[-1]];
	    aucSvec={aucS2,aucS3};
	    lenS=SelectDistanceMeasure[aucSvec,delta];
	    If[lenS==1,resS=resS2;aucS=aucS2;distanceS=1];
	    If[lenS==2,resS=resS3;aucS=aucS3;distanceS=2];
	 (*select distance measure given Ripple*)
	    aucR2=auclistR2[[-1]];
	    aucR3=auclistR3[[-1]];
	    aucRvec={aucR2,aucR3};
	    lenR=SelectDistanceMeasure[aucRvec,delta];
	    If[lenR==1,resR=resR2;aucR=aucR2;distanceR=1];
	    If[lenR==2,resR=resR3;aucR=aucR3;distanceR=2];
	(*select scoree*)
	   If[aucR>aucS,
	     res=resR;distance=distanceR;score="Ripple",
	     res=resS;distance=distanceS;score="Swirl"];
	 (*auc output*)
	    auctab={auclistS2pad,auclistS3pad,auclistR2pad,auclistR3pad};
	    postab={posS2pad,posS3pad,posR2pad,posR3pad};
	Return[{res,score,distance,auctab,postab}]]





(*---------------Select gene set given a distance measure and score -----------------------------*)

 
TrainGOne[dataTRAIN_,spec0_,{score_,distance_},showtop3_]:=
  Module[{res0,res1,res2,res3,res4, posINx,posOUTx,tlistx,mlist0x,mlist1x,vlist0x,vlist1x,
          auclistx,len,res,auclist,tADD,i,zero,auclistpad,postGvecINpad},
    (*starting value*)
 	 maxgenes=spec0[[1]];
         delta=spec0[[-1]];
	 res0=TrainGStart[dataTRAIN,spec0,{score,distance},showtop3];
	 tADD=1;
         res1=Append[res0,tADD];	   (*aucadd is checked*)
   (*add genes as along as aucadd is greater than delta*)
	  res2=NestWhile[TrainGAdd[dataTRAIN,#,{score,distance},showtop3]&, res1, (#[[-1]] > delta)&,1, maxgenes-1];
  	  {posINx,posOUTx,tlistx,mlist0x,mlist1x,vlist0x,vlist1x,aucaddx}=res2;
  (*drop aucadd from final list*)	
         res3=Drop[res2,-1];
  (*compute AUC under normal assumption*)
         auclist=tlistx;   (*MODIFY*)
  (*For showing AUC splits, pad remaining auclist and posin with zero values*) 
	  len=Length[auclist];
	  If[len >= maxgenes,
	    posINpad=Take[posINx,maxgenes];
	    auclistpad=Take[auclist,maxgenes],
	 (*else*)
	    zero=Table[0,{i,1,maxgenes-len}];
	    posINpad=Join[posINx,zero];
	    auclistpad=Join[auclist,zero]];
     (*take one less (if possible) because taking the last violates rule of LESS than deltaAUC*)
	  res4=TakeMax[#,len-1]& /@ res3;
 Return[{res4,auclist,auclistpad,posINx,posINpad}]]




	
(*----Select first gene---------------------------------*)


TrainGStart[{x0_,x1_},{maxgenes_,topnum_,sift_,splitw_,delta_},{score_,distance_},showtop3_]:=
Module[{tvec,posvec,tstat0,pos0,xlist0,xlist1,
    tlist,posvecIN,posvecOUT,res0,resx,pairs,aucvec,
    m0,m1,mlist0,mlist1,
    v0,v1,vlist0,vlist1,
    auclist,len,res,y0,y1,d00q,d01q,d10q,d11q},
  (*Filter top genes*) 
     {tvec,posvec}=Transpose@SelectTopGenesG[{x0,x1},topnum,{score,distance}];
    (*Show top 3 genes*)
    pairs=Transpose[{tvec,posvec}];
    If[showtop3,Print["  START: top 3 genes ",{score,distance,Take[pairs,-3]}]];
  (*Identify top gene*)
     tstat0=tvec[[-1]];
     pos0=posvec[[-1]];
  (*Centroid components of top genes*)
     xlist0=x0[[pos0]];
     xlist1=x1[[pos0]];
     tlist0={tstat0};
     m0=Mean[xlist0]//N;
     m1=Mean[xlist1]//N; 
     v0=Var[xlist0];
     v1=Var[xlist1];
     mlist0={m0};
     mlist1={m1};
     vlist0={v0};
     vlist1={v1};
  (*Update gene vectors*)
      posvecIN={pos0};
      posvecOUT=DeleteCases[posvec,pos0];
  (*Output*)
     res0={posvecIN,posvecOUT,tlist0,mlist0,mlist1,vlist0,vlist1};
   Return[res0]]


SelectTopGenesG[{x0_,x1_},topnum_,{score_,distance_}]:=
  Module[{d00q,d01q,d10q,d11q,d00,d01,d10,d11,r0,r1,t,pair,pairs,res},
    (*compute distance squared*)
      {d00q,d01q,d10q,d11q}=DistanceSquaredTrainG[{x0,x1}, distance];
  (*compute score*)
  If[score=="Ripple",
       r0=d00q-d01q;
       r1=d10q-d11q]; 	 
   If[score=="Swirl",
      d00=Sqrt[d00q];		  (*distance of x0 to centroid 0*)
      d01=Sqrt[d01q];		  (*distance of x0 to centroid 1*)
      d10=Sqrt[d10q];		  (*distance of x1 to centroid 0*)
      d11=Sqrt[d11q];		  (*distance of x1 to centroid 1*)  
      r0=d00/ReplaceZero[d00+d01]; (*probability classify x0 as 0 = true positive rate*)
      r1=d10/ReplaceZero[d10+d11]]; (*probability classify x1 as 0 = false positive rate*)  
      t=StatAUCList[r0,r1];
  (*select top*)
     pair=Transpose[{t,Range[Length[t]]}];
     pairs=Sort[pair];
     res=Take[pairs,-topnum];
Return[res]]      


DistanceSquaredTrainG[{x0_,x1_},distance_:1]:=
Module[{xc0,xc1,vm0,vm1,p0,n0,p1,n1,d00sq,d01sq,d10sq,d11sq,r0,r1,t,v0,v1,v0x,v1x},
     xc0=Mean[#]& /@ x0;
     xc1=Mean[#]& /@ x1;
      vm0=Var[#]& /@ x0;
      vm1=Var[#]& /@ x1;	
     {p0,n0}=Dimensions[x0];
    {p1,n1}=Dimensions[x1];
   If[distance==1,   vm=(vm0 (n0-1) + vm1 (n1-1) )/(n0+n1-2);   v0=vm;          v1=vm];	   
    If[distance==2,								      v0=vm0;         v1=vm1];	
(*squared distance measures matrix minus vector subtracts same number each column*)
  v0x=ReplaceZero[v0,.01];
  v1x=ReplaceZero[v1,.01];
    d00sq= (x0-xc0)^2/v0x;
    d01sq= (x0-xc1)^2/v1x;
    d10sq= (x1-xc0)^2/v0x;
    d11sq= (x1-xc1)^2/v1x; 
Return[{d00sq,d01sq,d10sq,d11sq}]];



 (*-----Select genes after the first and continue based on AUC---------------------------------------*)

																																		 																																		 
TrainGAdd[{x0_,x1_},{posIN_,posOUT_,tlist_,mlist0_,mlist1_,vlist0_,vlist1_,aucadd_},{score_,distance_},showtop3_]:=
Module[{z0,z1,
       d00s,d01s,d10s,d11s,	 
       d00q,d01q,d10q,d11q,
       d00z,d01z,d10z,d11z,
    	d0sum,d1sum,
        y0,y1,
	d00y,d01y,d10y,d11y,
	k,r0,r1,t,pair,pairs,
	ylist0,ylist1,
	tNEW,postNEW,	
	 m0NEW,m1NEW,v0NEW,v1NEW,tADD,	
	 posINx,posOUTx,
	tlistx,mlist0x,mlist1x,vlist0x,vlist1x},  
  (*gene expression of genes already selected*)
      z0=SelectRows[x0,posIN];
      z1=SelectRows[x1,posIN];
   (*distance squared of genes already selected*)
     {d00z,d01z,d10z,d11z}=DistanceSquaredTrain[{z0,z1},distance];
 (*sum of distance squared of genes already selected: columns index specimens*)
      d00s=Apply[Plus,d00z];	    (*sum of squared distance of x0 to centroid 0*)
      d01s=Apply[Plus,d01z];	    (*sum of squared distance of x0 to centroid 1*)
      d10s=Apply[Plus,d10z];	    (*sum of squared distance of x1 to centroid 0*)
      d11s=Apply[Plus,d11z];	    (*sum of squared distance of x1 to centroid 1*)
  (*gene expression of candidate genes in set posOUT*)
      y0=SelectRows[x0,posOUT];
      y1=SelectRows[x1,posOUT];
  (*Compute distances for candidate genes;   rows index genes;   columns index specimens*)
     {d00y,d01y,d10y,d11y}=DistanceSquaredTrain[{y0,y1},distance];
 (*Compute distance meassure for candidate genes + genes already in model: rows index genes; columns index specimens*)
     k=Length[d00y];              (*number of genes*)
     d00q=d00y + J[k,1] . {d00s};
     d01q=d01y + J[k,1] . {d01s};
     d10q=d10y + J[k,1] . {d10s};
     d11q=d11y + J[k,1] . {d11s};  
 (*Compute AUC for all canddiation genes*)
   If[score=="Swirl",
     d00=Sqrt[d00q];		           (*distance of x0 to centroid 0*)
     d01=Sqrt[d01q];		           (*distance of x0 to centroid 1*)
     d10=Sqrt[d10q];	         	  (*distance of x1 to centroid 0*)
     d11=Sqrt[d11q];		           (*distance of x1 to centroid 1*)
     r0=d00/ReplaceZero[d00+d01];  (*probability classify x0 as 0 = true positive rate*)
     r1=d10/ReplaceZero[d10+d11]]; (*probability classify x1 as 0 = false positive rate*)  
  If[score=="Ripple",
     r0=d00q-d01q;
     r1=d10q-d11q]; 	 
     t=StatAUCList[r0,r1];	  
 (*Find largest sum of distance squared and identify corresponding gene*) 
     pair=Transpose[{t,posOUT}];
     pairs=Sort[pair];
   (*Show Top 3*)
      If[showtop3, Print["   ADD top 3 genes ",{score,distance,Take[pairs,-3]}]];
  (*Compute new values*)
     {tNEW,posNEW}=pairs[[-1]];	
 (*new values*)  
      ylist0=x0[[posNEW]];
      ylist1=x1[[posNEW]];
      m0NEW=Mean[ylist0];
      m1NEW=Mean[ylist1];
      v0NEW=VarX[ylist0];
      v1NEW=VarX[ylist1];
  (*update*)
    posINx=Append[posIN,posNEW];
    posOUTx=DeleteCases[posOUT,posNEW]; 
    tlistx=Append[tlist,tNEW];
    mlist0x=Append[mlist0,m0NEW];
    mlist1x=Append[mlist1,m1NEW];
    vlist0x=Append[vlist0,v0NEW];
    vlist1x=Append[vlist1,v1NEW];											   
(*need for NestWhile*)
     tADD=tNEW-tlist[[-1]];  (*MODIFY*)    
 (*output*)
   res={posINx,posOUTx,tlistx,mlist0x,mlist1x,vlist0x,vlist1x,tADD};
Return[res]];


(*----------------------Support functions----*)

DistanceSquaredTrain[{x0_,x1_},distance_:1]:=
Module[{xc0,xc1,vm0,vm1,p0,n0,p1,n1,d00sq,d01sq,d10sq,d11sq,r0,r1,t,v0,v1},
 (*centroid vector with length number of genes*)
     xc0=Mean[#]& /@ x0;
     xc1=Mean[#]& /@ x1;
 (*variance vector with length number of genes*)
	  If[Length[x0]>1,  vm0=Var[#]& /@ x0,vm0={.01}];
	  If[Length[x1]>1,  vm1=Var[#]& /@ x1,vm1={.01}];	  
     {p0,n0}=Dimensions[x0];
    {p1,n1}=Dimensions[x1];
    If[distance==1,   vm=(vm0 (n0-1) + vm1 (n1-1) )/(n0+n1-2);   v0=vm;          v1=vm];	   
    If[distance==2,	  				                             v0=vm0;         v1=vm1];								  																	 
(*squared distance measures matrix minus vector subtracts same number each column*)
    d00sq= (x0-xc0)^2/v0;
    d01sq= (x0-xc1)^2/v1;
    d10sq= (x1-xc0)^2/v0;
    d11sq= (x1-xc1)^2/v1; 
Return[{d00sq,d01sq,d10sq,d11sq}]];






(*take adjusted if not long enough*)
	
 TakeMax[x_,len_]:=
 Module[{res,lenx,zero,i},
 lenx=Length[x];
 If[lenx>=len,
    res=Take[x,len],
  (*else*)
  zero=Table[0,{i,1,len-lenx}];
  res=Join[x,zero]];
 Return[res]]


J[a_Integer?Positive,b_Integer?Positive,c_:1]:= Module[{i},Array[Function[i,c],{a,b}]] 
 


End[] 
EndPackage[]

