(*:Mathematica:: Version 7 *)
(*:Context: "swirl`"            *)
(*:Swirl-and-Ripple           *)
(*:Version  1                    *)
(*Title: Training Sample Wrapper*)
(*Input  swirlroc.m	 ComputeROCTestSample
	   		         PlotROC	
         swirlreport.m	 ReportWrapper
         swirlsup.m:        RanSample, 
			         SelectRows,
			         StatAUCList
  			         SelectDistanceMeasure *)  
(*Author: Stuart G. Baker,  2010                                   *)


BeginPackage["swirlw`","swirlsup`","swirlg`","swirlroc`","swirlreport`"]

TrainWAutomatic::usage="TrainWAutomatic[dataTRAIN,{maxgenes,w,sift,splitw,delta},{fset,pi,genename,goal}]"

Clear[TrainWAutomatic]


Begin["Private`"]

(*-----------Select rule over random splits, with highest AUC------------------------------*)

TrainWAutomatic[dataTRAIN_,{maxgenes_,topnum_,sift_,splitw_,maxsplitsw_,deltaw_},
            {fset_,pi_,genename_,showsplitw_,showw_,goal_},showtop3_:False]:=
Module[{mat,mat1,res1,res2,i},
	(*Compute for random splits of training sample*)
	    mat=Table[TrainWAutomaticOne[dataTRAIN,{maxgenes,topnum,sift,splitw,deltaw},
	                    {fset,pi,genename},showsplitw,showtop3],{i,1,maxsplitsw}];
	 (*Sort classification components by AUC which is first in list*)
	    mat1=Sort[mat];
	 (*Select classification components with highest AUC*)
	    res1=mat1[[-1]];
	(*Drop first three components to obtain resB,score,distance,auctab,postab*)
	   res2=Drop[res1,3];
	(*Report if goal =1 *)
	     If[goal==1 && showw,ReportWrapper[mat,fset,sift,pi,maxsplitsw,maxgenes]];
	Return[res2]]


(*---Select rule for a single random split---------------------------------------------*)
	

  TrainWAutomaticOne[dataTRAIN_,{maxgenes_,topnum_,sift_,splitw_,deltaw_},
                               {fset_,pi_,genename_},showsplitw_,showtop3_]:=
	Module[{shownumsplit,trainsubjects,dataA,dataB,resA,resAW,resB, resTRAIN,score,distance,auctab,postab,posin},
	If[showtop3,Print["RANDOM SPLIT IN WRAPPER"]];
          (*Randomly split training sample into training-training (A) and training-test (B) *)
 	      {dataA,dataB,trainsubjects}=RanSample[dataTRAIN,splitw,showsplitw]; 
 	  (*Show splits in wrapper*)
             If[showsplitw,
	         Print[" split into training-training  sample in wrapper"];
		 {trainsubjects0,trainsubjects1}=trainsubjects; 
		 Print["    class 0  ", Sort[trainsubjects0]];
 	         Print["    class 1  ", Sort[trainsubjects1]]]; 
	  (*Compute components of each model*)
		{aucfin,resB,score,distance,auctab,postab}= TrainWSet[{dataA,dataB},{maxgenes,topnum,sift,splitw,deltaw},showtop3];
	  (*Compute ROC information of each model in training-test sample*)
                {aucB,tprB1,tprB2,tsetB}=ComputeROCTestSample[dataB,resB,{score,distance},fset,False];
           (*Gene set in training-test sample*)
		 posin=resB[[1]];
	    res={aucfin,posin,tsetB,resB,score,distance,auctab,postab};
	 Return[res]]



(*------------Select gene set, distance measures and score-----------------------------------------*)  	

TrainWSet[{dataA_,dataB_},{maxgenes_,topnum_,sift_,splitw_,deltaw_},showtop3_]:=
  Module[{dataTRAIN,spec,
          resS1,resS2,resS3, 
          resR1,resR2,resR3,
	   aucS1,aucS2,aucS3,aucSvec,lens,resS,distanceS,
	  aucR1,aucR2,aucR3,aucRvec,lenR,resR,distanceR,
	  auclistS1,auclistS2,auclistS3,
	  auclistR1,auclistR2,auclistR3,
   	  auclistpadS1,auclistpadS2,auclistpadS3,
     	   auclistpadR1,auclistpadR2,auclistpadR3,
           distance,score,auclist,auctabR,
	    res,auctab,postab,aucfin},
	 (*Compute classification components for each distance-score scenario*)
	    dataTRAIN={dataA,dataB};
	    spec={maxgenes,topnum,sift,splitw,deltaw};
	    {resS2,auclistS2,auclistpadS2,posS2,posS2pad}=TrainWOne[dataTRAIN,spec,{"Swirl",1},showtop3];
	    {resS3,auclistS3,auclistpadS3,posS3,posS3pad}=TrainWOne[dataTRAIN,spec,{"Swirl",2},showtop3];
	    {resR2,auclistR2,auclistpadR2,posR2,posR2pad}=TrainWOne[dataTRAIN,spec,{"Ripple",1},showtop3];
	    {resR3,auclistR3,auclistpadR3,posR3,posR3pad}=TrainWOne[dataTRAIN,spec,{"Ripple",2},showtop3];
	 (*Select distance measure for Swirl*)			  
  	    aucS2=auclistS2[[-1]];
	    aucS3=auclistS3[[-1]];
	    aucSvec={aucS2,aucS3};
	    lenS=SelectDistanceMeasure[aucSvec,delta];
	    If[lenS==1,resS=resS2;aucS=aucS2;distanceS=1];
	    If[lenS==2,resS=resS3;aucS=aucS3;distanceS=2];
	 (*Select distance measure for Ripple*)			   
 	    aucR2=auclistR2[[-1]];
	    aucR3=auclistR3[[-1]];
	    aucRvec={aucR2,aucR3};
	    lenR=SelectDistanceMeasure[aucRvec,delta];
	    If[lenR==1,resR=resR2;aucR=aucR2;distanceR=1];
	    If[lenR==2,resR=resR3;aucR=aucR3;distanceR=2];
	(*Choose between Swirl and Ripple*)
	    If[aucR>aucS,
		aucfin=aucR;
	 	res=resR;distance=distanceR; score="Ripple",
	 	(*else*)
		aucfin=aucS;
	       res=resS;distance=distanceS;score="Swirl"];
	 (*output*)
	     auctab={auclistpadS2,auclistpadS3,auclistpadR2,auclistpadR3};
	     postab={posS2pad,posS3pad,posR2pad,posR3pad};
	 Return[{aucfin,res,score,distance,auctab,postab}]]


	
(*-----Select gene set given distance measure and score------------------------------------*)									

TrainWOne[{dataA_,dataB_},spec0_,{score_,distance_},showtop3_]:=
Module[{maxgenes,deltaw,res0,res1,res2,res3,res4,aucadd,posin,posout,tlist,mlist0,mlist1,vlist0,vlist1,
            auclistpad,len,res,auclist,i,zero},
   (*maximum number of selections*)
      maxgenes=spec0[[1]];
      deltaw=spec0[[-1]];
      res0=TrainWStart[{dataA,dataB},spec0,{score,distance},showtop3];
  (*intitialize test for NestWhile*)
     aucadd=1;
     res1=Append[res0,aucadd];
 (*adds genes based on Train-test sample*)
     res2=NestWhile[TrainWAdd[{dataA,dataB},#,{score,distance},showtop3]&, res1,(#[[-1]] > deltaw)&,1, maxgenes-1];
     {posin,posout,tlist,mlist0,mlist1,vlist0,vlist1,aucadd}=res2;
 (*drop aucadd, which was only for NestWhile*)	
     res3=Drop[res2,-1];
 (*compute AUC under normal assumption*)
       auclist =tlist;                                                     (*MODIFY*)
 (*for reporting splits, pad with zeros*)
     len=Length[auclist];
     zero=Table[0,{i,1,maxgenes}];
     auclistpad=Join[auclist,Drop[zero,len]];
      posinpad=Join[posin,Drop[zero,len]];
 (*take one less because last violates rule*)
     res4=TakeMax[#,len-1]& /@ res3;	 
  Return[{res4,auclist,auclistpad,posin,posinpad}]]


 

(*-----------Select first gene given distance measure and score------------------------------*)
 

TrainWStart[{dataA_,dataB_},{maxgenes_,topnum_,sift_,splitw_,deltaw_},{score_,distance_},showtop3_]:=
Module[{tvecA,posvecA,tA,posA,   auvecA,pairsA,   posvecAOUT,
          aucvecA,pairvecA,
         xA0,xA1,   xlistA0, xlistA1,  mA0,mA1,  vA0,vA1, resA},
(*Sample A= training-training; Sample B=training-test*)	
(*Select top genes  components in sample A based on fit in Sample B*) 
    {tvecA,posvecA}=Transpose@SelectTopGenesW[dataA,dataB,topnum,{score,distance}];
     posA=posvecA[[-1]];
     tA=tvecA[[-1]];
     posvecAOUT=DeleteCases[posvecA,posA];
 (*Show top 3 genes and AUC values*)
     aucvecA=CDFNormal01[tvecA]; 
     aucvecA=tvecA;                        (*MODIFY*)
     pairvecA=Transpose[{aucvecA,posvecA}];
     If[showtop3,Print["  START: top 3 genes ",{score,distance,Take[pairvecA,-3]}]];
 (*Centroid components for genes from sample A*)
     {xA0,xA1}=dataA;
     xlistA0=xA0[[posA]];
     xlistA1=xA1[[posA]];
     mA0=Mean[xlistA0]//N;
     mA1=Mean[xlistA1]//N; 
      vA0=VarX[xlistA0];
     vA1=VarX[xlistA1];
     resA={{posA},posvecAOUT,{tA},{mA0}, {mA1},{vA0},{vA1}};
  Return[resA]]


SelectTopGenesW[dataA_,dataB_,topnum_,{score_,distance_}]:=
  Module[{xA0,xA1,xB0,xB1,mlistA0,mlistA1,vlistA0,vlistA1,pA0,nA0,pA1,nA1,
             d00q,d01q,d10q,d11q,d00,d01,d10,d11,r0,r1,t,pair,pairs,res},
    (*compute components of sample A*)
        {xA0,xA1}=dataA;
        {xB0,xB1}=dataA;
        mlistA0=Mean[#]& /@ xA0;
        mlistA1=Mean[#]& /@ xA1;
        vlistA0=Var[#]& /@ xA0;
        vlistA1=Var[#]& /@ xA1;  	
        {pA0,nA0}=Dimensions[xA0];
       {pA1,nA1}=Dimensions[xA1];
    (*compute distance squared in sample B*)  
        {d00q,d01q,d10q,d11q}=DistanceSquaredTrainW[{xB0,xB1}, {mlistA0,mlistA1,vlistA0,vlistA1,nA0,nA1},distance];
   (*compute score based on Sample B*)
  If[score=="Ripple",
       r0=d00q-d01q;
       r1=d10q-d11q]; 	 
   If[score=="Swirl",
      d00=Sqrt[d00q];		  (*distance of x0 to centroid 0*)
      d01=Sqrt[d01q];		  (*distance of x0 to centroid 1*)
      d10=Sqrt[d10q];		  (*distance of x1 to centroid 0*)
      d11=Sqrt[d11q];		  (*distance of x1 to centroid 1*)   
      r0=d00/ReplaceZero[d00+d01]; (*probability classify x0 as 0 = true positive rate*)
      r1=d10/ReplaceZero[d10+d11]]; (*probability classify x1 as 0 = false positive rate*)  
      t=StatAUCList[r0,r1];
  (*select top*)
     pair=Transpose[{t,Range[Length[t]]}];
     pairs=Sort[pair];
     res=Take[pairs,-topnum];
Return[res]]      





  (*------------Select additional genes given distance and score and continue based on AUC-------------*)


TrainWAdd[{{xA0_,xA1_},{xB0_,xB1_}},{posIN_,posOUT_,tlist_,mlistA0_,mlistA1_,vlistA0_,vlistA1_,aucadd_},
     {score_,distance_},showtop3_]:=
 Module[{xlistB0,xlistB1,
         d00z,d01z,d10z,d11z,
	 d00s,d01s,d10s,d11s,	 
   	 xsetB0,xsetB1,
	 msetA0,msetA1,vsetA0,vsetA1,vsetAP,
   	 d00y,d01y,d10y,d11y,
	 k,r0,r1,t,pair,pairs,
	 tNEW,postNEW,	
	 m0NEW,m1NEW,v0NEW,v1NEW,aucaddNEW,	
	 posINx,posOUTx,
	 tlistx,mlist0x,mlist1x,vlist0x,vlist1x},  
  (*gene expression of genes already selected*)
      xlistB0=SelectRows[xB0,posIN];
      xlistB1=SelectRows[xB1,posIN];
(*distance squared of genes already selected using centroid variance of training training sample*)
     {pA0,nA0}=Dimensions[xA0];
     {pA1,nA1}=Dimensions[xA1];
     {d00z,d01z,d10z,d11z}=DistanceSquaredTrainW[{xlistB0,xlistB1},{mlistA0,mlistA1,vlistA0,vlistA1,nA0,nA1},distance];
 (*sum of distance squared of genes already selected*)
     d00s=Apply[Plus,d00z];	    (*sum of squared distance of xB0 to centroid 0*)
     d01s=Apply[Plus,d01z];	    (*sum of squared distance of xB0 to centroid 1*)
     d10s=Apply[Plus,d10z];	    (*sum of squared distance of xB1 to centroid 0*)
      d11s=Apply[Plus,d11z];	    (*sum of squared distance of xB1 to centroid 1*)
  (*gene expression of candidate genes in set posOUT*)
      xsetB0=SelectRows[xB0,posOUT];
      xsetB1=SelectRows[xB1,posOUT];
      xsetA0=SelectRows[xA0,posOUT];
      xsetA1=SelectRows[xA1,posOUT];
      msetA0=Mean[#]& /@ xsetA0;
      msetA1=Mean[#]& /@ xsetA1;
      vsetA0=Var[#]& /@ xsetA0;
      vsetA1=Var[#]& /@ xsetA1;  	
   (*matrices of distance squared rows is genes columns is specimens*)
      {d00y,d01y,d10y,d11y}=DistanceSquaredTrainW[{xsetB0,xsetB1},{msetA0,msetA1,vsetA0,vsetA1,nA0,nA1},distance];
   (*distance squared of candidate genes + sum of distance squared of already selected genes=total distance squared*)
      k=Length[d00y];  (*number of genes*)
      d00q=d00y + J[k,1] . {d00s};
      d01q=d01y + J[k,1] . {d01s};
      d10q=d10y + J[k,1] . {d10s};
      d11q=d11y + J[k,1] . {d11s}; 
  (*SCORE*)
   If[score=="Swirl",
     d00=Sqrt[d00q];		  (*distance of xB0 to centroid 0*)
     d01=Sqrt[d01q];		  (*distance of xB0 to centroid 1*)
     d10=Sqrt[d10q];		  (*distance of xB1 to centroid 0*)
     d11=Sqrt[d11q];		  (*distance of xB1 to centroid 1*)
     r0=d00/ReplaceZero[d00+d01]; (*probability classify xB0 as 0 = true positive rate*)
     r1=d10/ReplaceZero[d10+d11]]; (*probability classify xB1 as 0 = false positive rate*)  
   If[score=="Ripple",
     r0=d00q-d01q;
     r1=d10q-d11q]; 	 
     t=StatAUCList[r0,r1];	  
 (*find largest sum of distance squared and corresponding gene*) 
     pair=Transpose[{t,posOUT}];
     pairs=Sort[pair];
    {tNEW,posNEW}=pairs[[-1]];	
  (*show Top 3*)
    If[showtop3, Print["   ADD top 3 genes ",{score,distance,Take[pairs,-3]}]];
 (*new values*)  
    ylist0=xB0[[posNEW]];
    ylist1=xB1[[posNEW]];
    m0NEW=Mean[ylist0];
    m1NEW=Mean[ylist1];
    v0NEW=VarX[ylist0];
    v1NEW=VarX[ylist1]; 
 (*update*)
    posINx=Append[posIN,posNEW];
    posOUTx=DeleteCases[posOUT,posNEW]; 
    tlistx=Append[tlist,tNEW];
    mlist0x=Append[mlistA0,m0NEW];
    mlist1x=Append[mlistA1,m1NEW];
    vlist0x=Append[vlistA0,v0NEW];
    vlist1x=Append[vlistA1,v1NEW];											   
 (*need aucaddNEW for NestWhile*)
    aucaddNEW=CDFNormal01[tNEW]-CDFNormal01[tlist[[-1]]];
    aucaddNEW=tNEW-tlist[[-1]];                                           (*MODIFY*)
    
 (*output*)
    res={posINx,posOUTx,tlistx,mlist0x,mlist1x,vlist0x,vlist1x,aucaddNEW};
Return[res]];


(*-----Support Functions---------------------*)



DistanceSquaredTrainW[{xB0_,xB1_},{mA0_,mA1_,vA0_,vA1_,nA0_,nA1_},distance_]:=
Module[{vAP,d00sq,d01sq,d10sq,d11sq},
     vAP= (vA0 (nA0-1) + vA1 (nA1-1) )/(nA0+nA1-2);
     If[distance==1, varA0=vAP;  varA1=vAP];	   
    If[distance==2, varA0=vA0;  varA1=vA1];								  																	 
(*squared distance in measures matrix minus vector subtracts same number each column*)
(*distance in set B*)
    d00sq= (xB0-mA0)^2/varA0;
    d01sq= (xB0-mA1)^2/varA1;
    d10sq= (xB1-mA0)^2/varA0;
    d11sq= (xB1-mA1)^2/varA1; 
Return[{d00sq,d01sq,d10sq,d11sq}]];


TakeMax[x_,len_]:=
 Module[{res,lenx,zero,i},
 lenx=Length[x];
 If[lenx>=len,
    res=Take[x,len],
  (*else*)
  zero=Table[0,{i,1,len-lenx}];
  res=Join[x,zero]];
 Return[res]]
 

End[] 
EndPackage[]

