(*: Mathematica:: Version 7  *)
(*: Context: "swirl`"        *)
(*: Swirl and Ripple         *)
(*: Version 1                *)
(* Key Function  Classify[datalist,options] *)	 
(*Reproducibility:      ClassifyMS1[]  classify under goal 1 and export swirlfig4 (jpg, eps, pdf)
                                ClassifyMS2[] classify under goal 2 
                                ClassifySim1[] classify simulated data under goal 1 and export swirlfig3 (jpg, eps, pdf)
                                ClassifySim2[] classify simulated data under goal 2
                                ClassifyFig12[] generate figures 1 and 2 (swirlfig1, swirlfig2)*)
(*: Input:          swirlcore.m:             key program	
			swirlg.m	               greedy algorithm
			swirlw.m	                wrapper algorithm
			swirlsup.m	               support functions 
			swirlroc.m               computation of ROC curve in test sample
			swirlplot.m	              plotting boundary curves for two genes using data)
			swirlreport.m            reporting functions									 
			swirlplotsym.m          plot hypothetical boundary curves
	  	        swirlsim.m                plot simulated data for two genes                            *) 	                        
 (*:Data:           swirldata.m               create data sets: detailed references: calls following: 
			swirldata1.m             colon cancer data                     Alon et al  (1999)
			swirldataname1.m     colon cancer gene names          Alon et al  (1999)
			swirldata20.m           leukemia class 0 data                 Golub et al (1999)
			swirldata21.m           leukemia class 1 data                 Golub et al (1999)
   	                 swirldataname2.m    leukemia gene names                Golub et al (1999)
			swirldata3.m             medulloblastoma data from          Pomeroy et al (2002)
		        swirldataname3.m     medulloblastoma gene names       Pomeroy et al (2002)
		        swirldata4.m             prostate cancer data                   Singh et al (2002)
			swirldataname4.m     prostate cancer gene names        Singh et al (2002)
			swirldata50.m             pediatric leukemia class 0 data  Yeoh et al (2002)
			swirldata51.m             pediatric leukemia class 1 data  Yeoh et al (2002)
			swirldataname5.m     pediatric leukemia gene names    Yeoh et al (2002)  *)  
(*: Article: Simple and flexible classification of gene expression      microarrays via Swirls and Ripples   *)
(*: Author: 2010 Stuart G. Baker                                          *)		 

 
 (*This code is provided "as is", without warranty of any kind, express or implied, 
 including but not limited to the warranties of merchantability, fitness for a  particular purpose and noninfringement.
 In no event shall the NCI or the individual developers be liable for any claim, 
    damages or other liability of any kind. Use of this code by recipient is at recipient's own risk. 
    NCI makes no representations that the use of the code will not infringe any patent or proprietary
    rights of third parties.*)

 
Share[]

  
BeginPackage["swirl`","swirlcore`","swirlg`", "swirlw`", "swirlsup`","swirlroc`", 
                       "swirlplot`","swirldata`","swirlplotsym`","swirlsim`"]



(* Usage text for Classify.  The third element of the argument list is a list
   {dataname,name0,name1}: the code destructures it that way when reporting and
   takes its first element as the data set name. *)
Classify::usage="Classify[{data,genename,datanameset},options] where
    data=  {x0,x1} where x0 is matrix for class 0 (rows=genes and columns=specimens)
	                          x1 is matrix for class 1 (rows=genes and columns=specimens)
	                           minimum of 30 specimens
	           genename = list of names of all genes in same order as data rows
	           datanameset= {dataname,name0,name1} = name of dataset and names of the two classes"

							
(* Default option values for Classify, grouped by purpose. *)
Options[Classify] = {
   (* goal and random splits *)
   Goal -> 1, MaxSplits -> 100, FracSplit -> 0.7,
   (* rule selection *)
   MaxGenes -> Automatic, TopNum -> 50, DeltaAUC -> 0.02, PhiAUC -> 0.5,
   (* wrapper *)
   FracSplitW -> 0.5, MaxSplitsW -> 5,
   (* plotting symbols and ROC curve settings *)
   ParSym -> {x, y, k}, FPRSet -> N[Range[9]/10], MaxBoot -> 20,
   (* reporting switches *)
   ShowGeneKey -> True, ShowProgress1 -> False, ShowProgress2 -> False,
   ShowPlot2D -> False, ShowFinalModel -> True, ShowWrapper -> False,
   ShowSplitW -> False,
   (* prevalence and development mode *)
   Prev -> Automatic, Quick -> False}




 (*------------------------OPTION LIST------------------------------------*)
 (* Usage messages for every option of Classify.  Mentioning the option symbols
    here, before Begin, also creates them in the package context. *)
 (*Goal*)
 Goal::usage="1=DiscoveryAndTest  sets MaxSplits->1, ShowProgress1->True,
                      2=DiscoveryOnly       sets MaxSplitsW->1"

  (*Random Splits*)
  MaxSplits::usage="number of random splits into training and test samples: for Goal 2 need more than 1"
  FracSplit::usage="fraction split into training sample"							 

(*Rule Selection*)
  MaxGenes::usage="maximum number of genes to include in classification rule (default is Automatic =TopNum)"
  TopNum::usage="number of genes in preliminary filter for training sample"
  DeltaAUC::usage="minimum change in AUC to select more complex rule in training sample"
  PhiAUC::usage="multiply DeltaAUC for Wrapper between 0 and 1"

 (*For Wrapper*)
  FracSplitW::usage="fraction split into training-training sample with use of wrapper; restricted to [.3 .7]"
  MaxSplitsW::usage="number of random splits into training and test samples"
  (* the "=" was missing here, so the message was never assigned *)
  ShowSplitW::usage="Show split in Wrapper"

(*Plotting Boundary Shapes for Two Genes*)
  ShowPlot2D::usage="Show boundary plot if the rule has two genes"
  ParSym::usage= "variables for plotting boundaries: x and y at level k" 

(*ROC Curve Calculation*)
   FPRSet::usage="set of values of FPR for ROC curve in test sample at which TPR is computed"
   MaxBoot::usage="number of bootstraps to compute ROC curves in test sample under goal 1"

(*For Relative Utility Calculation*)
   Prev::usage="prevalence of outcome; default is Automatic= prevalence in data" 

(*Reporting*)
  ShowGeneKey::usage="Print names corresponding to genes"
  ShowProgress1::usage="Print  AUC and genes selected on each split"
  ShowProgress2::usage="Print top 3 genes selected at each step"
  ShowFinalModel::usage="Compute classification rule for final model"
  ShowWrapper::usage="Show details about wrapper"


 (*Special Options--generally do not change*)
  Quick::usage="True or False (set to TRUE for software development only)"


Clear[Classify]


Begin["Private`"]




(* Classify: top-level driver.
   Arguments:
     {data00,genename0,datanameset}
        data00      is destructured as {class-0 matrix, class-1 matrix}; Dimensions of
                    each matrix is read as {number of genes, number of specimens}.
        genename0   is indexed in parallel with the matrix rows.
        datanameset is passed to ReportInput; its first element is passed to
                    ClassifyCore as the data set name.
     options        rules that override Options[Classify].
   Returns the result of ClassifyCore, or $Failed when the inputs are rejected.
   All working variables are local to the Module so that nothing leaks into, or is
   picked up stale from, the enclosing context between calls. *)
Classify[{data00_,genename0_,datanameset_},options___Rule]:=
 Module[{goal,quick,showsplitw,parsym,maxgenes0,maxgenes,maxsplits0,maxsplits,split,splitw,
         maxsplitsw0,maxsplitsw,topnum,delta,phi,
         showgenekey,showprog10,showprog1,showprog2,showplot2D,showfinalmodel,showw,
         fset,maxboot,pi0,pi,spec1,spec2,
         mat0,mat1,ng0,ng1,num0,num1,checkQ,res,
         tvec,posvec,a0,a1,data,genename,dname},
   (*invoke options: explicit options take precedence over the defaults*)
   {goal,quick,showsplitw,parsym,maxgenes0,maxsplits0,split,splitw,maxsplitsw0,topnum,delta,phi,
    showgenekey,showprog10,showprog2,showplot2D,showfinalmodel,showw,fset,maxboot,pi0}=
   {Goal,Quick,ShowSplitW,ParSym,MaxGenes,MaxSplits,FracSplit,
    FracSplitW,MaxSplitsW,TopNum,DeltaAUC,PhiAUC,
    ShowGeneKey,ShowProgress1,ShowProgress2,ShowPlot2D,
    ShowFinalModel,ShowWrapper,FPRSet,MaxBoot,Prev}/.{options}/.Options[Classify];
   (*GOAL sets splits and show*)
   If[goal==1,
      showprog1=True;
      maxsplitsw=maxsplitsw0;
      maxsplits=1];
   If[goal==2,
      showprog1=showprog10;
      maxsplitsw=1;
      maxsplits=maxsplits0];
   (*QUICK MODE to REDUCE number of genes based on all data-- only for development*)
   (*TrueQ: a non-Boolean Quick value is treated as False instead of leaving If unevaluated*)
   If[TrueQ[quick],
      maxsplitsw=1;
      Print["Select top 100 genes before analysis"];
      {tvec,posvec}=Transpose@SelectTopGenesQuick[data00,100];
      {a0,a1}=data00;
      data={a0[[posvec]],a1[[posvec]]};
      genename=genename0[[posvec]],
    (*else*)
      data=data00;
      genename=genename0];
   (*prevalence for RU curve*)
   If[pi0===Automatic,
      {mat0,mat1}=data;
      {ng0,num0}=Dimensions[mat0];
      {ng1,num1}=Dimensions[mat1];
      pi=num1/(num0+num1)//N,
    (*else*)
      pi=pi0];
   (*maxgenes defaults to topnum and never exceeds it*)
   If[maxgenes0===Automatic,maxgenes=topnum,maxgenes=Min[maxgenes0,topnum]];
   (*report inputs*)
   spec1={parsym,maxgenes,maxsplits,split,splitw,maxsplitsw,topnum,delta,phi,pi};
   ReportInput[goal,data,datanameset,spec1,showprog1,showprog2,showsplitw,maxboot];
   (*check inputs: Goal must be 1 or 2, and CheckInput must return exactly True*)
   If[!MemberQ[{1,2},goal],Print["invalid Goal"]];
   checkQ=MemberQ[{1,2},goal] &&
          TrueQ[CheckInput[data,spec1,showprog1,showprog2,showsplitw,maxboot,goal]];
   (*core of program*)
   If[checkQ,
      spec2={fset,showgenekey,showprog1,showprog2,showplot2D,showfinalmodel,showw,goal};
      dname=datanameset[[1]];
      res=ClassifyCore[data,spec1,spec2,genename,dname,showsplitw,maxboot];
      If[goal==1,  Print["----ROC and RU CURVES IN TEST SAMPLE---"]; Print@GraphicsRow[res]],
    (*else*)
      Print["Invalid Inputs"];
      res=$Failed];
   Return[res]]



(*-------------------------CHECK INPUT---------------------------------------------------*)

						
(* CheckInput: validate the data and the numeric settings before the core analysis.
   Arguments:
     data      {class-0 matrix, class-1 matrix}; Dimensions of each is read as
               {number of genes, number of specimens}.
     {parsym,maxgenes,maxsplits,split,splitw,maxsplitsw,topnum,delta,phi,pi}
               the settings list built by Classify; parsym, maxgenes and maxsplitsw
               are not checked here.
     showprog1, showprog2   accepted but not checked.
     showsplitw, maxboot, goal   checked as described below.
   Prints a message for every failed check and returns True only when all pass,
   otherwise False. *)
CheckInput[data_,{parsym_,maxgenes_,maxsplits_,split_,splitw_,maxsplitsw_,topnum_,delta_,phi_,pi_},
    showprog1_,showprog2_,showsplitw_,maxboot_,goal_]:=
Module[{splitQ,splitwQ,dataQ,deltaQ,phiQ,piQ,dsQ,showsplitwQ,maxbootQ,resQ,
   mat0,mat1,ng0,ng1,num0,num1,
   numTRAIN0,numTRAINtrain0,numTRAINtest0,numTEST0,
   numTRAIN1,numTRAINtrain1,numTRAINtest1,numTEST1,
   numTRAINtrain0Q,numTRAINtest0Q,numTEST0Q,
   numTRAINtrain1Q,numTRAINtest1Q,numTEST1Q},
 (*GENE NUMBER CHECK TO MATCH IN CLASSES*)
   {mat0,mat1}=data;
   {ng0,num0}=Dimensions[mat0]; (*num0 is number class 0*)
   {ng1,num1}=Dimensions[mat1]; (*num1 is number class 1*)
   (*start from True and only ever lower the flag, so one check cannot undo another*)
   dataQ=True;
   If[ng0 != ng1, Print["different numbers of feastures in groups 0 and 1 "]; dataQ=False];
   If[ng0 < topnum, Print["selecting more genes than available "]; dataQ=False];
 (*SPLIT CHECK*)
   If[split >0 && split < 1,      splitQ=True,   splitQ=False;       Print["invalid split"]];
   If[splitw >=.3 && splitw <=.7, splitwQ=True,  splitwQ=False;      Print["frac splitW not in [.3,.7]"]];
 (*MINIMUM NUMBER OF SPECIMENS FOR WRAPPER TO HAVE MORE THAN ONE SPECIMENS IN CLASS*)
 (*class 0*)
   numTRAIN0=Round[split num0];
   numTEST0=num0-numTRAIN0;
   numTRAINtrain0=Round[splitw numTRAIN0];
   numTRAINtest0=numTRAIN0-numTRAINtrain0;
 (*class 1*)
   numTRAIN1=Round[split num1];
   numTEST1=num1-numTRAIN1;
   numTRAINtrain1=Round[splitw numTRAIN1];
   numTRAINtest1=numTRAIN1-numTRAINtrain1;
 (*check: every flag is set explicitly in both branches*)
   If[numTRAINtrain0>1, numTRAINtrain0Q=True, numTRAINtrain0Q=False; Print["TRAIN-train class 0 needs more than 1"]];
   If[numTRAINtest0>1,  numTRAINtest0Q=True,  numTRAINtest0Q=False;  Print["TRAIN-test class 0 needs more than 1"]];
   If[numTEST0>1,       numTEST0Q=True,       numTEST0Q=False;       Print["TEST class 0 needs more than 1"]];
   If[numTRAINtrain1>1, numTRAINtrain1Q=True, numTRAINtrain1Q=False; Print["TRAIN-train class 1 needs more than 1"]];
   If[numTRAINtest1>1,  numTRAINtest1Q=True,  numTRAINtest1Q=False;  Print["TRAIN-test class 1 needs more than 1"]];
   If[numTEST1>1,       numTEST1Q=True,       numTEST1Q=False;       Print["TEST class 1 needs more than 1"]];
   If[goal==2 && maxsplits==1, dsQ=False; Print["need more than one split with Goal 2"], dsQ=True];
   If[delta > 0  && delta <.5, deltaQ=True,  deltaQ=False;  Print["invalid DeltaAUC"]];
   If[phi >0 && phi < 1,        phiQ=True,   phiQ=False;    Print["invalid PhiAUC]"]];
   (*the upper bound is tested on pi itself; it was previously tested on phi*)
   If[pi >=0 && pi < 1,         piQ=True,    piQ=False;     Print["invalid pi]"]];
   (*=== so that a non-Boolean value yields False instead of an unevaluated If*)
   If[showsplitw===True || showsplitw===False,showsplitwQ=True,showsplitwQ=False; Print["invalid showsplitw"]];
   If[(IntegerQ[maxboot] && maxboot>=0),maxbootQ=True,maxbootQ=False; Print["invalid maxboot"]];
 (*combine all checks; TrueQ guarantees a Boolean even for non-numeric settings*)
   resQ=TrueQ[splitQ && splitwQ && dataQ && deltaQ && phiQ &&
      numTRAINtrain0Q && numTRAINtest0Q && numTEST0Q &&
      numTRAINtrain1Q && numTRAINtest1Q && numTEST1Q && piQ && showsplitwQ && maxbootQ && dsQ];
Return[resQ]]

	




(*----------------------------REPORT INPUT -------------------------------*)


(* ReportInput: print a summary of the data set and the analysis settings.
   Arguments:
     goal         1 or 2; any other value is printed as is.
     data         {class-0 matrix, class-1 matrix}; Dimensions of each is read as
                  {number of genes, number of specimens}.
     datanameset  {dataname,name0,name1}.
     {parsym,...,pi}  the settings list built by Classify.
     showprog1, showprog2, showsplitw   accepted but not printed.
     maxboot      printed only when goal is 1.
   The per-class sample sizes are derived with the same rounding as CheckInput.
   Returns Null. *)
ReportInput[goal_,data_,datanameset_,{parsym_,maxgenes_,maxsplits_,split_,splitw_,maxsplitsw_,topnum_,delta_,phi_,pi_},
    showprog1_,showprog2_,showsplitw_,maxboot_]:=
  Module[{mat0,mat1,ng0,ng1,num0,num1,gname,dataname,name0,name1,
     numTRAIN0,numTRAINtrain0,numTRAINtest0,numTEST0,
     numTRAIN1,numTRAINtrain1,numTRAINtest1,numTEST1},
   (*names*)
     {dataname,name0,name1}=datanameset;
   (*numbers in sample*)
     {mat0,mat1}=data;
     {ng0,num0}=Dimensions[mat0];
     {ng1,num1}=Dimensions[mat1];
   (*goal names; an unexpected goal is shown as given rather than as an unset symbol*)
     gname=Switch[goal,
        1," 1 = rule discovery and testing",
        2," 2 =gene discovery",
        _,ToString[goal]];
   (*class 0*)
     numTRAIN0=Round[split num0];
     numTEST0=num0-numTRAIN0;
     numTRAINtrain0=Round[splitw numTRAIN0];
     numTRAINtest0=numTRAIN0-numTRAINtrain0;
   (*class 1*)
     numTRAIN1=Round[split num1];
     numTEST1=num1-numTRAIN1;
     numTRAINtrain1=Round[splitw numTRAIN1];
     numTRAINtest1=numTRAIN1-numTRAINtrain1;
   (*report *)
     Print["DATA SET: ",dataname];
     Print["GOAL: ",gname];
     Print["GENES number: ",ng0];
     Print["CLASS: names: ",{name0,name1}];
     Print["CLASS: number per classs: ",{num0,num1}];
     Print["SPLITS: fraction in training sample: ",split];
     Print["SPLITS  number ",maxsplits];
     Print["SPLITS class 0 TR:TR, TR:TE, TEST ", {numTRAINtrain0,numTRAINtest0,numTEST0}];
     Print["SPLITS class1  TR:TR, TR:TE, TEST ", {numTRAINtrain1,numTRAINtest1,numTEST1}];
     Print["SELECTION:  number in preliminary filter  ",topnum];
     Print["SELECTION:  threshhold for change in AUC ",delta];
     Print["WRAPPER:  fraction in TR:TR split: ",splitw];
     Print["WRAPPER:  mulitplier of threshold change in AUC for selection: ",phi];
     Print["WRAPPER:  number of splits  ",maxsplitsw];
   (*the bootstrap count is reported under goal 1 only*)
     If[goal==1,
        Print["ROC curve: number of bootstraps ",maxboot]];
     Print["RU curve:  prevalence ",Round[pi 100]/100//N];
 Return[Null]]


(*--Support functions-------------------------------*)

(* SelectTopGenesQuick: rank the rows by the statistic from StatAUCListQuick and
   keep the topnum largest.
   Arguments: {x0,x1} the two class matrices; topnum the number of rows to keep.
   Returns a list of {statistic, row position} pairs in ascending order of the
   statistic.  When fewer than topnum rows exist, all rows are returned instead
   of failing in Take. *)
SelectTopGenesQuick[{x0_,x1_},topnum_]:=
Module[{t,pairs,keep},
  t=StatAUCListQuick[x0,x1];
  (*pair each statistic with its row position so the position survives sorting*)
  pairs=Sort[Transpose[{t,Range[Length[t]]}]];
  keep=Min[topnum,Length[pairs]];
  Return[Take[pairs,-keep]]];


 (* Apply StatAUCQuick to corresponding rows of y0 and y1; returns one value per row. *)
 StatAUCListQuick[y0_,y1_]:=MapThread[StatAUCQuick,{y0,y1}]


(* StatAUCQuick: absolute difference of the two means divided by
   Sqrt[v0+v1], where v0 and v1 are the variances of y0 and y1.
   Arguments: y0, y1 lists of values for the two classes.
   Returns a numeric value.
   When either list has fewer than two values the variances are set to .01 and
   the statistic is still computed; previously nothing was assigned in that
   case and an unrelated global value could be returned.
   NOTE(review): Var and ReplaceZero are not defined in this file; presumably
   they come from the support package and ReplaceZero substitutes .01 for a
   zero variance - confirm. *)
StatAUCQuick[y0_,y1_]:=
 Module[{m0,m1,v0,v1,n0,n1,t},
  m0=Mean[y0];
  m1=Mean[y1];
  n0=Length[y0];
  n1=Length[y1];
  If[n0 >1 && n1>1,
   v0=ReplaceZero[Var[y0],.01];
   v1=ReplaceZero[Var[y1],.01],
  (*else: too few values for a variance*)
   v0=.01;
   v1=.01];
  t=Abs[(m0-m1)/Sqrt[v0+v1]]//N;
 Return[t]]




End[] 
EndPackage[]


(*------------APPLICATION IN MANUSCRIPT---------------------------*)



(* Remove earlier definitions of the manuscript drivers before redefining them. *)
Clear[ClassifyMS1,ClassifyMS2,ClassifyFig12,ClassifySim1,ClassifySim2]

		
																		 
(* ClassifyMS1: run Classify with Goal->1 on datalist1..datalist5, arrange the
   five results in a grid, print it and export it as swirlfig4 in jpg, eps and
   pdf form.  Returns Null.
   The grid is held in a local variable so that no global is left behind. *)
ClassifyMS1[]:=
Module[{plotset,plotall},
   plotset=Classify[#,Goal->1]& /@  {datalist1,datalist2,datalist3,datalist4,datalist5};
   plotall=GraphicsGrid[plotset];
   Print@plotall;
   Export["swirlfig4.jpg", plotall];
   Export["swirlfig4.eps", plotall];
   Export["swirlfig4.pdf", plotall];
  Return[Null]]
 
 
 (* ClassifyMS2: run Classify with Goal->2 on datalist1..datalist5, using
    maxsplits (default 100) for MaxSplits.  The results are discarded; returns Null. *)
 ClassifyMS2[maxsplits_:100]:=
  Module[{datasets,results},
    datasets={datalist1,datalist2,datalist3,datalist4,datalist5};
    results=Table[
       Classify[ds,Goal->2,MaxSplitsW->5,MaxSplits->maxsplits],
       {ds,datasets}];
    Null]
		
(* ClassifyFig12: build the PlotSym2D plot and the PlotSim2D plot, print both,
   and export each as jpeg, bmp and svg (swirlfig1.* and swirlfig2.*).
   Returns Null. *)
ClassifyFig12[]:=
 Module[{symplot,simplot,x,y,k,a,b},
   symplot=PlotSym2D[{x,y,k,a,b}];
   Print[symplot];
   Print["simulation with 100 per class"];
   simplot=PlotSim2D[{x,y,k},100,1];
   Print[simplot];
   (*exports run in the listed order: both figures per format*)
   Scan[Export[First[#],Last[#]]&,
     {{"swirlfig1.jpeg",symplot},{"swirlfig2.jpeg",simplot},
      {"swirlfig1.bmp",symplot},{"swirlfig2.bmp",simplot},
      {"swirlfig1.svg",symplot},{"swirlfig2.svg",simplot}}];
   Null]

 (* ClassifySim1: generate simulated data with GenSimData for the two sizes 50
    and 100, run Classify with Goal->1 on each, stack the two results in a grid,
    print it and export it as swirlfig3 in jpg, eps and pdf form.  Returns Null. *)
 ClassifySim1[]:=
  Module[{sim50,sim100,res50,res100,grid},
    (*sample size 50*)
    Print["Sample Size 50"];
    sim50=GenSimData["datasim",1,50];
    res50=Classify[sim50,Goal->1];
    (*sample size 100*)
    Print[" "];
    Print["Sample Size 100"];
    sim100=GenSimData["datasim",1,100];
    res100=Classify[sim100,Goal->1];
    (*combine*)
    grid=GraphicsGrid[{res50,res100}];
    Print[grid];
    Scan[Export[#,grid]&,{"swirlfig3.jpg","swirlfig3.eps","swirlfig3.pdf"}];
    Null]
 
 (* ClassifySim2: generate simulated data with GenSimData for the two sizes 50
    and 100 and run Classify with Goal->2 and MaxSplits->100 on each.
    Returns Null. *)
 ClassifySim2[]:=
  Module[{sim50,sim100},
    Print["Sample Size 50"];
    sim50=GenSimData["datasim",1,50];
    Classify[sim50,MaxSplits->100,Goal->2];
    Print[" "];
    Print["Sample Size 100"];
    sim100=GenSimData["datasim",1,100];
    Classify[sim100,MaxSplits->100,Goal->2];
    Null]
 
 

	 
