NA Tree RCodeUncommented

From CVRG Wiki

Jump to: navigation, search
 make.fired.5050<-function(d)
 {
  
    d.fired<-d[d$fired==1,]
    d.not.fired<-d[!d$fired,]
    k.fired<-dim(d.fired)[1]
    k.not.fired<-dim(d.not.fired)[1]
    rat<-floor(k.not.fired/k.fired)
  
    if (rat>0)
    {
       d.temp<-d.fired
       if (rat>1)
       {
          for (i in seq(1,rat-1))
          {
             d.temp<-rbind(d.temp,d.fired)
          }
       }
    }
    else
    {
       print("error with replication: no firings for this dataset\n")
    }
    rbind(d.temp,d.not.fired)
 }
  
  
 #######################################################################################
 #
 # Function finds best split in the node given data for all the variables in list.vars
 # returns a list that includes the name of the feature best for a split, a measure for the goodness of the split, and the best splitting value for that feature
 # d.node is the subset of data this node contains.
 # list.vars is the list of column numbers of the features
 # node.number is the node label of the node that's being considered for a split
  
  
 best.split<-function(d.node,list.vars,node.number)
 {
 d<-d.node
 y<-d$fired
 node.n<-dim(d)[1]
  
 goodness.list<-list()
 var.key.list<-list()
 var.value.list<-list()
 for (i in list.vars) {
     var.index.vec<-c()
     goodness.vec<-c()
     var.value.vec<-c()
    
     if (is.numeric(d[,i])) {
        
         is.discrete<-F
         samp.qt<-c()
         node.not.na<-sum(!is.na(d[,i]))
       
         if (node.not.na >= 10){
             list.qt<-seq(0.2,0.8,by=0.3)
             samp.qt<- quantile(d[,i],probs=list.qt,na.rm=T)
        
         } else {
             samp.qt<-mean(d[,i],na.rm=T)
         }
         var.value.vec<-samp.qt
        
         for (j in samp.qt) {
             goodness.val<-goodness.of.split(y,d[,i],j,is.discrete)
             goodness.vec<-c(goodness.vec, goodness.val)
             var.index.vec<-c(var.index.vec,i)
         }
     }  else {
    
         disc.var<-as.factor(d[,i])
         is.discrete<-T
         disc.levels<-levels(disc.var)
         var.value.vec<-disc.levels
        
      
         for (thres in disc.levels) {
             goodness.val<-goodness.of.split(y, disc.var, thres, is.discrete)
             goodness.vec<-c(goodness.vec, goodness.val)
             var.index.vec<-c(var.index.vec,i)  
         }
     }
  
     goodness.list<-list(goodness.list,goodness.vec)
     var.key.list<-list(var.key.list,var.index.vec)
     var.value.list<-list(var.value.list, var.value.vec)
    
 }
 goodness.unlist  <- unlist(goodness.list)
 var.key.unlist   <- unlist(var.key.list)
 var.value.unlist <- unlist(var.value.list)
  
 argmax.goodness<-which.max(goodness.unlist)
 var.best.split<-var.key.unlist[argmax.goodness]
  
 if (is.numeric(d[, var.best.split])) {
     var.thres<-as.numeric(var.value.unlist[argmax.goodness])
 }  else {
     var.thres<-var.value.unlist[argmax.goodness]
 }
  
 best.goodness.in.list <- goodness.unlist[argmax.goodness]
  
 if (best.goodness.in.list>1e-06) {
     return(list(best.goodness = best.goodness.in.list, var.split.index = var.best.split ,
             var.split.value = var.thres ))
 } else {
 return(NULL)
 }
    
 }
  
  
 #######################################################################################
 #
 # Initializes tree frame data structure
 #
 tree.init <- function(d)
 {
 node.depth<-c(0)
 node.real.depth<-c(0)
 node.split.var.name<-c("<leaf>")
 node.split.var.value<-c(NA)
 labels<-c(0)
 node.dec.rule<-NA
 node.parent.node<-NA
  
 num.fired<-sum(d$fired==1)
 num.nonfired<-sum(d$fired==0)
  
 class.call <-c(num.nonfired  <= num.fired  )
  
 tree.frame<-as.data.frame(list(node.label=labels, depth=node.depth, real.depth=node.real.depth,
             split.var.name=node.split.var.name, split.var.value =node.split.var.value,
             dec.rule=node.dec.rule, parent.node = node.parent.node, call=class.call, count.fired=num.fired,
             count.nonfired=num.nonfired),
             stringsAsFactors=F)
 # Return the tree frame.
 return(tree.frame)
 }
  
  
 ##############################################################################################
 #
 # Grows a tree starting from the root node using variables with indices in $var.list$
 # returns a list containing the data frame that stores the description of the tree ,
 # and a vector that  stores in which node  each sample in d ends up.
 # $d$ is the dataset
 # $tree.frame$  is the initalized data frame  that stores the description of the tree
 # $var.list$ is the list of column numbers of the features
 # $roc.thres.ratio$ is the minimum ratio of fireds to nonfireds at which a node is classified
 # as fired
  
 tree.grow<- function(d, tree.frame,tree.size, var.list,roc.thres.ratio)
 {
  
 n<-dim(d)[1]
 where.samp<-rep(0,n)
  
 stop.grow<-F
 no.further.splits<-T
  
 while(stop.grow==F) {
     leaf.labels<-tree.frame$node.label[tree.frame$split.var.name=="<leaf>"]
    
     no.further.splits<-T
     split.using.variable <- 0
    
     for (i in leaf.labels) {
    
         if ( sum(where.samp==i)==0)  next
            
         d.node<-d[where.samp==i,]
         split.prop<-best.split(d.node, var.list, i)
         if(!is.null(split.prop)){
             no.further.splits<-F
             split.using.variable <- split.prop$var.split.index
             if (!is.numeric(d[,split.using.variable])){
                 split.tree<-split.discrete(d,tree.frame, i, split.prop, where.samp,roc.thres.ratio)
                 tree.frame<-split.tree$frame
                 where.samp<-split.tree$where
             } else {
                 split.tree<-split.cont(d,tree.frame, i, split.prop, where.samp,roc.thres.ratio)
                 tree.frame<-split.tree$frame
                 where.samp<-split.tree$where
             }   
         }
     }
     leaf.tree<-tree.frame[tree.frame$split.var.name=="<leaf>",]
    
     if ( (max(leaf.tree$depth)==2)||(no.further.splits)||(length(var.list)==0) ) {
    
         stop.grow<-T
     }
    
    
  
 }
    
 # If we have impurity in the NA/NA node of the tree
 # that node is split again.
 # if (0) {
     leaf.tree<-tree.frame[tree.frame$split.var.name=="<leaf>",]
     if (min(leaf.tree$real.depth)==0) {
  
         all.na<-which.min(leaf.tree$real.depth)
         i<-leaf.tree$node.label[all.na]
         if ( sum(where.samp==i)!=0) {
            
             d.node<-d[where.samp==i,]
             split.prop<-best.split(d.node, var.list, i)
             if(!is.null(split.prop)){
  
                 split.using.variable <- split.prop$var.split.index
                 if (!is.numeric(d[,split.using.variable])){
                     split.tree<-split.discrete(d,tree.frame, i, split.prop, where.samp,roc.thres.ratio)
                     tree.frame<-split.tree$frame
                     where.samp<-split.tree$where
                 } else {
                     split.tree<-split.cont(d,tree.frame, i, split.prop, where.samp,roc.thres.ratio)
                     tree.frame<-split.tree$frame
                     where.samp<-split.tree$where
                 }   
             }
         }
  
     }
 # }
    
    
  
 return(list(frame=tree.frame, where=where.samp) )
 }
  
 #
 # Split a node using a discrete variable and update(and return) tree frame data structure
 # returns the updated tree frame data structure and the updated $where.samp$ vector
 # $d$ is the dataset
 # $tree.frame$  is the current data frame  that stores the description of the tree
 # $parent.node.label$ is the label of node to be split
 # $split.prop$ is a list that contains the name for the splitting variable and the value to be used for splitting
 # $where.samp$ a vector that  stores in which node  each sample in d ends up.
 # $roc.thres$ is the minimum ratio of fireds to nonfireds at which a node is classified as fired
  
 split.discrete <- function(d,tree.frame,parent.node.label,split.prop,where.samp,roc.thres)
 {
     y<-d$fired
  
     tree.size  <- dim(tree.frame)[1]-1
  
     var.index<-split.prop$var.split.index
     split.value<-split.prop$var.split.value
     varname<-names(d)[var.index]
     parent.node.index<-which(tree.frame$node.label==parent.node.label)
  
 left.node.indic  <- as.character(d[,varname])==as.character(split.value)
 right.node.indic <- !left.node.indic
 na.node.indic    <- is.na( d[,varname])
  
 left.node.indic [na.node.indic]<-F
 right.node.indic[na.node.indic]<-F
  
  
  
 left.node.indic  <- left.node.indic  &(where.samp==parent.node.label)
 right.node.indic <- right.node.indic &(where.samp==parent.node.label)
 na.node.indic    <- na.node.indic    &(where.samp==parent.node.label)
  
  
  
 where.samp[ left.node.indic  ] <- tree.size+1
 where.samp[ right.node.indic ] <- tree.size+2
 where.samp[ na.node.indic    ] <- tree.size+3
  
  
  
 fired.count.l<-sum(y[left.node.indic]==1)
 fired.count.r<-sum(y[right.node.indic]==1)
 fired.count.na<-sum(y[na.node.indic]==1)
  
 nonfired.count.l<-sum(y[left.node.indic]==0)
 nonfired.count.r<-sum(y[right.node.indic]==0)
 nonfired.count.na<-sum(y[na.node.indic]==0)
  
  
 leaf.call.l  <- (fired.count.l/(fired.count.l+nonfired.count.l))>roc.thres
 leaf.call.r  <- (fired.count.r/(fired.count.r+nonfired.count.r))>roc.thres
 leaf.call.na <- (fired.count.na/(fired.count.na+nonfired.count.na))>roc.thres
  
 new.node.labels <- c(tree.size+1,tree.size+2,tree.size+3)
 current.depth<-tree.frame$depth[parent.node.index]
 current.real.depth<-tree.frame$real.depth[parent.node.index]
  
  
  
 new.depth       <- rep(current.depth+1,3)
  
 new.real.depth  <- rep(c(current.real.depth+1,current.real.depth),c(2,1))
 new.split.varname   <- rep("<leaf>",3)
 new.split.var.value <- rep(NA,3)
 new.dec.rule        <- c("==","!=",NA)
 new.parent.node     <- rep(parent.node.label,3)
 new.call            <- c(leaf.call.l, leaf.call.r, leaf.call.na)
 new.fired.count     <- c(fired.count.l,fired.count.r,fired.count.na)
 new.nonfired.count  <- c(nonfired.count.l,nonfired.count.r, nonfired.count.na)
  
  
  
 new.frame<-as.data.frame(cbind(new.node.labels,new.depth,new.real.depth,
             new.split.varname, new.split.var.value, new.dec.rule, new.parent.node,new.call,new.fired.count,new.nonfired.count ),
             stringsAsFactors=F)
 names(new.frame)<-c("node.label" , "depth", "real.depth", "split.var.name",
                 "split.var.value", "dec.rule", "parent.node","call","count.fired", "count.nonfired")
  
 tree.frame<-rbind(tree.frame,new.frame)
  
  
  
  
 tree.frame$node.label<-as.numeric(tree.frame$node.label)
 tree.frame$depth<-as.numeric(tree.frame$depth)
 tree.frame$real.depth<-as.numeric(tree.frame$real.depth)
 tree.frame$parent.node<-as.numeric(tree.frame$parent.node)
 tree.frame$count.fired<-as.numeric(tree.frame$count.fired)
 tree.frame$count.nonfired<-as.numeric(tree.frame$count.nonfired)
  
  
 tree.frame$split.var.name[parent.node.index]<-varname
 tree.frame$split.var.value[parent.node.index]<-split.value
  
 return(list(frame= tree.frame, where=where.samp))
 }
  
 #
 # Split a node using a continous variable and update(and return) tree frame data structure
 # returns the updated tree frame data structure and the updated $where.samp$ vector
 # $d$ is the dataset
 # $tree.frame$  is the current data frame  that stores the description of the tree
 # $parent.node.label$ is the label of node to be split
 # $split.prop$ is a list that contains the name for the splitting variable and the value to be used for splitting
 # $where.samp$ a vector that  stores in which node  each sample in d ends up.
 # $roc.thres$ is the minimum ratio of fireds to nonfireds at which a node is classified as fired
 split.cont <- function(d,tree.frame,parent.node.label,split.prop,where.samp,roc.thres)
 {
 y<-d$fired
 tree.size  <- dim(tree.frame)[1] - 1
 var.index<-split.prop$var.split.index
 split.value<-split.prop$var.split.value
 varname<-names(d)[var.index]
  
  parent.node.index<-which(tree.frame$node.label==parent.node.label)
  
 left.node.indic  <- d[,varname]<=split.value
 right.node.indic <- !left.node.indic
 na.node.indic    <- is.na( d[,varname])
  
 left.node.indic [na.node.indic]<-F
 right.node.indic[na.node.indic]<-F
  
 left.node.indic  <- left.node.indic &(where.samp==parent.node.label)
 right.node.indic <- right.node.indic &(where.samp==parent.node.label)
 na.node.indic    <- na.node.indic &(where.samp==parent.node.label)
  
  
 where.samp[ left.node.indic  ] <- tree.size+1
 where.samp[ right.node.indic ] <- tree.size+2
 where.samp[ na.node.indic    ] <- tree.size+3
  
 fired.count.l<-sum(y[left.node.indic]==1)
 fired.count.r<-sum(y[right.node.indic]==1)
 fired.count.na<-sum(y[na.node.indic]==1)
  
 nonfired.count.l<-sum(y[left.node.indic]==0)
 nonfired.count.r<-sum(y[right.node.indic]==0)
 nonfired.count.na<-sum(y[na.node.indic]==0)
  
 leaf.call.l  <- (fired.count.l/(fired.count.l+nonfired.count.l))>roc.thres
 leaf.call.r  <- (fired.count.r/(fired.count.r+nonfired.count.r))>roc.thres
 leaf.call.na <- (fired.count.na/(fired.count.na+nonfired.count.na))>roc.thres
  
  
 new.node.labels <- c(tree.size+1,tree.size+2,tree.size+3)
 current.depth<-tree.frame$depth[parent.node.index]
 current.real.depth<-tree.frame$real.depth[parent.node.index]
  
  
  
 new.depth       <- rep(current.depth+1,3)
  
  
 new.real.depth  <- rep(c(current.real.depth+1,current.real.depth),c(2,1))
 new.split.varname   <- rep("<leaf>",3)
 new.split.var.value <- rep(NA,3)
 new.dec.rule        <- c("<=",">",NA)
 new.parent.node     <- rep(parent.node.label,3)
 new.call            <- c(leaf.call.l, leaf.call.r, leaf.call.na)
 new.fired.count     <- c(fired.count.l,fired.count.r,fired.count.na)
 new.nonfired.count  <- c(nonfired.count.l,nonfired.count.r, nonfired.count.na)
  
 new.frame<-as.data.frame(cbind(new.node.labels,new.depth,new.real.depth,
             new.split.varname, new.split.var.value,new.dec.rule,new.parent.node,new.call,new.fired.count,new.nonfired.count),
             stringsAsFactors=F)
 names(new.frame)<-c("node.label" , "depth", "real.depth", "split.var.name",
               "split.var.value", "dec.rule", "parent.node","call","count.fired", "count.nonfired")
  
 tree.frame<-rbind(tree.frame,new.frame)
  
  
 tree.frame$node.label<-as.numeric(tree.frame$node.label)
 tree.frame$depth<-as.numeric(tree.frame$depth)
 tree.frame$real.depth<-as.numeric(tree.frame$real.depth)
 tree.frame$parent.node<-as.numeric(tree.frame$parent.node)
 tree.frame$count.fired<-as.numeric(tree.frame$count.fired)
 tree.frame$count.nonfired<-as.numeric(tree.frame$count.nonfired)
  
  
 tree.frame$split.var.name[parent.node.index]<-varname
 tree.frame$split.var.value[parent.node.index]<-split.value
  
 return(list(frame= tree.frame, where=where.samp))
  
 }
  
 #
 # Computes goodness(decrease in entropy) of a particular split based on the feature in split.var
 # returns a goodness of split measure
 # $y$ is the vector class labels
 # $split.var$ is the feature vector(the splitting variable)
 # $thres$ is the value of the splitting variable used for splitting  the node
 # $is.discrete$ is a flag that is 1 if the splitting variable is a discrete variable
  
  goodness.of.split <- function (y,split.var,thres,is.discrete)
 {
     if ((sum(y==0)==0)||(sum(y==1)==0))
         return(0)
     na.node<-is.na(split.var)
     num.sample<-length(y)
     left.node<-rep(FALSE,num.sample)
     right.node<-rep(FALSE,num.sample)
  
     if (is.discrete) {
     left.node<-(as.character(split.var)==as.character(thres))
     left.node[na.node]<- F
  
     right.node<-(as.character(split.var)!=as.character(thres))   
     right.node[na.node]<- F
  
     } else {
    
     left.node<- split.var<=thres
     left.node[na.node]<- F
  
     right.node<- split.var > thres
     right.node[na.node]<- F
     }
    
     y.l<-y[left.node]
     y.r<-y[right.node]
     y.na<-y[na.node]
    
             
  
     if ((length(y.l)==0)||(length(y.r)==0))
         return(-5)
  
  
    
     orig.imp   <- impurity.at.node(sum(y==0)   ,sum(y==1   ))
     left.imp   <- impurity.at.node(sum(y.l==0) ,sum(y.l==1 ))
     right.imp  <- impurity.at.node(sum(y.r==0) ,sum(y.r==1 ))
     na.imp <- 0
     if(length(y.na)!=0)
         na.imp     <- impurity.at.node(sum(y.na==0),sum(y.na==1))
  
     wt.left<-length(y.l)
     wt.right<-length(y.r)
     wt.na<- length(y.na)
  
     wt.left<-   wt.left/ num.sample
     wt.right<-  wt.right/ num.sample
     wt.na <-    wt.na/ num.sample
     imp.diff<- wt.left*left.imp+wt.right*right.imp+wt.na*na.imp-orig.imp
        
     return(-1*imp.diff)
  
 }
  
 #
 # Entropy at a node
 #
  
 impurity.at.node <- function (y.1,y.2)
 {
     tot.y<-y.1+y.2
     impurity<-0
     if (tot.y==0) {
         print("Entropy Comp. Error")
         impurity<- -1
     } else {
     p.1<-y.1/(tot.y)
     p.2<-y.2/(tot.y)
     impurity<-0
     if ((p.1!=0)&(p.2!=0)) {
     impurity <- -1*(p.1*log2(p.1)+p.2*log2(p.2))
     } else {
     impurity <- 0      
     }
     }
     return(impurity)
 }
  
 #impurity.at.node<- function(y.1,y.2)
 #{
 #    tot.y<-y.1+y.2
 #    impurity<-0
 #    if (tot.y==0) {
 #        print("Entropy Comp. Error")
 #        impurity<- -1
 #    } else {
 #    impurity<- min(y.1,y.2)/tot.y
 #    }
 #    return(impurity)
 #
 #}
  
 #impurity.at.node<- function(y.1,y.2)
 #{
 #    tot.y<-y.1+y.2
 #    impurity<-0
 #    if (tot.y==0) {
 #        print("Entropy Comp. Error")
 #        impurity<- -1
 #    } else {
 #    impurity<- y.1*y.2/tot.y^2
 #    }
 #    return(impurity)
 #
 #}
  
  
 predict.na.tree<-function(tree.frame, test.sample, numeric.or.factor)
 {
    
     at.leaf.node<-F
     at.node<- 0
     test.where <- 0
     child.frame.index <- 0
     while(at.leaf.node==F) {
         node.frame.index<- which(tree.frame$node.label==at.node)   
        
  
         split.variable.name<-tree.frame$split.var.name[node.frame.index]
  
         threshold.value<-tree.frame$split.var.value[node.frame.index]
        
        
         test.value<-test.sample[split.variable.name]
         if (numeric.or.factor[split.variable.name]) {
             test.value <- as.numeric(test.value)
             threshold.value <- as.numeric(threshold.value)
  
        
             NA.child.node<-is.na(test.value)
             if (!NA.child.node) {
                 left.node<- test.value<=threshold.value
                 if (left.node) {
                     child.frame.index <- which((tree.frame$parent.node==at.node) & (tree.frame$dec.rule=="<=") )
                
                 } else {
                     child.frame.index <- which((tree.frame$parent.node==at.node) & (tree.frame$dec.rule==">") )
                 }              
             } else {
             child.frame.index <- which((tree.frame$parent.node==at.node) & (is.na(tree.frame$dec.rule) ) )
  
             }              
         } else {
             test.value <- as.character(test.value)
             threshold.value <- as.character(threshold.value)
        
             NA.child.node<-is.na(test.value)
             if (!NA.child.node) {
                 left.node<- test.value==threshold.value
                 if (left.node) {
                     child.frame.index <- which((tree.frame$parent.node==at.node) & (tree.frame$dec.rule=="==") )               
                 } else {
                     child.frame.index <- which((tree.frame$parent.node==at.node) & (tree.frame$dec.rule=="!=") )
                 }              
             } else {
  
                 child.frame.index <- which((tree.frame$parent.node==at.node) & (is.na(tree.frame$dec.rule) ) )
  
             }
  
         }
        
         at.node<-tree.frame$node.label[child.frame.index]
         at.leaf.node <- (tree.frame$split.var.name[child.frame.index] =="<leaf>")
  
         if (at.leaf.node) {
             test.where<-at.node
         }
         node.frame.index<- which(tree.frame$node.label==at.node)    
        
     }  
     class.call<-tree.frame$call[node.frame.index]
     if (is.na(class.call)) {
         print(paste("leaf.node.index",node.frame.index))
         print(tree.frame)
         class.call<-F
     }
  
        
     return(list(call=class.call,where=test.where))
 }
  
  
  
 draw.node<-function(tree.frame,node.drawn)
 {
 node.index<-which(tree.frame$node.label==node.drawn)
 if (node.index==0) {
     print("No such node exists")
     return()
 }
 parent.node.label<-tree.frame$parent.node[node.index]
 parent.node.index<-which(tree.frame$node.label==parent.node.label)
 num.fired<-tree.frame$count.fired[node.index]
 num.nonfired<-tree.frame$count.nonfired[node.index]
  
 num.total<-num.fired+num.nonfired
 node.depth<-tree.frame$depth[node.index]
 node.str<-character()
 if (node.depth>0) {
     for (i in 1:node.depth) {
     node.str<-paste(node.str,"\t")
    
     }
 }
 node.str<-paste(node.str,"Node ",node.drawn,": ")
 if (is.na(parent.node.label)) {
     node.str<-paste(node.str, "Root node  ", "  Fired" ,num.fired, "Nonfired",num.nonfired,  "Total", num.total)
  
 } else if (!is.na(tree.frame$dec.rule[node.index])) {
     node.str<-paste(node.str, tree.frame$split.var.name[parent.node.index],tree.frame$dec.rule[node.index],
     tree.frame$split.var.value[parent.node.index],
     "  Fired" ,num.fired, "Nonfired",num.nonfired,  "Total", num.total)
 } else {
     node.str<-paste(node.str, tree.frame$split.var.name[parent.node.index] ,"is.na",
     "  Fired" ,num.fired, "Nonfired",num.nonfired,  "Total", num.total)
 }
 node.str<-paste(node.str,"\n")
 cat(node.str)
 }
  
  
  
  
 draw.tree<-function(tree.frame)
 {
  
 all.nodes.drawn<-F
 current.node<-0
 tree.size<-dim(tree.frame)[1]
  
 drawn.nodes<-rep(0,tree.size)
 node.queue<-c()
 while(all.nodes.drawn==F) {
  
     draw.node(tree.frame,current.node)
     current.node.index <- which(tree.frame$node.label ==current.node)
     child.nodes.index  <- which(tree.frame$parent.node==current.node)
     if (length(child.nodes.index)==0) {
     } else {
         child.nodes <- tree.frame$node.label[child.nodes.index]
         node.queue  <- c(child.nodes,node.queue)
     }
     node.queue  <-    node.queue[ node.queue!=current.node]
  
     drawn.nodes[current.node.index] <-1
     if(sum(drawn.nodes)==tree.size) {
         all.nodes.drawn <- T
     }
     else current.node<-node.queue[1]
  
 }
 }
Personal tools
Project Infrastructures