NA Tree RCodeCommented

From CVRG Wiki

Jump to: navigation, search
 make.fired.5050<-function(d)
 {
    # Splite d into two half.  One is for fired data. The other is for not fired data.
    d.fired<-d[d$fired==1,]
    d.not.fired<-d[!d$fired,]
    # Calculate how many fired data observations.
    k.fired<-dim(d.fired)[1]
    # Calculate how many not fired data observations.
    k.not.fired<-dim(d.not.fired)[1]
    # Get the ratio between number of fired observations and not fired observations (and
    # round down to the largest integer).
    rat<-floor(k.not.fired/k.fired)
  
    if (rat>0)
    {
       # If ratio is larger than 0, store d.fired to d.temp.
       d.temp<-d.fired
       if (rat>1)
       {
          # If ratio is larger than 1, we replicate attach d.fired to the end of d.temp
          # for rat-1 times. (ratio being larger than 1 means number of fired are at least
          # twice as many as not number of fired.)
          for (i in seq(1,rat-1))
          {
             d.temp<-rbind(d.temp,d.fired)
          }
       }
    }
    else
    {
       # If ratio is smaller than or equal to 0, give a error message.
       print("error with replication: no firings for this dataset\n")
    }
    # Conbind d.temp and d.not fired together.
    rbind(d.temp,d.not.fired)
 }
  
  
 #######################################################################################
 #
 # Function finds best split in the node given data for all the variables in list.vars
 # returns a list that includes the name of the feature best for a split, a measure for the goodness of the split, and the best splitting value for that feature
 # d.node is the subset of data this node contains.
 # list.vars is the list of column numbers of the features
 # node.number is the node label of the node that's being considered for a split
  
  
 best.split<-function(d.node,list.vars,node.number)
 {
 # Assign d.node to d.
 d<-d.node
 # Assign "fired" column of data to y.
 y<-d$fired
 # Assign node.n the number of observations in total.
 node.n<-dim(d)[1]
  
 # print(paste( "dimension of d.node",dim(d)))
  
 # Create empty list fo goodness.list, var.key.list and var.value.list.
 goodness.list<-list()
 var.key.list<-list()
 var.value.list<-list()
 #For each variable and each threshold(all levels for discrete
 #and limited number of thresholds for continous) compute the goodness
 #and return maximum
 #var.key.list and var.key stores which variable the split is for
 #goodness.list and goodness.vec stores the value of goodness for that split
  
 #var.value holds the splitting value(threshold for continous, or equal/notequal check
 #for continous
  
 #For discrete variables only partition of (single levels vs all others) are checked, l checks for l levels
 #instead of 2^(l-1)-1 checks(all possible partitions of levels).
 #This only matters if there are more than 3 levels
  
 # For every variable in the list.vars, we run the following program to get an optimal split.
 for (i in list.vars) {
    
     # Create empty list for variable name, goodness of fit and threshold.
     var.index.vec<-c()
     goodness.vec<-c()
     var.value.vec<-c()
    
     # If the variable is a numeric
     if (is.numeric(d[,i])) {
        
         # Assign discrete indicator to FALSE, which is used later in goodness.of.split()
         is.discrete<-F
         # Create empty list for sample quantiles.
         samp.qt<-c()
         # Get the number of not NA values for the variable.
         node.not.na<-sum(!is.na(d[,i]))
        
         # Choose number of possible thresholds
         # if number of samples in node is not large enough, try all the possible thresholds
        
         # If number of not NA values is larger than or equal to 10
         if (node.not.na >= 10){
             # Create a list of percentages of (0.2 0.5 0.8)
             list.qt<-seq(0.2,0.8,by=0.3)
             # Get corresponding quantiles in the variable.
             samp.qt<- quantile(d[,i],probs=list.qt,na.rm=T)
        
         } else {
             # If the number of not NA values is less than 10
             # We only test one threshold which is the mean.
             samp.qt<-mean(d[,i],na.rm=T)
         }
         # Assign samp.qt to var.value.vec
         var.value.vec<-samp.qt
        
         # For every quantiles in samp.qt, we run the loop to generate lists of
         # goodness of fit and variable names.
         for (j in samp.qt) {
             # Get the goodness of fit for every quantiles.
             goodness.val<-goodness.of.split(y,d[,i],j,is.discrete)
             # Put goodness of fit into a vector.
             goodness.vec<-c(goodness.vec, goodness.val)
             # Put variable index into a vector.
             var.index.vec<-c(var.index.vec,i)
             # var.value.vec<-c( var.value.vec, as.character(j))
         }
     }  else {
         # If the variable d[,i] is discrete, then run the following program.
    
         # Convert the variable into factor and store it to "disc.var".
         disc.var<-as.factor(d[,i])
         # Assign discrete indicator to TRUE, which is used later in goodness.of.split()
         is.discrete<-T
         # List all the unique values of disc.var.
         disc.levels<-levels(disc.var)
         # Put all the unique values into a vector.
         var.value.vec<-disc.levels
        
      
         # For every value in disc.level, we run the loop to generate lists of
         # goodness of fit and variable names. (The same as numeric varialbe case)
         for (thres in disc.levels) {
             goodness.val<-goodness.of.split(y, disc.var, thres, is.discrete)
             goodness.vec<-c(goodness.vec, goodness.val)
             var.index.vec<-c(var.index.vec,i)  
         }
     }
  
     # Append to list of goodness values
     # var.key.list hold which variable  the goodness value is for
     goodness.list<-list(goodness.list,goodness.vec)
     var.key.list<-list(var.key.list,var.index.vec)
     var.value.list<-list(var.value.list, var.value.vec)
    
 }
 # Unlist the goodness.list, var,key.list and var.value.list so that they become
 # three vectors, instead of three lists.
 goodness.unlist  <- unlist(goodness.list)
 var.key.unlist   <- unlist(var.key.list)
 var.value.unlist <- unlist(var.value.list)
  
 # Look for the maximum in the vector of "goodness.unlist", and find the corresponding
 # entry in vector of "var.key.unlist".
 argmax.goodness<-which.max(goodness.unlist)
 var.best.split<-var.key.unlist[argmax.goodness]
  
 # If the varialbe we just found is numeric, store the threshold as a numeric value to
 # "var.thres", otherwise, store it as it is.
 if (is.numeric(d[, var.best.split])) {
     var.thres<-as.numeric(var.value.unlist[argmax.goodness])
 }  else {
     var.thres<-var.value.unlist[argmax.goodness]
 }
  
 # Record the maximum value in the vector of "goodness.unlist".
 best.goodness.in.list <- goodness.unlist[argmax.goodness]
  
 # If there is no split that decreases impurity in the node
 # return NULL, otherwise, return the properties of the best split, which includes
 # maximum value of goodness of fit, best variable index,  the corresponding threshold
 # value.
 if (best.goodness.in.list>1e-06) {
     return(list(best.goodness = best.goodness.in.list, var.split.index = var.best.split ,
             var.split.value = var.thres ))
 } else {
 return(NULL)
 }
    
 }
  
  
 #######################################################################################
 #
 # Initializes tree frame data structure
 #
 tree.init <- function(d)
 {
 # Create basic variables for the tree frame
 # And initiate them with value 0, NA or empty vector.
 # For split variable name, assign it "<leaf>"s for now.
 node.depth<-c(0)
 node.real.depth<-c(0)
 node.split.var.name<-c("<leaf>")
 node.split.var.value<-c(NA)
 labels<-c(0)
 node.dec.rule<-NA
 node.parent.node<-NA
  
 # Record number of observations with firing and not firings.
 num.fired<-sum(d$fired==1)
 num.nonfired<-sum(d$fired==0)
  
 # If number of firing is smaller than nu.fired, assign TRUE to class.call, otherwise FALSE.
 class.call <-c(num.nonfired  <= num.fired  )
  
 # Create tree frame as data.frame object with fields of node label, depth, real depth,
 # split variable name, split variable value, decisioni rule, parent node, call, number of firings
 # and number of not firings.
 tree.frame<-as.data.frame(list(node.label=labels, depth=node.depth, real.depth=node.real.depth,
             split.var.name=node.split.var.name, split.var.value =node.split.var.value,
             dec.rule=node.dec.rule, parent.node = node.parent.node, call=class.call, count.fired=num.fired,
             count.nonfired=num.nonfired),
             stringsAsFactors=F)
 # Return the tree frame.
 return(tree.frame)
 }
  
  
 ##############################################################################################
 #
 # Grows a tree starting from the root node using variables with indices in $var.list$
 # returns a list containing the data frame that stores the description of the tree ,
 # and a vector that  stores in which node  each sample in d ends up.
 # $d$ is the dataset
 # $tree.frame$  is the initalized data frame  that stores the description of the tree
 # $var.list$ is the list of column numbers of the features
 # $roc.thres.ratio$ is the minimum ratio of fireds to nonfireds at which a node is classified
 # as fired
  
 tree.grow<- function(d, tree.frame,tree.size, var.list,roc.thres.ratio)
 {
  
 # Assign n as the number of observations in the dataset.
 n<-dim(d)[1]
 # Assign a vector of 0s to where.samp.
 where.samp<-rep(0,n)
  
 # Assign FALSE to stop.grow indicator.
 stop.grow<-F
 # Assign TRUE to no.further.splits indicator.
 no.further.splits<-T
  
 while(stop.grow==F) {
     # Get labels of all the leaves.
     leaf.labels<-tree.frame$node.label[tree.frame$split.var.name=="<leaf>"]
     # print(leaf.labels)
    
     # Assign TRUE to no.further.splits indicator.
     no.further.splits<-T
     # Assign 0 to no.further.splits indicator.
     split.using.variable <- 0
    
     # For every leaf, we run the following procedure to grow the tree.
     for (i in leaf.labels) {
    
         # if node has no samples skip the current loop.
         if ( sum(where.samp==i)==0)  next
            
         # Assign d.node the dataset at node i.
         d.node<-d[where.samp==i,]
         # print(dim(d.node))
         # Call best.split() and get the best split variable and threshold for node i.
         split.prop<-best.split(d.node, var.list, i)
         # print(split.prop)
         # print(paste("i  is ",i))
         # If split.prop returns a variable instead of Null, then split the tree and
         # add more rows in the tree frame table.
         if(!is.null(split.prop)){
             # Assign FALSE to no.further.splits
             no.further.splits<-F
             # Get the split variable
             split.using.variable <- split.prop$var.split.index
             # If the split variable is numeric, use split.cont(), otherwise, use split.discrete()
             # to add more rows in tree frame and to change where.samp.
             if (!is.numeric(d[,split.using.variable])){
                 # Call function split.discrete()
                 split.tree<-split.discrete(d,tree.frame, i, split.prop, where.samp,roc.thres.ratio)
                 tree.frame<-split.tree$frame
                 where.samp<-split.tree$where
             } else {
                 # Call function split.discrete()
                 split.tree<-split.cont(d,tree.frame, i, split.prop, where.samp,roc.thres.ratio)
                 tree.frame<-split.tree$frame
                 where.samp<-split.tree$where
             }   
         }
         # print(where.samp)
         # draw.tree(tree.frame)
     }
     # extract "leaf" part of tree frame and assign it leaf.tree.
     leaf.tree<-tree.frame[tree.frame$split.var.name=="<leaf>",]
     # print(tree.frame)
     # print(min(leaf.tree$depth))
    
  
     # If the NA tree reaches the depth of 2, or no.further.splits is TRUE, or there
     # is no variables in var.list, then stop growing the tree (i.e. stop grow<-T).
     if ( (max(leaf.tree$depth)==2)||(no.further.splits)||(length(var.list)==0) ) {
    
         stop.grow<-T
     }
    
    
 # print(paste("No Further Splits",no.further.splits) )
  
 }
    
 # If we have impurity in the NA/NA node of the tree
 # that node is split again.
 # if (0) {
     # extract "leaf" part of tree frame and assign it leaf.tree.
     leaf.tree<-tree.frame[tree.frame$split.var.name=="<leaf>",]
     # If the minimium of real depth is 0, we assign all.na with locations of real.depth equals to 0.
     if (min(leaf.tree$real.depth)==0) {
  
         all.na<-which.min(leaf.tree$real.depth)
         # Extract the labels of 0 real depth part of leaf.tree.
         i<-leaf.tree$node.label[all.na]
         if ( sum(where.samp==i)!=0) {
            
             d.node<-d[where.samp==i,]
             # print(dim(d.node))
             split.prop<-best.split(d.node, var.list, i)
             # print(split.prop)
             # print(paste("i  is ",i))
             if(!is.null(split.prop)){
  
                 split.using.variable <- split.prop$var.split.index
                 if (!is.numeric(d[,split.using.variable])){
                     split.tree<-split.discrete(d,tree.frame, i, split.prop, where.samp,roc.thres.ratio)
                     tree.frame<-split.tree$frame
                     where.samp<-split.tree$where
                 } else {
                     split.tree<-split.cont(d,tree.frame, i, split.prop, where.samp,roc.thres.ratio)
                     tree.frame<-split.tree$frame
                     where.samp<-split.tree$where
                 }   
             }
         }
  
     }
 # }
    
    
  
 return(list(frame=tree.frame, where=where.samp) )
 }
  
 #
 # Split a node using a discrete variable and update(and return) tree frame data structure
 # returns the updated tree frame data structure and the updated $where.samp$ vector
 # $d$ is the dataset
 # $tree.frame$  is the current data frame  that stores the description of the tree
 # $parent.node.label$ is the label of node to be split
 # $split.prop$ is a list that contains the name for the splitting variable and the value to be used for splitting
 # $where.samp$ a vector that  stores in which node  each sample in d ends up.
 # $roc.thres$ is the minimum ratio of fireds to nonfireds at which a node is classified as fired
  
 split.discrete <- function(d,tree.frame,parent.node.label,split.prop,where.samp,roc.thres)
 {
     y<-d$fired
  
     tree.size  <- dim(tree.frame)[1]-1
  
     var.index<-split.prop$var.split.index
     split.value<-split.prop$var.split.value
     varname<-names(d)[var.index]
     parent.node.index<-which(tree.frame$node.label==parent.node.label)
  
 #
 # Create three new child nodes
 #
 left.node.indic  <- as.character(d[,varname])==as.character(split.value)
 right.node.indic <- !left.node.indic
 na.node.indic    <- is.na( d[,varname])
  
 left.node.indic [na.node.indic]<-F
 right.node.indic[na.node.indic]<-F
  
  
  
 left.node.indic  <- left.node.indic  &(where.samp==parent.node.label)
 right.node.indic <- right.node.indic &(where.samp==parent.node.label)
 na.node.indic    <- na.node.indic    &(where.samp==parent.node.label)
  
  
  
 where.samp[ left.node.indic  ] <- tree.size+1
 where.samp[ right.node.indic ] <- tree.size+2
 where.samp[ na.node.indic    ] <- tree.size+3
  
  
 #print(paste("parent.node ", parent.node.label) )
 #print(paste("parent.node.label is numeric ", is.numeric(parent.node.label)) )
 #print(paste("parent.node row ", parent.node.label+1) )
  
 #print(tree.frame)
 #print(paste("current depth is ",tree.frame$depth))
  
 #print(paste("current depth is ",tree.frame[parent.node.label+1,2]))
  
 fired.count.l<-sum(y[left.node.indic]==1)
 fired.count.r<-sum(y[right.node.indic]==1)
 fired.count.na<-sum(y[na.node.indic]==1)
  
 nonfired.count.l<-sum(y[left.node.indic]==0)
 nonfired.count.r<-sum(y[right.node.indic]==0)
 nonfired.count.na<-sum(y[na.node.indic]==0)
  
  
 leaf.call.l  <- (fired.count.l/(fired.count.l+nonfired.count.l))>roc.thres
 leaf.call.r  <- (fired.count.r/(fired.count.r+nonfired.count.r))>roc.thres
 leaf.call.na <- (fired.count.na/(fired.count.na+nonfired.count.na))>roc.thres
  
 new.node.labels <- c(tree.size+1,tree.size+2,tree.size+3)
 current.depth<-tree.frame$depth[parent.node.index]
 current.real.depth<-tree.frame$real.depth[parent.node.index]
  
  
  
 new.depth       <- rep(current.depth+1,3)
  
 new.real.depth  <- rep(c(current.real.depth+1,current.real.depth),c(2,1))
 new.split.varname   <- rep("<leaf>",3)
 new.split.var.value <- rep(NA,3)
 new.dec.rule        <- c("==","!=",NA)
 new.parent.node     <- rep(parent.node.label,3)
 new.call            <- c(leaf.call.l, leaf.call.r, leaf.call.na)
 new.fired.count     <- c(fired.count.l,fired.count.r,fired.count.na)
 new.nonfired.count  <- c(nonfired.count.l,nonfired.count.r, nonfired.count.na)
  
  
 #
 # add the new nodes to the tree.frame data frame
 #
  
 new.frame<-as.data.frame(cbind(new.node.labels,new.depth,new.real.depth,
             new.split.varname, new.split.var.value, new.dec.rule, new.parent.node,new.call,new.fired.count,new.nonfired.count ),
             stringsAsFactors=F)
 names(new.frame)<-c("node.label" , "depth", "real.depth", "split.var.name",
                 "split.var.value", "dec.rule", "parent.node","call","count.fired", "count.nonfired")
 #print(new.frame)
  
 tree.frame<-rbind(tree.frame,new.frame)
  
  
  
  
 tree.frame$node.label<-as.numeric(tree.frame$node.label)
 tree.frame$depth<-as.numeric(tree.frame$depth)
 tree.frame$real.depth<-as.numeric(tree.frame$real.depth)
 tree.frame$parent.node<-as.numeric(tree.frame$parent.node)
 tree.frame$count.fired<-as.numeric(tree.frame$count.fired)
 tree.frame$count.nonfired<-as.numeric(tree.frame$count.nonfired)
  
 #
 # update parent node
 #
  
  
 tree.frame$split.var.name[parent.node.index]<-varname
 tree.frame$split.var.value[parent.node.index]<-split.value
  
 return(list(frame= tree.frame, where=where.samp))
 }
  
 #
 # Split a node using a continous variable and update(and return) tree frame data structure
 # returns the updated tree frame data structure and the updated $where.samp$ vector
 # $d$ is the dataset
 # $tree.frame$  is the current data frame  that stores the description of the tree
 # $parent.node.label$ is the label of node to be split
 # $split.prop$ is a list that contains the name for the splitting variable and the value to be used for splitting
 # $where.samp$ a vector that  stores in which node  each sample in d ends up.
 # $roc.thres$ is the minimum ratio of fireds to nonfireds at which a node is classified as fired
 split.cont <- function(d,tree.frame,parent.node.label,split.prop,where.samp,roc.thres)
 {
 y<-d$fired
 # print(split.prop)
 tree.size  <- dim(tree.frame)[1] - 1
 var.index<-split.prop$var.split.index
 split.value<-split.prop$var.split.value
 varname<-names(d)[var.index]
  
  parent.node.index<-which(tree.frame$node.label==parent.node.label)
  
 # print(paste("is.numeric.value ",is.numeric(split.value)))
  
 #
 # Create three new child nodes
 #
 left.node.indic  <- d[,varname]<=split.value
 right.node.indic <- !left.node.indic
 na.node.indic    <- is.na( d[,varname])
  
 left.node.indic [na.node.indic]<-F
 right.node.indic[na.node.indic]<-F
  
 left.node.indic  <- left.node.indic &(where.samp==parent.node.label)
 right.node.indic <- right.node.indic &(where.samp==parent.node.label)
 na.node.indic    <- na.node.indic &(where.samp==parent.node.label)
  
  
 where.samp[ left.node.indic  ] <- tree.size+1
 where.samp[ right.node.indic ] <- tree.size+2
 where.samp[ na.node.indic    ] <- tree.size+3
  
 fired.count.l<-sum(y[left.node.indic]==1)
 fired.count.r<-sum(y[right.node.indic]==1)
 fired.count.na<-sum(y[na.node.indic]==1)
  
 nonfired.count.l<-sum(y[left.node.indic]==0)
 nonfired.count.r<-sum(y[right.node.indic]==0)
 nonfired.count.na<-sum(y[na.node.indic]==0)
  
 leaf.call.l  <- (fired.count.l/(fired.count.l+nonfired.count.l))>roc.thres
 leaf.call.r  <- (fired.count.r/(fired.count.r+nonfired.count.r))>roc.thres
 leaf.call.na <- (fired.count.na/(fired.count.na+nonfired.count.na))>roc.thres
  
  
 new.node.labels <- c(tree.size+1,tree.size+2,tree.size+3)
 current.depth<-tree.frame$depth[parent.node.index]
 current.real.depth<-tree.frame$real.depth[parent.node.index]
  
  
  
 new.depth       <- rep(current.depth+1,3)
  
  
 new.real.depth  <- rep(c(current.real.depth+1,current.real.depth),c(2,1))
 new.split.varname   <- rep("<leaf>",3)
 new.split.var.value <- rep(NA,3)
 new.dec.rule        <- c("<=",">",NA)
 new.parent.node     <- rep(parent.node.label,3)
 new.call            <- c(leaf.call.l, leaf.call.r, leaf.call.na)
 new.fired.count     <- c(fired.count.l,fired.count.r,fired.count.na)
 new.nonfired.count  <- c(nonfired.count.l,nonfired.count.r, nonfired.count.na)
  
 new.frame<-as.data.frame(cbind(new.node.labels,new.depth,new.real.depth,
             new.split.varname, new.split.var.value,new.dec.rule,new.parent.node,new.call,new.fired.count,new.nonfired.count),
             stringsAsFactors=F)
 names(new.frame)<-c("node.label" , "depth", "real.depth", "split.var.name",
               "split.var.value", "dec.rule", "parent.node","call","count.fired", "count.nonfired")
  
 tree.frame<-rbind(tree.frame,new.frame)
  
  
 tree.frame$node.label<-as.numeric(tree.frame$node.label)
 tree.frame$depth<-as.numeric(tree.frame$depth)
 tree.frame$real.depth<-as.numeric(tree.frame$real.depth)
 tree.frame$parent.node<-as.numeric(tree.frame$parent.node)
 tree.frame$count.fired<-as.numeric(tree.frame$count.fired)
 tree.frame$count.nonfired<-as.numeric(tree.frame$count.nonfired)
 #
 # update parent node
 #
  
  
 tree.frame$split.var.name[parent.node.index]<-varname
 tree.frame$split.var.value[parent.node.index]<-split.value
  
 # print(paste("update parent.node", varname,"  ",split.value) )
  
 return(list(frame= tree.frame, where=where.samp))
  
 }
  
 #
 # Computes goodness(decrease in entropy) of a particular split based on the feature in split.var
 # returns a goodness of split measure
 # $y$ is the vector class labels
 # $split.var$ is the feature vector(the splitting variable)
 # $thres$ is the value of the splitting variable used for splitting  the node
 # $is.discrete$ is a flag that is 1 if the splitting variable is a discrete variable
  
  goodness.of.split <- function (y,split.var,thres,is.discrete)
 {
 # If a split is not necessary, return 0
     if ((sum(y==0)==0)||(sum(y==1)==0))
         return(0)
     na.node<-is.na(split.var)
     num.sample<-length(y)
     left.node<-rep(FALSE,num.sample)
     right.node<-rep(FALSE,num.sample)
  
     # print(num.sample)
     #print(paste("threshold",thres))
     if (is.discrete) {
     left.node<-(as.character(split.var)==as.character(thres))
     left.node[na.node]<- F
  
     right.node<-(as.character(split.var)!=as.character(thres))    
     right.node[na.node]<- F
  
     } else {
    
     left.node<- split.var<=thres
     left.node[na.node]<- F
  
     right.node<- split.var > thres
     right.node[na.node]<- F
     }
    
     y.l<-y[left.node]
     y.r<-y[right.node]
     y.na<-y[na.node]
    
             
 #   print(paste("class 0, class 1",sum(y==0)   ,sum(y==1 )))
 #   print(paste("left.node size",length(y.l),sum(y.l==0)   ,sum(y.l==1 )))
 #   print(paste("right.node size",length(y.r),sum(y.r==0)  ,sum(y.r==1 )))
 #   print(paste("na.node size",length(y.na),sum(y.na==0)   ,sum(y.na==1 )))
  
     if ((length(y.l)==0)||(length(y.r)==0))
         return(-5)
  
  
    
     orig.imp   <- impurity.at.node(sum(y==0)   ,sum(y==1   ))
     left.imp   <- impurity.at.node(sum(y.l==0) ,sum(y.l==1 ))
     right.imp  <- impurity.at.node(sum(y.r==0) ,sum(y.r==1 ))
     na.imp <- 0
     if(length(y.na)!=0)
         na.imp     <- impurity.at.node(sum(y.na==0),sum(y.na==1))
  
     wt.left<-length(y.l)
     wt.right<-length(y.r)
     wt.na<- length(y.na)
  
     wt.left<-   wt.left/ num.sample
     wt.right<-  wt.right/ num.sample
     wt.na <-    wt.na/ num.sample
     imp.diff<- wt.left*left.imp+wt.right*right.imp+wt.na*na.imp-orig.imp
 #   print(paste("imp.diff",imp.diff))
        
     return(-1*imp.diff)
  
 }
  
 #
 # Entropy at a node
 #
  
 impurity.at.node <- function (y.1,y.2)
 {
     tot.y<-y.1+y.2
     impurity<-0
     if (tot.y==0) {
         print("Entropy Comp. Error")
         impurity<- -1
     } else {
     p.1<-y.1/(tot.y)
     p.2<-y.2/(tot.y)
     impurity<-0
     if ((p.1!=0)&(p.2!=0)) {
     impurity <- -1*(p.1*log2(p.1)+p.2*log2(p.2))
     } else {
     impurity <- 0      
     }
     }
     return(impurity)
 }
  
 #impurity.at.node<- function(y.1,y.2)
 #{
 #    tot.y<-y.1+y.2
 #    impurity<-0
 #    if (tot.y==0) {
 #        print("Entropy Comp. Error")
 #        impurity<- -1
 #    } else {
 #    impurity<- min(y.1,y.2)/tot.y
 #    }
 #    return(impurity)
 #
 #}
  
 #impurity.at.node<- function(y.1,y.2)
 #{
 #    tot.y<-y.1+y.2
 #    impurity<-0
 #    if (tot.y==0) {
 #        print("Entropy Comp. Error")
 #        impurity<- -1
 #    } else {
 #    impurity<- y.1*y.2/tot.y^2
 #    }
 #    return(impurity)
 #
 #}
  
  
 predict.na.tree<-function(tree.frame, test.sample, numeric.or.factor)
 {
    
     at.leaf.node<-F
     at.node<- 0
     test.where <- 0
     child.frame.index <- 0
     while(at.leaf.node==F) {
         node.frame.index<- which(tree.frame$node.label==at.node)   
        
         # print(paste("node.frame.index",node.frame.index))
  
         split.variable.name<-tree.frame$split.var.name[node.frame.index]
 #       print(paste("split.variable.name",split.variable.name))
 #       print(paste("numeric.or.factor",numeric.or.factor[split.variable.name]))
  
         threshold.value<-tree.frame$split.var.value[node.frame.index]
        
        
         #       print(split.variable.name)
         test.value<-test.sample[split.variable.name]
 #       print(numeric.or.factor)
  
 #       print(split.variable.name)
         if (numeric.or.factor[split.variable.name]) {
             #
             # the splitting variable is continous
             #
             test.value <- as.numeric(test.value)
             threshold.value <- as.numeric(threshold.value)
  
        
             NA.child.node<-is.na(test.value)
             if (!NA.child.node) {
                 left.node<- test.value<=threshold.value
                 if (left.node) {
                     child.frame.index <- which((tree.frame$parent.node==at.node) & (tree.frame$dec.rule=="<=") )
                
                 } else {
                     child.frame.index <- which((tree.frame$parent.node==at.node) & (tree.frame$dec.rule==">") )
                 }              
             } else {
             child.frame.index <- which((tree.frame$parent.node==at.node) & (is.na(tree.frame$dec.rule) ) )
  
             }              
         } else {
             #
             # if the splitting variable is discrete
             #
             test.value <- as.character(test.value)
             threshold.value <- as.character(threshold.value)
        
             NA.child.node<-is.na(test.value)
             if (!NA.child.node) {
                 left.node<- test.value==threshold.value
                 if (left.node) {
                     child.frame.index <- which((tree.frame$parent.node==at.node) & (tree.frame$dec.rule=="==") )               
                 } else {
                     child.frame.index <- which((tree.frame$parent.node==at.node) & (tree.frame$dec.rule=="!=") )
                 }              
             } else {
  
                 child.frame.index <- which((tree.frame$parent.node==at.node) & (is.na(tree.frame$dec.rule) ) )
  
             }
  
         }
        
         #Child node becomes the current node
         at.node<-tree.frame$node.label[child.frame.index]
         #Check if this current node is  a leaf.
         at.leaf.node <- (tree.frame$split.var.name[child.frame.index] =="<leaf>")
        # print(paste("child.frame.index",child.frame.index))
  
         #print(paste("at.node",at.node))
  
         #print(paste("at.leaf.node",at.leaf.node))
         if (at.leaf.node) {
             test.where<-at.node
         }
         #set the index to the row whose node.label is the the current node
         node.frame.index<- which(tree.frame$node.label==at.node)   
        
     }  
     class.call<-tree.frame$call[node.frame.index]
     if (is.na(class.call)) {
         print(paste("leaf.node.index",node.frame.index))
         print(tree.frame)
         class.call<-F
     }
  
        
     return(list(call=class.call,where=test.where))
 }
  
  
  
 draw.node<-function(tree.frame,node.drawn)
 {
 node.index<-which(tree.frame$node.label==node.drawn)
 if (node.index==0) {
     print("No such node exists")
     return()
 }
 parent.node.label<-tree.frame$parent.node[node.index]
 parent.node.index<-which(tree.frame$node.label==parent.node.label)
 num.fired<-tree.frame$count.fired[node.index]
 num.nonfired<-tree.frame$count.nonfired[node.index]
  
 num.total<-num.fired+num.nonfired
 node.depth<-tree.frame$depth[node.index]
 node.str<-character()
 if (node.depth>0) {
     for (i in 1:node.depth) {
     node.str<-paste(node.str,"\t")
    
     }
 }
 node.str<-paste(node.str,"Node ",node.drawn,": ")
 if (is.na(parent.node.label)) {
     node.str<-paste(node.str, "Root node  ", "  Fired" ,num.fired, "Nonfired",num.nonfired,  "Total", num.total)
  
 } else if (!is.na(tree.frame$dec.rule[node.index])) {
     node.str<-paste(node.str, tree.frame$split.var.name[parent.node.index],tree.frame$dec.rule[node.index],
     tree.frame$split.var.value[parent.node.index],
     "  Fired" ,num.fired, "Nonfired",num.nonfired,  "Total", num.total)
 } else {
     node.str<-paste(node.str, tree.frame$split.var.name[parent.node.index] ,"is.na",
     "  Fired" ,num.fired, "Nonfired",num.nonfired,  "Total", num.total)
 }
 node.str<-paste(node.str,"\n")
 cat(node.str)
 }
  
  
  
  
 draw.tree<-function(tree.frame)
 {
  
 all.nodes.drawn<-F
 current.node<-0
 tree.size<-dim(tree.frame)[1]
  
 drawn.nodes<-rep(0,tree.size)
 node.queue<-c()
 while(all.nodes.drawn==F) {
  
     draw.node(tree.frame,current.node)
     current.node.index <- which(tree.frame$node.label ==current.node)
     child.nodes.index  <- which(tree.frame$parent.node==current.node)
     if (length(child.nodes.index)==0) {
     } else {
         child.nodes <- tree.frame$node.label[child.nodes.index]
         node.queue  <- c(child.nodes,node.queue)
     }
     node.queue  <-    node.queue[ node.queue!=current.node]
  
     drawn.nodes[current.node.index] <-1
     if(sum(drawn.nodes)==tree.size) {
         all.nodes.drawn <- T
     }
     else current.node<-node.queue[1]
  
 }
 }
Personal tools
Project Infrastructures