From CVRG Wiki
make.fired.5050<-function(d)
{
# Splite d into two half. One is for fired data. The other is for not fired data.
d.fired<-d[d$fired==1,]
d.not.fired<-d[!d$fired,]
# Calculate how many fired data observations.
k.fired<-dim(d.fired)[1]
# Calculate how many not fired data observations.
k.not.fired<-dim(d.not.fired)[1]
# Get the ratio between number of fired observations and not fired observations (and
# round down to the largest integer).
rat<-floor(k.not.fired/k.fired)
if (rat>0)
{
# If ratio is larger than 0, store d.fired to d.temp.
d.temp<-d.fired
if (rat>1)
{
# If ratio is larger than 1, we replicate attach d.fired to the end of d.temp
# for rat-1 times. (ratio being larger than 1 means number of fired are at least
# twice as many as not number of fired.)
for (i in seq(1,rat-1))
{
d.temp<-rbind(d.temp,d.fired)
}
}
}
else
{
# If ratio is smaller than or equal to 0, give a error message.
print("error with replication: no firings for this dataset\n")
}
# Conbind d.temp and d.not fired together.
rbind(d.temp,d.not.fired)
}
#######################################################################################
#
# Function finds best split in the node given data for all the variables in list.vars
# returns a list that includes the name of the feature best for a split, a measure for the goodness of the split, and the best splitting value for that feature
# d.node is the subset of data this node contains.
# list.vars is the list of column numbers of the features
# node.number is the node label of the node that's being considered for a split
best.split<-function(d.node,list.vars,node.number)
{
# Assign d.node to d.
d<-d.node
# Assign "fired" column of data to y.
y<-d$fired
# Assign node.n the number of observations in total.
node.n<-dim(d)[1]
# print(paste( "dimension of d.node",dim(d)))
# Create empty list fo goodness.list, var.key.list and var.value.list.
goodness.list<-list()
var.key.list<-list()
var.value.list<-list()
#For each variable and each threshold(all levels for discrete
#and limited number of thresholds for continous) compute the goodness
#and return maximum
#var.key.list and var.key stores which variable the split is for
#goodness.list and goodness.vec stores the value of goodness for that split
#var.value holds the splitting value(threshold for continous, or equal/notequal check
#for continous
#For discrete variables only partition of (single levels vs all others) are checked, l checks for l levels
#instead of 2^(l-1)-1 checks(all possible partitions of levels).
#This only matters if there are more than 3 levels
# For every variable in the list.vars, we run the following program to get an optimal split.
for (i in list.vars) {
# Create empty list for variable name, goodness of fit and threshold.
var.index.vec<-c()
goodness.vec<-c()
var.value.vec<-c()
# If the variable is a numeric
if (is.numeric(d[,i])) {
# Assign discrete indicator to FALSE, which is used later in goodness.of.split()
is.discrete<-F
# Create empty list for sample quantiles.
samp.qt<-c()
# Get the number of not NA values for the variable.
node.not.na<-sum(!is.na(d[,i]))
# Choose number of possible thresholds
# if number of samples in node is not large enough, try all the possible thresholds
# If number of not NA values is larger than or equal to 10
if (node.not.na >= 10){
# Create a list of percentages of (0.2 0.5 0.8)
list.qt<-seq(0.2,0.8,by=0.3)
# Get corresponding quantiles in the variable.
samp.qt<- quantile(d[,i],probs=list.qt,na.rm=T)
} else {
# If the number of not NA values is less than 10
# We only test one threshold which is the mean.
samp.qt<-mean(d[,i],na.rm=T)
}
# Assign samp.qt to var.value.vec
var.value.vec<-samp.qt
# For every quantiles in samp.qt, we run the loop to generate lists of
# goodness of fit and variable names.
for (j in samp.qt) {
# Get the goodness of fit for every quantiles.
goodness.val<-goodness.of.split(y,d[,i],j,is.discrete)
# Put goodness of fit into a vector.
goodness.vec<-c(goodness.vec, goodness.val)
# Put variable index into a vector.
var.index.vec<-c(var.index.vec,i)
# var.value.vec<-c( var.value.vec, as.character(j))
}
} else {
# If the variable d[,i] is discrete, then run the following program.
# Convert the variable into factor and store it to "disc.var".
disc.var<-as.factor(d[,i])
# Assign discrete indicator to TRUE, which is used later in goodness.of.split()
is.discrete<-T
# List all the unique values of disc.var.
disc.levels<-levels(disc.var)
# Put all the unique values into a vector.
var.value.vec<-disc.levels
# For every value in disc.level, we run the loop to generate lists of
# goodness of fit and variable names. (The same as numeric varialbe case)
for (thres in disc.levels) {
goodness.val<-goodness.of.split(y, disc.var, thres, is.discrete)
goodness.vec<-c(goodness.vec, goodness.val)
var.index.vec<-c(var.index.vec,i)
}
}
# Append to list of goodness values
# var.key.list hold which variable the goodness value is for
goodness.list<-list(goodness.list,goodness.vec)
var.key.list<-list(var.key.list,var.index.vec)
var.value.list<-list(var.value.list, var.value.vec)
}
# Unlist the goodness.list, var,key.list and var.value.list so that they become
# three vectors, instead of three lists.
goodness.unlist <- unlist(goodness.list)
var.key.unlist <- unlist(var.key.list)
var.value.unlist <- unlist(var.value.list)
# Look for the maximum in the vector of "goodness.unlist", and find the corresponding
# entry in vector of "var.key.unlist".
argmax.goodness<-which.max(goodness.unlist)
var.best.split<-var.key.unlist[argmax.goodness]
# If the varialbe we just found is numeric, store the threshold as a numeric value to
# "var.thres", otherwise, store it as it is.
if (is.numeric(d[, var.best.split])) {
var.thres<-as.numeric(var.value.unlist[argmax.goodness])
} else {
var.thres<-var.value.unlist[argmax.goodness]
}
# Record the maximum value in the vector of "goodness.unlist".
best.goodness.in.list <- goodness.unlist[argmax.goodness]
# If there is no split that decreases impurity in the node
# return NULL, otherwise, return the properties of the best split, which includes
# maximum value of goodness of fit, best variable index, the corresponding threshold
# value.
if (best.goodness.in.list>1e-06) {
return(list(best.goodness = best.goodness.in.list, var.split.index = var.best.split ,
var.split.value = var.thres ))
} else {
return(NULL)
}
}
#######################################################################################
#
# Initializes tree frame data structure
#
tree.init <- function(d)
{
# Create basic variables for the tree frame
# And initiate them with value 0, NA or empty vector.
# For split variable name, assign it "<leaf>"s for now.
node.depth<-c(0)
node.real.depth<-c(0)
node.split.var.name<-c("<leaf>")
node.split.var.value<-c(NA)
labels<-c(0)
node.dec.rule<-NA
node.parent.node<-NA
# Record number of observations with firing and not firings.
num.fired<-sum(d$fired==1)
num.nonfired<-sum(d$fired==0)
# If number of firing is smaller than nu.fired, assign TRUE to class.call, otherwise FALSE.
class.call <-c(num.nonfired <= num.fired )
# Create tree frame as data.frame object with fields of node label, depth, real depth,
# split variable name, split variable value, decisioni rule, parent node, call, number of firings
# and number of not firings.
tree.frame<-as.data.frame(list(node.label=labels, depth=node.depth, real.depth=node.real.depth,
split.var.name=node.split.var.name, split.var.value =node.split.var.value,
dec.rule=node.dec.rule, parent.node = node.parent.node, call=class.call, count.fired=num.fired,
count.nonfired=num.nonfired),
stringsAsFactors=F)
# Return the tree frame.
return(tree.frame)
}
##############################################################################################
#
# Grows a tree starting from the root node using variables with indices in $var.list$
# returns a list containing the data frame that stores the description of the tree ,
# and a vector that stores in which node each sample in d ends up.
# $d$ is the dataset
# $tree.frame$ is the initalized data frame that stores the description of the tree
# $var.list$ is the list of column numbers of the features
# $roc.thres.ratio$ is the minimum ratio of fireds to nonfireds at which a node is classified
# as fired
tree.grow<- function(d, tree.frame,tree.size, var.list,roc.thres.ratio)
{
# Assign n as the number of observations in the dataset.
n<-dim(d)[1]
# Assign a vector of 0s to where.samp.
where.samp<-rep(0,n)
# Assign FALSE to stop.grow indicator.
stop.grow<-F
# Assign TRUE to no.further.splits indicator.
no.further.splits<-T
while(stop.grow==F) {
# Get labels of all the leaves.
leaf.labels<-tree.frame$node.label[tree.frame$split.var.name=="<leaf>"]
# print(leaf.labels)
# Assign TRUE to no.further.splits indicator.
no.further.splits<-T
# Assign 0 to no.further.splits indicator.
split.using.variable <- 0
# For every leaf, we run the following procedure to grow the tree.
for (i in leaf.labels) {
# if node has no samples skip the current loop.
if ( sum(where.samp==i)==0) next
# Assign d.node the dataset at node i.
d.node<-d[where.samp==i,]
# print(dim(d.node))
# Call best.split() and get the best split variable and threshold for node i.
split.prop<-best.split(d.node, var.list, i)
# print(split.prop)
# print(paste("i is ",i))
# If split.prop returns a variable instead of Null, then split the tree and
# add more rows in the tree frame table.
if(!is.null(split.prop)){
# Assign FALSE to no.further.splits
no.further.splits<-F
# Get the split variable
split.using.variable <- split.prop$var.split.index
# If the split variable is numeric, use split.cont(), otherwise, use split.discrete()
# to add more rows in tree frame and to change where.samp.
if (!is.numeric(d[,split.using.variable])){
# Call function split.discrete()
split.tree<-split.discrete(d,tree.frame, i, split.prop, where.samp,roc.thres.ratio)
tree.frame<-split.tree$frame
where.samp<-split.tree$where
} else {
# Call function split.discrete()
split.tree<-split.cont(d,tree.frame, i, split.prop, where.samp,roc.thres.ratio)
tree.frame<-split.tree$frame
where.samp<-split.tree$where
}
}
# print(where.samp)
# draw.tree(tree.frame)
}
# extract "leaf" part of tree frame and assign it leaf.tree.
leaf.tree<-tree.frame[tree.frame$split.var.name=="<leaf>",]
# print(tree.frame)
# print(min(leaf.tree$depth))
# If the NA tree reaches the depth of 2, or no.further.splits is TRUE, or there
# is no variables in var.list, then stop growing the tree (i.e. stop grow<-T).
if ( (max(leaf.tree$depth)==2)||(no.further.splits)||(length(var.list)==0) ) {
stop.grow<-T
}
# print(paste("No Further Splits",no.further.splits) )
}
# If we have impurity in the NA/NA node of the tree
# that node is split again.
# if (0) {
# extract "leaf" part of tree frame and assign it leaf.tree.
leaf.tree<-tree.frame[tree.frame$split.var.name=="<leaf>",]
# If the minimium of real depth is 0, we assign all.na with locations of real.depth equals to 0.
if (min(leaf.tree$real.depth)==0) {
all.na<-which.min(leaf.tree$real.depth)
# Extract the labels of 0 real depth part of leaf.tree.
i<-leaf.tree$node.label[all.na]
if ( sum(where.samp==i)!=0) {
d.node<-d[where.samp==i,]
# print(dim(d.node))
split.prop<-best.split(d.node, var.list, i)
# print(split.prop)
# print(paste("i is ",i))
if(!is.null(split.prop)){
split.using.variable <- split.prop$var.split.index
if (!is.numeric(d[,split.using.variable])){
split.tree<-split.discrete(d,tree.frame, i, split.prop, where.samp,roc.thres.ratio)
tree.frame<-split.tree$frame
where.samp<-split.tree$where
} else {
split.tree<-split.cont(d,tree.frame, i, split.prop, where.samp,roc.thres.ratio)
tree.frame<-split.tree$frame
where.samp<-split.tree$where
}
}
}
}
# }
return(list(frame=tree.frame, where=where.samp) )
}
#
# Split a node using a discrete variable and update(and return) tree frame data structure
# returns the updated tree frame data structure and the updated $where.samp$ vector
# $d$ is the dataset
# $tree.frame$ is the current data frame that stores the description of the tree
# $parent.node.label$ is the label of node to be split
# $split.prop$ is a list that contains the name for the splitting variable and the value to be used for splitting
# $where.samp$ a vector that stores in which node each sample in d ends up.
# $roc.thres$ is the minimum ratio of fireds to nonfireds at which a node is classified as fired
split.discrete <- function(d,tree.frame,parent.node.label,split.prop,where.samp,roc.thres)
{
y<-d$fired
tree.size <- dim(tree.frame)[1]-1
var.index<-split.prop$var.split.index
split.value<-split.prop$var.split.value
varname<-names(d)[var.index]
parent.node.index<-which(tree.frame$node.label==parent.node.label)
#
# Create three new child nodes
#
left.node.indic <- as.character(d[,varname])==as.character(split.value)
right.node.indic <- !left.node.indic
na.node.indic <- is.na( d[,varname])
left.node.indic [na.node.indic]<-F
right.node.indic[na.node.indic]<-F
left.node.indic <- left.node.indic &(where.samp==parent.node.label)
right.node.indic <- right.node.indic &(where.samp==parent.node.label)
na.node.indic <- na.node.indic &(where.samp==parent.node.label)
where.samp[ left.node.indic ] <- tree.size+1
where.samp[ right.node.indic ] <- tree.size+2
where.samp[ na.node.indic ] <- tree.size+3
#print(paste("parent.node ", parent.node.label) )
#print(paste("parent.node.label is numeric ", is.numeric(parent.node.label)) )
#print(paste("parent.node row ", parent.node.label+1) )
#print(tree.frame)
#print(paste("current depth is ",tree.frame$depth))
#print(paste("current depth is ",tree.frame[parent.node.label+1,2]))
fired.count.l<-sum(y[left.node.indic]==1)
fired.count.r<-sum(y[right.node.indic]==1)
fired.count.na<-sum(y[na.node.indic]==1)
nonfired.count.l<-sum(y[left.node.indic]==0)
nonfired.count.r<-sum(y[right.node.indic]==0)
nonfired.count.na<-sum(y[na.node.indic]==0)
leaf.call.l <- (fired.count.l/(fired.count.l+nonfired.count.l))>roc.thres
leaf.call.r <- (fired.count.r/(fired.count.r+nonfired.count.r))>roc.thres
leaf.call.na <- (fired.count.na/(fired.count.na+nonfired.count.na))>roc.thres
new.node.labels <- c(tree.size+1,tree.size+2,tree.size+3)
current.depth<-tree.frame$depth[parent.node.index]
current.real.depth<-tree.frame$real.depth[parent.node.index]
new.depth <- rep(current.depth+1,3)
new.real.depth <- rep(c(current.real.depth+1,current.real.depth),c(2,1))
new.split.varname <- rep("<leaf>",3)
new.split.var.value <- rep(NA,3)
new.dec.rule <- c("==","!=",NA)
new.parent.node <- rep(parent.node.label,3)
new.call <- c(leaf.call.l, leaf.call.r, leaf.call.na)
new.fired.count <- c(fired.count.l,fired.count.r,fired.count.na)
new.nonfired.count <- c(nonfired.count.l,nonfired.count.r, nonfired.count.na)
#
# add the new nodes to the tree.frame data frame
#
new.frame<-as.data.frame(cbind(new.node.labels,new.depth,new.real.depth,
new.split.varname, new.split.var.value, new.dec.rule, new.parent.node,new.call,new.fired.count,new.nonfired.count ),
stringsAsFactors=F)
names(new.frame)<-c("node.label" , "depth", "real.depth", "split.var.name",
"split.var.value", "dec.rule", "parent.node","call","count.fired", "count.nonfired")
#print(new.frame)
tree.frame<-rbind(tree.frame,new.frame)
tree.frame$node.label<-as.numeric(tree.frame$node.label)
tree.frame$depth<-as.numeric(tree.frame$depth)
tree.frame$real.depth<-as.numeric(tree.frame$real.depth)
tree.frame$parent.node<-as.numeric(tree.frame$parent.node)
tree.frame$count.fired<-as.numeric(tree.frame$count.fired)
tree.frame$count.nonfired<-as.numeric(tree.frame$count.nonfired)
#
# update parent node
#
tree.frame$split.var.name[parent.node.index]<-varname
tree.frame$split.var.value[parent.node.index]<-split.value
return(list(frame= tree.frame, where=where.samp))
}
#
# Split a node using a continous variable and update(and return) tree frame data structure
# returns the updated tree frame data structure and the updated $where.samp$ vector
# $d$ is the dataset
# $tree.frame$ is the current data frame that stores the description of the tree
# $parent.node.label$ is the label of node to be split
# $split.prop$ is a list that contains the name for the splitting variable and the value to be used for splitting
# $where.samp$ a vector that stores in which node each sample in d ends up.
# $roc.thres$ is the minimum ratio of fireds to nonfireds at which a node is classified as fired
split.cont <- function(d,tree.frame,parent.node.label,split.prop,where.samp,roc.thres)
{
y<-d$fired
# print(split.prop)
tree.size <- dim(tree.frame)[1] - 1
var.index<-split.prop$var.split.index
split.value<-split.prop$var.split.value
varname<-names(d)[var.index]
parent.node.index<-which(tree.frame$node.label==parent.node.label)
# print(paste("is.numeric.value ",is.numeric(split.value)))
#
# Create three new child nodes
#
left.node.indic <- d[,varname]<=split.value
right.node.indic <- !left.node.indic
na.node.indic <- is.na( d[,varname])
left.node.indic [na.node.indic]<-F
right.node.indic[na.node.indic]<-F
left.node.indic <- left.node.indic &(where.samp==parent.node.label)
right.node.indic <- right.node.indic &(where.samp==parent.node.label)
na.node.indic <- na.node.indic &(where.samp==parent.node.label)
where.samp[ left.node.indic ] <- tree.size+1
where.samp[ right.node.indic ] <- tree.size+2
where.samp[ na.node.indic ] <- tree.size+3
fired.count.l<-sum(y[left.node.indic]==1)
fired.count.r<-sum(y[right.node.indic]==1)
fired.count.na<-sum(y[na.node.indic]==1)
nonfired.count.l<-sum(y[left.node.indic]==0)
nonfired.count.r<-sum(y[right.node.indic]==0)
nonfired.count.na<-sum(y[na.node.indic]==0)
leaf.call.l <- (fired.count.l/(fired.count.l+nonfired.count.l))>roc.thres
leaf.call.r <- (fired.count.r/(fired.count.r+nonfired.count.r))>roc.thres
leaf.call.na <- (fired.count.na/(fired.count.na+nonfired.count.na))>roc.thres
new.node.labels <- c(tree.size+1,tree.size+2,tree.size+3)
current.depth<-tree.frame$depth[parent.node.index]
current.real.depth<-tree.frame$real.depth[parent.node.index]
new.depth <- rep(current.depth+1,3)
new.real.depth <- rep(c(current.real.depth+1,current.real.depth),c(2,1))
new.split.varname <- rep("<leaf>",3)
new.split.var.value <- rep(NA,3)
new.dec.rule <- c("<=",">",NA)
new.parent.node <- rep(parent.node.label,3)
new.call <- c(leaf.call.l, leaf.call.r, leaf.call.na)
new.fired.count <- c(fired.count.l,fired.count.r,fired.count.na)
new.nonfired.count <- c(nonfired.count.l,nonfired.count.r, nonfired.count.na)
new.frame<-as.data.frame(cbind(new.node.labels,new.depth,new.real.depth,
new.split.varname, new.split.var.value,new.dec.rule,new.parent.node,new.call,new.fired.count,new.nonfired.count),
stringsAsFactors=F)
names(new.frame)<-c("node.label" , "depth", "real.depth", "split.var.name",
"split.var.value", "dec.rule", "parent.node","call","count.fired", "count.nonfired")
tree.frame<-rbind(tree.frame,new.frame)
tree.frame$node.label<-as.numeric(tree.frame$node.label)
tree.frame$depth<-as.numeric(tree.frame$depth)
tree.frame$real.depth<-as.numeric(tree.frame$real.depth)
tree.frame$parent.node<-as.numeric(tree.frame$parent.node)
tree.frame$count.fired<-as.numeric(tree.frame$count.fired)
tree.frame$count.nonfired<-as.numeric(tree.frame$count.nonfired)
#
# update parent node
#
tree.frame$split.var.name[parent.node.index]<-varname
tree.frame$split.var.value[parent.node.index]<-split.value
# print(paste("update parent.node", varname," ",split.value) )
return(list(frame= tree.frame, where=where.samp))
}
#
# Computes goodness(decrease in entropy) of a particular split based on the feature in split.var
# returns a goodness of split measure
# $y$ is the vector class labels
# $split.var$ is the feature vector(the splitting variable)
# $thres$ is the value of the splitting variable used for splitting the node
# $is.discrete$ is a flag that is 1 if the splitting variable is a discrete variable
goodness.of.split <- function (y,split.var,thres,is.discrete)
{
# If a split is not necessary, return 0
if ((sum(y==0)==0)||(sum(y==1)==0))
return(0)
na.node<-is.na(split.var)
num.sample<-length(y)
left.node<-rep(FALSE,num.sample)
right.node<-rep(FALSE,num.sample)
# print(num.sample)
#print(paste("threshold",thres))
if (is.discrete) {
left.node<-(as.character(split.var)==as.character(thres))
left.node[na.node]<- F
right.node<-(as.character(split.var)!=as.character(thres))
right.node[na.node]<- F
} else {
left.node<- split.var<=thres
left.node[na.node]<- F
right.node<- split.var > thres
right.node[na.node]<- F
}
y.l<-y[left.node]
y.r<-y[right.node]
y.na<-y[na.node]
# print(paste("class 0, class 1",sum(y==0) ,sum(y==1 )))
# print(paste("left.node size",length(y.l),sum(y.l==0) ,sum(y.l==1 )))
# print(paste("right.node size",length(y.r),sum(y.r==0) ,sum(y.r==1 )))
# print(paste("na.node size",length(y.na),sum(y.na==0) ,sum(y.na==1 )))
if ((length(y.l)==0)||(length(y.r)==0))
return(-5)
orig.imp <- impurity.at.node(sum(y==0) ,sum(y==1 ))
left.imp <- impurity.at.node(sum(y.l==0) ,sum(y.l==1 ))
right.imp <- impurity.at.node(sum(y.r==0) ,sum(y.r==1 ))
na.imp <- 0
if(length(y.na)!=0)
na.imp <- impurity.at.node(sum(y.na==0),sum(y.na==1))
wt.left<-length(y.l)
wt.right<-length(y.r)
wt.na<- length(y.na)
wt.left<- wt.left/ num.sample
wt.right<- wt.right/ num.sample
wt.na <- wt.na/ num.sample
imp.diff<- wt.left*left.imp+wt.right*right.imp+wt.na*na.imp-orig.imp
# print(paste("imp.diff",imp.diff))
return(-1*imp.diff)
}
#
# Entropy at a node
#
impurity.at.node <- function (y.1,y.2)
{
tot.y<-y.1+y.2
impurity<-0
if (tot.y==0) {
print("Entropy Comp. Error")
impurity<- -1
} else {
p.1<-y.1/(tot.y)
p.2<-y.2/(tot.y)
impurity<-0
if ((p.1!=0)&(p.2!=0)) {
impurity <- -1*(p.1*log2(p.1)+p.2*log2(p.2))
} else {
impurity <- 0
}
}
return(impurity)
}
#impurity.at.node<- function(y.1,y.2)
#{
# tot.y<-y.1+y.2
# impurity<-0
# if (tot.y==0) {
# print("Entropy Comp. Error")
# impurity<- -1
# } else {
# impurity<- min(y.1,y.2)/tot.y
# }
# return(impurity)
#
#}
#impurity.at.node<- function(y.1,y.2)
#{
# tot.y<-y.1+y.2
# impurity<-0
# if (tot.y==0) {
# print("Entropy Comp. Error")
# impurity<- -1
# } else {
# impurity<- y.1*y.2/tot.y^2
# }
# return(impurity)
#
#}
predict.na.tree<-function(tree.frame, test.sample, numeric.or.factor)
{
at.leaf.node<-F
at.node<- 0
test.where <- 0
child.frame.index <- 0
while(at.leaf.node==F) {
node.frame.index<- which(tree.frame$node.label==at.node)
# print(paste("node.frame.index",node.frame.index))
split.variable.name<-tree.frame$split.var.name[node.frame.index]
# print(paste("split.variable.name",split.variable.name))
# print(paste("numeric.or.factor",numeric.or.factor[split.variable.name]))
threshold.value<-tree.frame$split.var.value[node.frame.index]
# print(split.variable.name)
test.value<-test.sample[split.variable.name]
# print(numeric.or.factor)
# print(split.variable.name)
if (numeric.or.factor[split.variable.name]) {
#
# the splitting variable is continous
#
test.value <- as.numeric(test.value)
threshold.value <- as.numeric(threshold.value)
NA.child.node<-is.na(test.value)
if (!NA.child.node) {
left.node<- test.value<=threshold.value
if (left.node) {
child.frame.index <- which((tree.frame$parent.node==at.node) & (tree.frame$dec.rule=="<=") )
} else {
child.frame.index <- which((tree.frame$parent.node==at.node) & (tree.frame$dec.rule==">") )
}
} else {
child.frame.index <- which((tree.frame$parent.node==at.node) & (is.na(tree.frame$dec.rule) ) )
}
} else {
#
# if the splitting variable is discrete
#
test.value <- as.character(test.value)
threshold.value <- as.character(threshold.value)
NA.child.node<-is.na(test.value)
if (!NA.child.node) {
left.node<- test.value==threshold.value
if (left.node) {
child.frame.index <- which((tree.frame$parent.node==at.node) & (tree.frame$dec.rule=="==") )
} else {
child.frame.index <- which((tree.frame$parent.node==at.node) & (tree.frame$dec.rule=="!=") )
}
} else {
child.frame.index <- which((tree.frame$parent.node==at.node) & (is.na(tree.frame$dec.rule) ) )
}
}
#Child node becomes the current node
at.node<-tree.frame$node.label[child.frame.index]
#Check if this current node is a leaf.
at.leaf.node <- (tree.frame$split.var.name[child.frame.index] =="<leaf>")
# print(paste("child.frame.index",child.frame.index))
#print(paste("at.node",at.node))
#print(paste("at.leaf.node",at.leaf.node))
if (at.leaf.node) {
test.where<-at.node
}
#set the index to the row whose node.label is the the current node
node.frame.index<- which(tree.frame$node.label==at.node)
}
class.call<-tree.frame$call[node.frame.index]
if (is.na(class.call)) {
print(paste("leaf.node.index",node.frame.index))
print(tree.frame)
class.call<-F
}
return(list(call=class.call,where=test.where))
}
draw.node<-function(tree.frame,node.drawn)
{
node.index<-which(tree.frame$node.label==node.drawn)
if (node.index==0) {
print("No such node exists")
return()
}
parent.node.label<-tree.frame$parent.node[node.index]
parent.node.index<-which(tree.frame$node.label==parent.node.label)
num.fired<-tree.frame$count.fired[node.index]
num.nonfired<-tree.frame$count.nonfired[node.index]
num.total<-num.fired+num.nonfired
node.depth<-tree.frame$depth[node.index]
node.str<-character()
if (node.depth>0) {
for (i in 1:node.depth) {
node.str<-paste(node.str,"\t")
}
}
node.str<-paste(node.str,"Node ",node.drawn,": ")
if (is.na(parent.node.label)) {
node.str<-paste(node.str, "Root node ", " Fired" ,num.fired, "Nonfired",num.nonfired, "Total", num.total)
} else if (!is.na(tree.frame$dec.rule[node.index])) {
node.str<-paste(node.str, tree.frame$split.var.name[parent.node.index],tree.frame$dec.rule[node.index],
tree.frame$split.var.value[parent.node.index],
" Fired" ,num.fired, "Nonfired",num.nonfired, "Total", num.total)
} else {
node.str<-paste(node.str, tree.frame$split.var.name[parent.node.index] ,"is.na",
" Fired" ,num.fired, "Nonfired",num.nonfired, "Total", num.total)
}
node.str<-paste(node.str,"\n")
cat(node.str)
}
draw.tree<-function(tree.frame)
{
all.nodes.drawn<-F
current.node<-0
tree.size<-dim(tree.frame)[1]
drawn.nodes<-rep(0,tree.size)
node.queue<-c()
while(all.nodes.drawn==F) {
draw.node(tree.frame,current.node)
current.node.index <- which(tree.frame$node.label ==current.node)
child.nodes.index <- which(tree.frame$parent.node==current.node)
if (length(child.nodes.index)==0) {
} else {
child.nodes <- tree.frame$node.label[child.nodes.index]
node.queue <- c(child.nodes,node.queue)
}
node.queue <- node.queue[ node.queue!=current.node]
drawn.nodes[current.node.index] <-1
if(sum(drawn.nodes)==tree.size) {
all.nodes.drawn <- T
}
else current.node<-node.queue[1]
}
}