From CVRG Wiki
make.fired.5050<-function(d)
{
d.fired<-d[d$fired==1,]
d.not.fired<-d[!d$fired,]
k.fired<-dim(d.fired)[1]
k.not.fired<-dim(d.not.fired)[1]
rat<-floor(k.not.fired/k.fired)
if (rat>0)
{
d.temp<-d.fired
if (rat>1)
{
for (i in seq(1,rat-1))
{
d.temp<-rbind(d.temp,d.fired)
}
}
}
else
{
print("error with replication: no firings for this dataset\n")
}
rbind(d.temp,d.not.fired)
}
#######################################################################################
#
# Function finds best split in the node given data for all the variables in list.vars
# returns a list that includes the name of the feature best for a split, a measure for the goodness of the split, and the best splitting value for that feature
# d.node is the subset of data this node contains.
# list.vars is the list of column numbers of the features
# node.number is the node label of the node that's being considered for a split
best.split<-function(d.node,list.vars,node.number)
{
d<-d.node
y<-d$fired
node.n<-dim(d)[1]
goodness.list<-list()
var.key.list<-list()
var.value.list<-list()
for (i in list.vars) {
var.index.vec<-c()
goodness.vec<-c()
var.value.vec<-c()
if (is.numeric(d[,i])) {
is.discrete<-F
samp.qt<-c()
node.not.na<-sum(!is.na(d[,i]))
if (node.not.na >= 10){
list.qt<-seq(0.2,0.8,by=0.3)
samp.qt<- quantile(d[,i],probs=list.qt,na.rm=T)
} else {
samp.qt<-mean(d[,i],na.rm=T)
}
var.value.vec<-samp.qt
for (j in samp.qt) {
goodness.val<-goodness.of.split(y,d[,i],j,is.discrete)
goodness.vec<-c(goodness.vec, goodness.val)
var.index.vec<-c(var.index.vec,i)
}
} else {
disc.var<-as.factor(d[,i])
is.discrete<-T
disc.levels<-levels(disc.var)
var.value.vec<-disc.levels
for (thres in disc.levels) {
goodness.val<-goodness.of.split(y, disc.var, thres, is.discrete)
goodness.vec<-c(goodness.vec, goodness.val)
var.index.vec<-c(var.index.vec,i)
}
}
goodness.list<-list(goodness.list,goodness.vec)
var.key.list<-list(var.key.list,var.index.vec)
var.value.list<-list(var.value.list, var.value.vec)
}
goodness.unlist <- unlist(goodness.list)
var.key.unlist <- unlist(var.key.list)
var.value.unlist <- unlist(var.value.list)
argmax.goodness<-which.max(goodness.unlist)
var.best.split<-var.key.unlist[argmax.goodness]
if (is.numeric(d[, var.best.split])) {
var.thres<-as.numeric(var.value.unlist[argmax.goodness])
} else {
var.thres<-var.value.unlist[argmax.goodness]
}
best.goodness.in.list <- goodness.unlist[argmax.goodness]
if (best.goodness.in.list>1e-06) {
return(list(best.goodness = best.goodness.in.list, var.split.index = var.best.split ,
var.split.value = var.thres ))
} else {
return(NULL)
}
}
#######################################################################################
#
# Initializes tree frame data structure
#
tree.init <- function(d)
{
node.depth<-c(0)
node.real.depth<-c(0)
node.split.var.name<-c("<leaf>")
node.split.var.value<-c(NA)
labels<-c(0)
node.dec.rule<-NA
node.parent.node<-NA
num.fired<-sum(d$fired==1)
num.nonfired<-sum(d$fired==0)
class.call <-c(num.nonfired <= num.fired )
tree.frame<-as.data.frame(list(node.label=labels, depth=node.depth, real.depth=node.real.depth,
split.var.name=node.split.var.name, split.var.value =node.split.var.value,
dec.rule=node.dec.rule, parent.node = node.parent.node, call=class.call, count.fired=num.fired,
count.nonfired=num.nonfired),
stringsAsFactors=F)
# Return the tree frame.
return(tree.frame)
}
##############################################################################################
#
# Grows a tree starting from the root node using variables with indices in $var.list$
# returns a list containing the data frame that stores the description of the tree ,
# and a vector that stores in which node each sample in d ends up.
# $d$ is the dataset
# $tree.frame$ is the initalized data frame that stores the description of the tree
# $var.list$ is the list of column numbers of the features
# $roc.thres.ratio$ is the minimum ratio of fireds to nonfireds at which a node is classified
# as fired
tree.grow<- function(d, tree.frame,tree.size, var.list,roc.thres.ratio)
{
n<-dim(d)[1]
where.samp<-rep(0,n)
stop.grow<-F
no.further.splits<-T
while(stop.grow==F) {
leaf.labels<-tree.frame$node.label[tree.frame$split.var.name=="<leaf>"]
no.further.splits<-T
split.using.variable <- 0
for (i in leaf.labels) {
if ( sum(where.samp==i)==0) next
d.node<-d[where.samp==i,]
split.prop<-best.split(d.node, var.list, i)
if(!is.null(split.prop)){
no.further.splits<-F
split.using.variable <- split.prop$var.split.index
if (!is.numeric(d[,split.using.variable])){
split.tree<-split.discrete(d,tree.frame, i, split.prop, where.samp,roc.thres.ratio)
tree.frame<-split.tree$frame
where.samp<-split.tree$where
} else {
split.tree<-split.cont(d,tree.frame, i, split.prop, where.samp,roc.thres.ratio)
tree.frame<-split.tree$frame
where.samp<-split.tree$where
}
}
}
leaf.tree<-tree.frame[tree.frame$split.var.name=="<leaf>",]
if ( (max(leaf.tree$depth)==2)||(no.further.splits)||(length(var.list)==0) ) {
stop.grow<-T
}
}
# If we have impurity in the NA/NA node of the tree
# that node is split again.
# if (0) {
leaf.tree<-tree.frame[tree.frame$split.var.name=="<leaf>",]
if (min(leaf.tree$real.depth)==0) {
all.na<-which.min(leaf.tree$real.depth)
i<-leaf.tree$node.label[all.na]
if ( sum(where.samp==i)!=0) {
d.node<-d[where.samp==i,]
split.prop<-best.split(d.node, var.list, i)
if(!is.null(split.prop)){
split.using.variable <- split.prop$var.split.index
if (!is.numeric(d[,split.using.variable])){
split.tree<-split.discrete(d,tree.frame, i, split.prop, where.samp,roc.thres.ratio)
tree.frame<-split.tree$frame
where.samp<-split.tree$where
} else {
split.tree<-split.cont(d,tree.frame, i, split.prop, where.samp,roc.thres.ratio)
tree.frame<-split.tree$frame
where.samp<-split.tree$where
}
}
}
}
# }
return(list(frame=tree.frame, where=where.samp) )
}
#
# Split a node using a discrete variable and update(and return) tree frame data structure
# returns the updated tree frame data structure and the updated $where.samp$ vector
# $d$ is the dataset
# $tree.frame$ is the current data frame that stores the description of the tree
# $parent.node.label$ is the label of node to be split
# $split.prop$ is a list that contains the name for the splitting variable and the value to be used for splitting
# $where.samp$ a vector that stores in which node each sample in d ends up.
# $roc.thres$ is the minimum ratio of fireds to nonfireds at which a node is classified as fired
split.discrete <- function(d,tree.frame,parent.node.label,split.prop,where.samp,roc.thres)
{
y<-d$fired
tree.size <- dim(tree.frame)[1]-1
var.index<-split.prop$var.split.index
split.value<-split.prop$var.split.value
varname<-names(d)[var.index]
parent.node.index<-which(tree.frame$node.label==parent.node.label)
left.node.indic <- as.character(d[,varname])==as.character(split.value)
right.node.indic <- !left.node.indic
na.node.indic <- is.na( d[,varname])
left.node.indic [na.node.indic]<-F
right.node.indic[na.node.indic]<-F
left.node.indic <- left.node.indic &(where.samp==parent.node.label)
right.node.indic <- right.node.indic &(where.samp==parent.node.label)
na.node.indic <- na.node.indic &(where.samp==parent.node.label)
where.samp[ left.node.indic ] <- tree.size+1
where.samp[ right.node.indic ] <- tree.size+2
where.samp[ na.node.indic ] <- tree.size+3
fired.count.l<-sum(y[left.node.indic]==1)
fired.count.r<-sum(y[right.node.indic]==1)
fired.count.na<-sum(y[na.node.indic]==1)
nonfired.count.l<-sum(y[left.node.indic]==0)
nonfired.count.r<-sum(y[right.node.indic]==0)
nonfired.count.na<-sum(y[na.node.indic]==0)
leaf.call.l <- (fired.count.l/(fired.count.l+nonfired.count.l))>roc.thres
leaf.call.r <- (fired.count.r/(fired.count.r+nonfired.count.r))>roc.thres
leaf.call.na <- (fired.count.na/(fired.count.na+nonfired.count.na))>roc.thres
new.node.labels <- c(tree.size+1,tree.size+2,tree.size+3)
current.depth<-tree.frame$depth[parent.node.index]
current.real.depth<-tree.frame$real.depth[parent.node.index]
new.depth <- rep(current.depth+1,3)
new.real.depth <- rep(c(current.real.depth+1,current.real.depth),c(2,1))
new.split.varname <- rep("<leaf>",3)
new.split.var.value <- rep(NA,3)
new.dec.rule <- c("==","!=",NA)
new.parent.node <- rep(parent.node.label,3)
new.call <- c(leaf.call.l, leaf.call.r, leaf.call.na)
new.fired.count <- c(fired.count.l,fired.count.r,fired.count.na)
new.nonfired.count <- c(nonfired.count.l,nonfired.count.r, nonfired.count.na)
new.frame<-as.data.frame(cbind(new.node.labels,new.depth,new.real.depth,
new.split.varname, new.split.var.value, new.dec.rule, new.parent.node,new.call,new.fired.count,new.nonfired.count ),
stringsAsFactors=F)
names(new.frame)<-c("node.label" , "depth", "real.depth", "split.var.name",
"split.var.value", "dec.rule", "parent.node","call","count.fired", "count.nonfired")
tree.frame<-rbind(tree.frame,new.frame)
tree.frame$node.label<-as.numeric(tree.frame$node.label)
tree.frame$depth<-as.numeric(tree.frame$depth)
tree.frame$real.depth<-as.numeric(tree.frame$real.depth)
tree.frame$parent.node<-as.numeric(tree.frame$parent.node)
tree.frame$count.fired<-as.numeric(tree.frame$count.fired)
tree.frame$count.nonfired<-as.numeric(tree.frame$count.nonfired)
tree.frame$split.var.name[parent.node.index]<-varname
tree.frame$split.var.value[parent.node.index]<-split.value
return(list(frame= tree.frame, where=where.samp))
}
#
# Split a node using a continous variable and update(and return) tree frame data structure
# returns the updated tree frame data structure and the updated $where.samp$ vector
# $d$ is the dataset
# $tree.frame$ is the current data frame that stores the description of the tree
# $parent.node.label$ is the label of node to be split
# $split.prop$ is a list that contains the name for the splitting variable and the value to be used for splitting
# $where.samp$ a vector that stores in which node each sample in d ends up.
# $roc.thres$ is the minimum ratio of fireds to nonfireds at which a node is classified as fired
split.cont <- function(d,tree.frame,parent.node.label,split.prop,where.samp,roc.thres)
{
y<-d$fired
tree.size <- dim(tree.frame)[1] - 1
var.index<-split.prop$var.split.index
split.value<-split.prop$var.split.value
varname<-names(d)[var.index]
parent.node.index<-which(tree.frame$node.label==parent.node.label)
left.node.indic <- d[,varname]<=split.value
right.node.indic <- !left.node.indic
na.node.indic <- is.na( d[,varname])
left.node.indic [na.node.indic]<-F
right.node.indic[na.node.indic]<-F
left.node.indic <- left.node.indic &(where.samp==parent.node.label)
right.node.indic <- right.node.indic &(where.samp==parent.node.label)
na.node.indic <- na.node.indic &(where.samp==parent.node.label)
where.samp[ left.node.indic ] <- tree.size+1
where.samp[ right.node.indic ] <- tree.size+2
where.samp[ na.node.indic ] <- tree.size+3
fired.count.l<-sum(y[left.node.indic]==1)
fired.count.r<-sum(y[right.node.indic]==1)
fired.count.na<-sum(y[na.node.indic]==1)
nonfired.count.l<-sum(y[left.node.indic]==0)
nonfired.count.r<-sum(y[right.node.indic]==0)
nonfired.count.na<-sum(y[na.node.indic]==0)
leaf.call.l <- (fired.count.l/(fired.count.l+nonfired.count.l))>roc.thres
leaf.call.r <- (fired.count.r/(fired.count.r+nonfired.count.r))>roc.thres
leaf.call.na <- (fired.count.na/(fired.count.na+nonfired.count.na))>roc.thres
new.node.labels <- c(tree.size+1,tree.size+2,tree.size+3)
current.depth<-tree.frame$depth[parent.node.index]
current.real.depth<-tree.frame$real.depth[parent.node.index]
new.depth <- rep(current.depth+1,3)
new.real.depth <- rep(c(current.real.depth+1,current.real.depth),c(2,1))
new.split.varname <- rep("<leaf>",3)
new.split.var.value <- rep(NA,3)
new.dec.rule <- c("<=",">",NA)
new.parent.node <- rep(parent.node.label,3)
new.call <- c(leaf.call.l, leaf.call.r, leaf.call.na)
new.fired.count <- c(fired.count.l,fired.count.r,fired.count.na)
new.nonfired.count <- c(nonfired.count.l,nonfired.count.r, nonfired.count.na)
new.frame<-as.data.frame(cbind(new.node.labels,new.depth,new.real.depth,
new.split.varname, new.split.var.value,new.dec.rule,new.parent.node,new.call,new.fired.count,new.nonfired.count),
stringsAsFactors=F)
names(new.frame)<-c("node.label" , "depth", "real.depth", "split.var.name",
"split.var.value", "dec.rule", "parent.node","call","count.fired", "count.nonfired")
tree.frame<-rbind(tree.frame,new.frame)
tree.frame$node.label<-as.numeric(tree.frame$node.label)
tree.frame$depth<-as.numeric(tree.frame$depth)
tree.frame$real.depth<-as.numeric(tree.frame$real.depth)
tree.frame$parent.node<-as.numeric(tree.frame$parent.node)
tree.frame$count.fired<-as.numeric(tree.frame$count.fired)
tree.frame$count.nonfired<-as.numeric(tree.frame$count.nonfired)
tree.frame$split.var.name[parent.node.index]<-varname
tree.frame$split.var.value[parent.node.index]<-split.value
return(list(frame= tree.frame, where=where.samp))
}
#
# Computes goodness(decrease in entropy) of a particular split based on the feature in split.var
# returns a goodness of split measure
# $y$ is the vector class labels
# $split.var$ is the feature vector(the splitting variable)
# $thres$ is the value of the splitting variable used for splitting the node
# $is.discrete$ is a flag that is 1 if the splitting variable is a discrete variable
goodness.of.split <- function (y,split.var,thres,is.discrete)
{
if ((sum(y==0)==0)||(sum(y==1)==0))
return(0)
na.node<-is.na(split.var)
num.sample<-length(y)
left.node<-rep(FALSE,num.sample)
right.node<-rep(FALSE,num.sample)
if (is.discrete) {
left.node<-(as.character(split.var)==as.character(thres))
left.node[na.node]<- F
right.node<-(as.character(split.var)!=as.character(thres))
right.node[na.node]<- F
} else {
left.node<- split.var<=thres
left.node[na.node]<- F
right.node<- split.var > thres
right.node[na.node]<- F
}
y.l<-y[left.node]
y.r<-y[right.node]
y.na<-y[na.node]
if ((length(y.l)==0)||(length(y.r)==0))
return(-5)
orig.imp <- impurity.at.node(sum(y==0) ,sum(y==1 ))
left.imp <- impurity.at.node(sum(y.l==0) ,sum(y.l==1 ))
right.imp <- impurity.at.node(sum(y.r==0) ,sum(y.r==1 ))
na.imp <- 0
if(length(y.na)!=0)
na.imp <- impurity.at.node(sum(y.na==0),sum(y.na==1))
wt.left<-length(y.l)
wt.right<-length(y.r)
wt.na<- length(y.na)
wt.left<- wt.left/ num.sample
wt.right<- wt.right/ num.sample
wt.na <- wt.na/ num.sample
imp.diff<- wt.left*left.imp+wt.right*right.imp+wt.na*na.imp-orig.imp
return(-1*imp.diff)
}
#
# Entropy at a node
#
impurity.at.node <- function (y.1,y.2)
{
tot.y<-y.1+y.2
impurity<-0
if (tot.y==0) {
print("Entropy Comp. Error")
impurity<- -1
} else {
p.1<-y.1/(tot.y)
p.2<-y.2/(tot.y)
impurity<-0
if ((p.1!=0)&(p.2!=0)) {
impurity <- -1*(p.1*log2(p.1)+p.2*log2(p.2))
} else {
impurity <- 0
}
}
return(impurity)
}
#impurity.at.node<- function(y.1,y.2)
#{
# tot.y<-y.1+y.2
# impurity<-0
# if (tot.y==0) {
# print("Entropy Comp. Error")
# impurity<- -1
# } else {
# impurity<- min(y.1,y.2)/tot.y
# }
# return(impurity)
#
#}
#impurity.at.node<- function(y.1,y.2)
#{
# tot.y<-y.1+y.2
# impurity<-0
# if (tot.y==0) {
# print("Entropy Comp. Error")
# impurity<- -1
# } else {
# impurity<- y.1*y.2/tot.y^2
# }
# return(impurity)
#
#}
predict.na.tree<-function(tree.frame, test.sample, numeric.or.factor)
{
at.leaf.node<-F
at.node<- 0
test.where <- 0
child.frame.index <- 0
while(at.leaf.node==F) {
node.frame.index<- which(tree.frame$node.label==at.node)
split.variable.name<-tree.frame$split.var.name[node.frame.index]
threshold.value<-tree.frame$split.var.value[node.frame.index]
test.value<-test.sample[split.variable.name]
if (numeric.or.factor[split.variable.name]) {
test.value <- as.numeric(test.value)
threshold.value <- as.numeric(threshold.value)
NA.child.node<-is.na(test.value)
if (!NA.child.node) {
left.node<- test.value<=threshold.value
if (left.node) {
child.frame.index <- which((tree.frame$parent.node==at.node) & (tree.frame$dec.rule=="<=") )
} else {
child.frame.index <- which((tree.frame$parent.node==at.node) & (tree.frame$dec.rule==">") )
}
} else {
child.frame.index <- which((tree.frame$parent.node==at.node) & (is.na(tree.frame$dec.rule) ) )
}
} else {
test.value <- as.character(test.value)
threshold.value <- as.character(threshold.value)
NA.child.node<-is.na(test.value)
if (!NA.child.node) {
left.node<- test.value==threshold.value
if (left.node) {
child.frame.index <- which((tree.frame$parent.node==at.node) & (tree.frame$dec.rule=="==") )
} else {
child.frame.index <- which((tree.frame$parent.node==at.node) & (tree.frame$dec.rule=="!=") )
}
} else {
child.frame.index <- which((tree.frame$parent.node==at.node) & (is.na(tree.frame$dec.rule) ) )
}
}
at.node<-tree.frame$node.label[child.frame.index]
at.leaf.node <- (tree.frame$split.var.name[child.frame.index] =="<leaf>")
if (at.leaf.node) {
test.where<-at.node
}
node.frame.index<- which(tree.frame$node.label==at.node)
}
class.call<-tree.frame$call[node.frame.index]
if (is.na(class.call)) {
print(paste("leaf.node.index",node.frame.index))
print(tree.frame)
class.call<-F
}
return(list(call=class.call,where=test.where))
}
draw.node<-function(tree.frame,node.drawn)
{
node.index<-which(tree.frame$node.label==node.drawn)
if (node.index==0) {
print("No such node exists")
return()
}
parent.node.label<-tree.frame$parent.node[node.index]
parent.node.index<-which(tree.frame$node.label==parent.node.label)
num.fired<-tree.frame$count.fired[node.index]
num.nonfired<-tree.frame$count.nonfired[node.index]
num.total<-num.fired+num.nonfired
node.depth<-tree.frame$depth[node.index]
node.str<-character()
if (node.depth>0) {
for (i in 1:node.depth) {
node.str<-paste(node.str,"\t")
}
}
node.str<-paste(node.str,"Node ",node.drawn,": ")
if (is.na(parent.node.label)) {
node.str<-paste(node.str, "Root node ", " Fired" ,num.fired, "Nonfired",num.nonfired, "Total", num.total)
} else if (!is.na(tree.frame$dec.rule[node.index])) {
node.str<-paste(node.str, tree.frame$split.var.name[parent.node.index],tree.frame$dec.rule[node.index],
tree.frame$split.var.value[parent.node.index],
" Fired" ,num.fired, "Nonfired",num.nonfired, "Total", num.total)
} else {
node.str<-paste(node.str, tree.frame$split.var.name[parent.node.index] ,"is.na",
" Fired" ,num.fired, "Nonfired",num.nonfired, "Total", num.total)
}
node.str<-paste(node.str,"\n")
cat(node.str)
}
draw.tree<-function(tree.frame)
{
all.nodes.drawn<-F
current.node<-0
tree.size<-dim(tree.frame)[1]
drawn.nodes<-rep(0,tree.size)
node.queue<-c()
while(all.nodes.drawn==F) {
draw.node(tree.frame,current.node)
current.node.index <- which(tree.frame$node.label ==current.node)
child.nodes.index <- which(tree.frame$parent.node==current.node)
if (length(child.nodes.index)==0) {
} else {
child.nodes <- tree.frame$node.label[child.nodes.index]
node.queue <- c(child.nodes,node.queue)
}
node.queue <- node.queue[ node.queue!=current.node]
drawn.nodes[current.node.index] <-1
if(sum(drawn.nodes)==tree.size) {
all.nodes.drawn <- T
}
else current.node<-node.queue[1]
}
}