QuadrantTreeClassifierRCode
From CVRG Wiki
Contents |
R Code Branch of Quadrant Tree Classifier Doc
Document Information
Package base version 2.8.0
1/5/2009
Project 5: Statistical Learning with Multi-Scale Cardiovascular Data
Contact email: yqin@jhu.edu
CardioVascular Research Grid
Johns Hopkins University
Commented Quadrant Tree Classifier Code
Download zipped file of commented code
library(survival)
#
# Given vector of event times and fired indicator (0=censored,1=fired),
# and a time threshold, compute an estimate of the probability of firing
# before that time threshold. We use linear interpolation of output from the
# survival package's kaplan-meier calculation. Generally, we err on the side
# of making the probability greater than needed to be careful.
#
survival.prob<-function(time,fired,time.threshold)
{
# If no data, then returen 0 so err on the side of making everyone fire.
if (length(time)==0)
{
return(0)
}
# Fit a survival model with Kaplan-Meier method.
S<-survfit(Surv(time,fired),type="kaplan-meier")
# Assign number of observatoins to ntimes.
ntimes<-length(S$time)
# If there is just one observation or the smallest time is larger than the
# threshold, return 0.
if ((ntimes==1)||(min(S$time)>=time.threshold))
{
return(0)
}
# If the largest time is smaller than the threshold, return the largest survival
# probability generated by the model.
if (max(S$time)<=time.threshold)
{
return(S$surv[length(S$surv)])
}
# If not either of the two situation mentioned above, then do linear interpolation
# Find the index of observation with the largest time that is smaller than threshold
index1<-max(which(S$time<time.threshold))
# Find the index of observation with the smallest time that is larger than threshold
# i.e. index1+1
index2<-index1+1
# Calculate the probability of surviving after time index1.
p1<-S$surv[index1]
# Calculate the probability of surviving after time index2.
p2<-S$surv[index2]
# Linear interpolation
lambda<-(time.threshold-S$time[index1])/(S$time[index2]-S$time[index1])
p<-p1+lambda*(p2-p1)
return(p)
}
build.quadrant.classifier<-function(x,y,implanted,days.to.firing,tx.list,ty.list,sens,time.threshold)
{
#
#
#
# Inputs:
#
# x = continuous feature variable vector
# y = continuous feature variable vector
# implanted = days of implanted
# days to firing = days to firing
#
# tx = list of x thresholds
# ty = list of y thresholds
# sens = desired sensitivity
# time.threshold = time defining the "fired within" phenotype
#
#Output:
#
# an assignment of probabilities to the cells:
#
# cell 1: x<tx, y<ty
# cell 2: x<tx, y>ty
# cell 3: x>tx, y<ty
# cell 4: x>tx, y>ty
#
#
# This function uses the R "survival" package to compute
# Kaplan-Meier probabilities
#
# Create a time variable and an indicator of fired.
#
# Assign lx the value of the length of x.
lx<-length(x)
# Assign fired a array of 0s with length the same as lx.
fired<-rep(0,lx)
# Assign time a array of 0s with length the same as lx.
time<-rep(0,lx)
# Remove all the NAs and convert data for survival analysis.
for (i in seq(1,lx))
{
# For each i, i.e. each observation:
# If days.to.firing is NA, then assign both fired[i] and time[i] to 0.
# Otherwise, if days.to firing is smaller than time.threshold, then assign
# fired[i]
if (is.na(days.to.firing[i]))
{
fired[i]<-0
time[i]<-implanted[i]
} else
# Otherwise, if days.to firing is smaller than time.threshold, then assign
# fired[i] 1, and copy days.to.firing[i] to time[i]. And otherwise, assign
# fired[i] 0, and copy implanted[i] to time[i].
{
if (days.to.firing[i]<time.threshold)
{
fired[i]<-1
time[i]<-days.to.firing[i]
}else
{
fired[i]<-0
time[i]<-implanted[i]
}
}
}
#
# Get rid of missing data
#
# Combine x, y, time, and fired to a data frame named d.temp
d.temp<-data.frame(x=x,y=y,time=time,fired=fired)
# Remove all the observations with missing data or NAs.
d.temp<-d.temp[complete.cases(d.temp),]
#
# Creat a list with spec=0
best.L<-list(spec=0)
# Creat a loop to try every possible combination of tx and ty to get a optimal
# pair of tx ty with maxium specificity.
for (tx in tx.list)
{
for (ty in ty.list)
{
#
# Break up the data into the four cells based on tx and ty.
#
data<-list()
data[[1]]<-d.temp[(d.temp$x<tx)&(d.temp$y<ty),]
data[[2]]<-d.temp[(d.temp$x<tx)&(d.temp$y>ty),]
data[[3]]<-d.temp[(d.temp$x>tx)&(d.temp$y<ty),]
data[[4]]<-d.temp[(d.temp$x>tx)&(d.temp$y>ty),]
#
#
# Compute prior probabilities of the cells.
#
p.cells<-c(
dim(data[[1]])[1],
dim(data[[2]])[1],
dim(data[[3]])[1],
dim(data[[4]])[1])
p.cells<-p.cells/sum(p.cells)
#
# Compute the fire survival probablilities.
# Note the linear interpolation.
#
# Creat array for p and P[fire | cell = i]
p.fire<-c()
# Creat array for p and P[not fire | cell = i]
p.notfire<-c()
#
# Do a loop for each cell.
for (i in seq(1,4))
{
#print("i = ")
#print(i)
# Store time for cell i
time<-data[[i]]$time
# Store fired for cell i
fired<-data[[i]]$fired
# Do survival analysis within cell i
p<-survival.prob(time,fired,time.threshold)
#print("p = ")
#print(p)
# Store P[not fire | cell = i] to p.notfire
p.notfire<-c(p.notfire,p)
# Store P[fire | cell = i] to p.fire
p.fire<-c(p.fire,1-p)
}
#print("end of loop p.fire p.notfire= ")
#print(p.fire)
#print(p.notfire)
#
# We estimated P[fire | cell = i] for each i,
# and P[not fire | cell = i]
# for each i and we know P[ cell = i].
# Use Bayes theorem to get the cell
# distributions under each class.
#
# p = P[ cell = i | fired]
#
# and
#
# q = P[ cell i | not fired]
#
# Calculate P[ cell = i, fired]
p<-p.fire*p.cells
# Calculate P[ cell = i | fired]
p<-p/sum(p)
# Calculate P[ cell = i, not fired]
q<-p.notfire*p.cells
# Calculate P[ cell = i | not fired]
q<-q/sum(q)
#
# Compute likelihood ratios
#
lr<-p/q
#
# Make a list of the cell indices 1,2,3,4 in decreasing order
# of likelihood ratio.
#
perm<-order(lr, decreasing=TRUE)
#
# Define the cell probabilities for the classifier.
# The rule is to assign a
# probability of 1 to each cell until we get to
# the desired sensitivity.
# The last probability can be a fraction in order
# to get exactly what we want.
#
#print("lr = ")
#print(lr)
#print("p = ")
#print(p)
#print("q = ")
#print(q)
#
# Assign all 0s to fprob
fprob<-rep(0,4)
# Creat a variable used in "while" loop.
psum<-0
i<-1
# Repeat the loop until psum+p[perm[i]]<sens, which means keep add up more
# cells into psum until it reaches sensitiviy.
while(psum+p[perm[i]]<sens)
{
# If a cell is included, assign fprob for that cell 1.
fprob[perm[i]]<-1
# Add one more into psum
psum<-psum+p[perm[i]]
i<-i+1
}
# For the last cell, compute a partial
fprob[perm[i]]<-(sens-psum)/p[perm[i]]
#
# Compute the specificity
#
spec<-sum((1-fprob)*q)
# If we find a larger specificity, we replace the best.L with the new one.
if (spec>best.L$spec)
{
best.L<-list(fprob=fprob,spec=spec,tx=tx,ty=ty,p=p,q=q)
}
}
}
# Return best.L.
best.L
}
Uncommented Quadrant Tree Classifier Code
Download zipped file of uncommented code
library(survival)
survival.prob<-function(time,fired,time.threshold)
{
# If no data, then returen 0 so err on the side of making everyone fire.
if (length(time)==0)
{
return(0)
}
S<-survfit(Surv(time,fired),type="kaplan-meier")
ntimes<-length(S$time)
if ((ntimes==1)||(min(S$time)>=time.threshold))
{
return(0)
}
if (max(S$time)<=time.threshold)
{
return(S$surv[length(S$surv)])
}
index1<-max(which(S$time<time.threshold))
index2<-index1+1
p1<-S$surv[index1]
p2<-S$surv[index2]
lambda<-(time.threshold-S$time[index1])/(S$time[index2]-S$time[index1])
p<-p1+lambda*(p2-p1)
return(p)
}
build.quadrant.classifier<-function(x,y,implanted,days.to.firing,tx.list,ty.list,sens,time.threshold)
{
lx<-length(x)
fired<-rep(0,lx)
time<-rep(0,lx)
for (i in seq(1,lx))
{
if (is.na(days.to.firing[i]))
{
fired[i]<-0
time[i]<-implanted[i]
} else
{
if (days.to.firing[i]<time.threshold)
{
fired[i]<-1
time[i]<-days.to.firing[i]
}else
{
fired[i]<-0
time[i]<-implanted[i]
}
}
}
d.temp<-data.frame(x=x,y=y,time=time,fired=fired)
d.temp<-d.temp[complete.cases(d.temp),]
best.L<-list(spec=0)
for (tx in tx.list)
{
for (ty in ty.list)
{
data<-list()
data[[1]]<-d.temp[(d.temp$x<tx)&(d.temp$y<ty),]
data[[2]]<-d.temp[(d.temp$x<tx)&(d.temp$y>ty),]
data[[3]]<-d.temp[(d.temp$x>tx)&(d.temp$y<ty),]
data[[4]]<-d.temp[(d.temp$x>tx)&(d.temp$y>ty),]
p.cells<-c(
dim(data[[1]])[1],
dim(data[[2]])[1],
dim(data[[3]])[1],
dim(data[[4]])[1])
p.cells<-p.cells/sum(p.cells)
p.fire<-c()
p.notfire<-c()
for (i in seq(1,4))
{
time<-data[[i]]$time
fired<-data[[i]]$fired
p<-survival.prob(time,fired,time.threshold)
p.notfire<-c(p.notfire,p)
p.fire<-c(p.fire,1-p)
}
p<-p.fire*p.cells
p<-p/sum(p)
q<-p.notfire*p.cells
q<-q/sum(q)
lr<-p/q
perm<-order(lr, decreasing=TRUE)
fprob<-rep(0,4)
psum<-0
i<-1
while(psum+p[perm[i]]<sens)
{
# If a cell is included, assign fprob for that cell 1.
fprob[perm[i]]<-1
# Add one more into psum
psum<-psum+p[perm[i]]
i<-i+1
}
fprob[perm[i]]<-(sens-psum)/p[perm[i]]
spec<-sum((1-fprob)*q)
if (spec>best.L$spec)
{
best.L<-list(fprob=fprob,spec=spec,tx=tx,ty=ty,p=p,q=q)
}
}
}
best.L
}
