## SWITCHdna.r ## Authors: Victor J. Weigman1,2, Andrey A. Shabalin3,4, Joel S. Parker1 ## 1. Program in Bioinformatics and Computational Biology ## 2. Department of Biology ## 3. Department of Statistics ## 4. Departmant of Biostatistics ## ## This is an all-encompassing package which aggregates informaiton, statistics, and graphics related to aCGH experiments (or intensities from SNP arrays) ## The modules below are mainly based from output from either SWITCHdna or cnaGENE # Functions ## SWITCHdna(data = data, Fthresh = 11, alpha = 10, verbose=1){ # data = file that contains CGH intensity, with first 3 columns being (Probe, Chr, Position), in genomic order, dimnames must be set # Fthresh = value for F-statistics, to find breakpoints # alpha = min number of probes/observations needed to cover you minimum segment size # -returns- # data.frame of segments # file of segments to use for plotting functions ## ## plot.freq.SW # data = is the file of output from SWITCHdna # grps = file name that contains each sample and the class its associated with. First column is sample names, 2nd column is classIDs # thresh = Tstatistic cutoff # species = species you are using, point to chrmlengths file ## segAn(sumfile=sumfile,ref=ref,pct=0.5) # annotates _summary.txt files for identifying genes within the region at a pre-determined percentage # sumfile = output file from plot.freq.SW or its object # ref = Reference file for containing genomic locations of region of interest, same format as with cnaGENE # 4 columns: Chromosome, Start, Stop, Symbol # pct = %age cutoff for annotating a segment ##cnaGENE(data="filename.txt",ref="annotfile.txt",Z.score=3,I.thresh=0.1,return.type=1,out="filename") # takes direct output from CNAmat to make a M x N of gain/lost status for genes and samples # Calling gains and losses follows same protocol as the plotting functions # Parameters: # data = filename from output SWITCHdna # ref = reference genome file, formatted as RowID Chr Start Pos End Pos Gene Symbol # Z.score = Score to define segments that are significant # I.thresh = Score to define background level of intensity # return.type: 1 = discrete, 2 = intensity, 3 = z.score ## plot.SW plots segment sizes and intensities for each class of interest # data = object from SWITCHdna # grps = file name that contains each sample and the class its associated with # Z.score = Z score cutoff # I.thresh = Segment Intensity cutoff ## plot.freq.SW(data=data, grps=groups, Z.score=3, I.thresh=0.1, species="C:/Scripts/hsChrLengths.txt", returnSummary = F) # creates a frequency plot for each class of interest # data = object from SWITCHdna # grps = file name that contains each sample and the class its associated with # Z.score = Zscore cutoff # species = species of where data comes from, helps with plotting in genomic distance # returnSummary = if you want the function to return the summary table ############################################################ ################# FUNCTIONS ############################## ############################################################ # SWITCHdna - wrapper for doing CGH data analysis using the supWald approximation # data - data matrix that has first 3 columns as # Fthresh - Threshold for F-statistic # alpha - the number of probes/observations needed to cover your minimal size range # verbose - 1 will alert for each new sample processed. 0 - squelches all output to prompt # -returns- # data.frame of segments # file of segments to use for plotting functions SWITCHdna <- function(data = data, Fthresh = 11, alpha = 10, verbose=1){ time.start <- Sys.time() out <- paste("SWITCHdna_out_F",Fthresh,"_a",alpha,".txt",sep="") if(mode(data)=="character"){ out <- strsplit(data,"\\.")[[1]][1] data <- read.table(data, sep="\t",header=T,na.strings="") out <- paste(out,"_SWITCHdna_out_F",Fthresh,"_a",alpha,".txt",sep="") } col.index <- 4:ncol(data) row.index <- 1:nrow(data) chrom <- as.numeric(data[row.index,2]) pos <- as.numeric(data[row.index,3]) chrom.tab <- names(table(chrom)) data.mat <- as.matrix(data[row.index,col.index]) cna.out <- c() for(i in 1:ncol(data.mat)){ flush.console() print(paste("Processing Sample:",names(data[,col.index])[i])) for(j in 1:length(chrom.tab)){ data.chrom <- as.matrix(data.mat[chrom==chrom.tab[j],i]) data.chrom.pos <- as.matrix(pos[chrom==chrom.tab[j]]) #remove missing values, maybe address better method later... data.chrom.pos <- as.matrix(data.chrom.pos[!is.na(data.chrom)]) data.chrom <- as.matrix(data.chrom[!is.na(data.chrom)]) n <- length(data.chrom) data.out <- breakMultiple(data.chrom, alpha, Fthresh) posA = data.out$pos; posA.long = c(0,posA,n); averages = rep(0,length(posA)+1); tstats = rep(0,length(posA)+1); for(k in 1:(length(posA)+1)){ averages[k] <- mean( data.chrom[(posA.long[k]+1):posA.long[k+1]]); tstats[k] <- averages[k] * sqrt(posA.long[k+1] - posA.long[k]); } beginings = data.chrom.pos[posA.long[1:(length(posA)+1)]+1] ends = data.chrom.pos[posA.long[2:(length(posA)+2)]]+1 cna.tmp <- cbind(Samples=rep(names(data[,col.index])[i],length(averages)),Chromosome=rep(chrom.tab[j],length(averages)),beginings,ends,tstats,averages) cna.out <- rbind(cna.out,cna.tmp) } } cna.out <- as.data.frame(cna.out) cna.out[,2:6] <- apply(cna.out[,2:6],2,as.numeric) time.end <- Sys.time() if(verbose==1){ print(difftime(time.end,time.start)) } write.table(cna.out,out,sep="\t",row.names=F) return(cna.out) } ## CNAstat - core internal function for application of SupWald method ## Original Author: Andrey Shabalin ## Implementation was originally displayed in Lickwar et al The Set2/Rpd3S Pathway Suppresses Cryptic Transcription without Regard to Gene Length or Transcription Frequency. PLoS ONE. March 19, 2009 CNAstat <- function(pos.x = pos.x, stepFromEnds = endstep){ pos <-c() supFstat <-c() n <- length(pos.x) if(n < stepFromEnds*2){ pos <- -1 supFstat <-0 return(list(pos=pos,supFstat=supFstat)) } pos.y <- rev(pos.x) sumX <- cumsum(pos.x) sumX2 <- cumsum(pos.x^2) sumY <- cumsum(pos.y) sumY2 <- cumsum(pos.y^2) sseX <- sumX2 - sumX^2/seq(1,n) sseY <- sumY2 - sumY^2/seq(1,n) SST <- tail(sseX,1) sseXY = as.matrix(head(sseX,length(sseX)-1) + rev(head(sseY,length(sseY)-1))) sseXY[1:(stepFromEnds-1)] <- Inf sseXY[(n-stepFromEnds+1):(n-1)] <- Inf posMin <- which.min(sseXY) sseMin <- sseXY[posMin] pos <- posMin supFstat = ((SST-sseMin)/sseMin)*(n-2) return(list(pos=pos,supFstat=supFstat)) } ## breakMultiple takes a set of positions and intensities and decides on splitting ## Original Author: Andrey Shabalin ## the iterative portion of the program ## find segments in a given string breakMultiple <- function(pos.x = pos.x, stepFromEnds = endstep, Fthresh = thresh.f){ newStat <- CNAstat(pos.x,stepFromEnds) if((newStat$pos < 0) | (newStat$supFstat < Fthresh)){ return(list(pos=c(),supFstat=c())) } else{ tmp.L <- breakMultiple(pos.x[1:newStat$pos],stepFromEnds, Fthresh) tmp.R <- breakMultiple(pos.x[(newStat$pos+1):length(pos.x)],stepFromEnds, Fthresh) posM <- c(tmp.L$pos,newStat$pos,(tmp.R$pos+newStat$pos)) supFstatM = c(tmp.L$supFstat, newStat$supFstat, tmp.R$supFstat) return(list(pos=posM,supFstat=supFstatM)) } } ############################################# ## The following are annotation functions ## ############################################# ## segAn - annotates _summary.txt files for identifying genes within the region at a pre-determined percentage # sumfile = output file from plot.freq.SW or its object # ref = Reference file for containing genomic locations of region of interest, same format as with cnaGENE # 4 columns: Chromosome, Start, Stop, Symbol # pct = %age cutoff for annotating a segment segAn <- function(sumfile=sumfile,ref=ref,pct=0.5){ data <- read.table(sumfile,header=T,sep="\t") annot <- read.table(ref,header=T,sep="\t",na.strings="") annot <- annot[which(!is.na(annot[,4])),] root <- unlist(strsplit(sumfile,"\\.txt"))[1] write.table(t(c(names(data),"Symbols")),file=paste(root,"_annot.txt",sep=""),sep="\t",append=T,col.names=F,row.names=F) count = 1 for(i in 1:nrow(data)){ gene.match <- c() if(data[i,4] >= pct | data[i,4] <= -pct){ annot.chr <- annot[annot[,1]==data[i,1],] #match chromosome symb <- as.matrix(annot.chr[,4]) if(nrow(annot.chr)>0){ for(j in 1:nrow(annot.chr)){ print(paste("i =",i,"j =",j)) if(annot.chr[j,2]>data[i,2] & annot.chr[j,2]data[i,2] & annot.chr[j,3]0){ gene.match <- unique(gene.match[!is.na(gene.match)]) gene.match <- paste(gene.match,collapse=" ") #data.out <- rbind(data.out,c(data[i,],symbols=gene.match)) write.table(c(count,data[i,],symbols=gene.match),file=paste(root,"_annot.txt",sep=""),sep="\t",append=T,col.names=F,row.names=F) count = count+1 } } } #write.table(data.out,file=paste(root,"_annot.txt",sep=""),sep="\t") } ##cnaGENE - takes direct output from CNAmat to make a M x N of gain/lost status for genes and samples # Calling gains and losses follows same protocol as the plotting functions # Parameters: # data = filename from output SWITCHdna # ref = reference genome file, formatted as RowID Chr Start Pos End Pos Gene Symbol # Z.score = Score to define segments that are significant # I.thresh = Score to define background level of intensity # return.type: 1 = discrete, 2 = intensity, 3 = z.score cnaGENE <- function(data="filename.txt",ref="annotfile.txt",Z.score=3,I.thresh=0.1,return.type=1,out="filename"){ if(mode(data)=="character"){ out <- strsplit(data,"\\.")[[1]][1] data<-read.table(data,sep="\t",header=T) } data <- data[data[,5] > Z.score | data[,5] <= -Z.score,] data <- data[data[,6] > I.thresh | data[,6] <= -I.thresh,] annot <- read.table(ref,header=T,sep="\t",na.strings="") annot <- annot[which(!is.na(annot[,4])),] samps <- names(table(data[,1])) #make matrix of proper dimensions to fit this data gene.mat <- matrix(0,nrow=nrow(annot),ncol=length(samps)) rownames(gene.mat) <- annot[,4] #annotate the ending matrix dimnames(gene.mat)[[2]] <- samps #for each segment, make call, update table for(i in 1:ncol(gene.mat)){ samp.tmp <- data[data[,1]==samps[i],] samp.tmp.pos <- samp.tmp[samp.tmp[,6]>0,] samp.tmp.neg <- samp.tmp[samp.tmp[,6]<0,] if(nrow(samp.tmp.pos)>0){ for(j in 1:nrow(samp.tmp.pos)){ annot.chr <- annot[annot[,1]==samp.tmp.pos[j,2],] #match chromosome symb <- as.matrix(annot.chr[,4]) if(nrow(annot.chr)>0){ for(k in 1:nrow(annot.chr)){ print(paste("i =",i,"j =",j,"k=",k)) if(annot.chr[k,2]>samp.tmp.pos[j,3] & annot.chr[k,2]samp.tmp.pos[j,3] & annot.chr[k,3]0){ for(j in 1:nrow(samp.tmp.neg)){ annot.chr <- annot[annot[,1]==samp.tmp.neg[j,2],] #match chromosome symb <- as.matrix(annot.chr[,4]) if(nrow(annot.chr)>0){ for(k in 1:nrow(annot.chr)){ print(paste("i =",i,"j =",j,"k=",k)) if(annot.chr[k,2]>samp.tmp.neg[j,3] & annot.chr[k,2]samp.tmp.neg[j,3] & annot.chr[k,3] Z.score | data[,5] <= -Z.score,] data <- data[data[,6] > I.thresh | data[,6] <= -I.thresh,] alldata<-list() for(k in 1:length(cls.names)){ samps <- cls[cls[,2]==cls.names[k],1] samps <- as.character(samps[!is.na(samps)]) nsamples<-nlevels(as.factor(samps)) x<-as.data.frame(data[data[,1]%in%samps,]) if(nrow(x) > 0){ vals<-as.numeric(as.vector(t(as.matrix(x)[,6]))) size<-as.numeric(as.vector(t(as.matrix(x)[,4])))-as.numeric(as.vector(t(as.matrix(x)[,3]))) vals[is.na(vals)]<-0 size<-log10(size) size[is.na(size)]<-0 highcut<- threshold lowcut<- -1*threshold calls<-rep("noChange",nrow(x)) gains<-rep(-1,nrow(x)) losses<-rep(-1,nrow(x)) calls[vals>highcut & size>segThreshold]<-"Gain" calls[valssegThreshold]<-"Loss" gains[vals>highcut & size>segThreshold]<-1 losses[valssegThreshold]<-1 alldata<-cbind(x,calls,gains,losses) # format the table and get out some variables newTable<-as.matrix(alldata) newTable<-as.matrix(apply(newTable,2,function(x){gsub(" ","",x)})) newTable[newTable=='X']<-20 newTable[newTable=='Y']<-21 newTable[newTable=="XY"]<-23 newTable[newTable=="M"]<-25 if(nrow(x)==1){ newTable <- as.matrix(t(newTable)) ##if matrix is 1 row, it gets cast to a character vector, which is bad } samples<-newTable[,1] #nsamples<-nlevels(as.factor(samples)) #browser() if(nrow(x)==1){ newTable<-as.numeric(newTable[,-c(1,7)]) newTable <- as.matrix(t(newTable)) ##if matrix is 1 row, it gets cast to a character vector, which is bad } else{ newTable<-apply(newTable[,c(-1,-7)],2,as.numeric) } chr<-newTable[,1] locstop<-newTable[,3] locstart<-newTable[,2] gains<-newTable[,6] losses<-newTable[,7] chr.gains<-vector() chr.losses<-vector() chr.losses.length<-vector() chr.gains.length<-vector() chr.losses.start<-vector() chr.gains.start<-vector() chr.losses.stop<-vector() chr.gains.stop<-vector() chr.losses.chr<-vector() chr.gains.chr<-vector() chr.start<-rep(0,10000) chr.end<-rep(0,10000) chr.chr<-rep(0,10000) chr.barsup<-rep(0,10000) chr.barsdown<-rep(0,10000) outable<-matrix(nrow=10000,ncol=4) gain.first<-1 loss.first<-1 for(tchr in 1:nrow(totalChrLengths)){ # parse out the data for this chromosome this.locstart<-newTable[chr==tchr,2] this.locstop<-newTable[chr==tchr,3] this.gains<-newTable[chr==tchr,6] this.losses<-newTable[chr==tchr,7] this.samples<-samples[chr==tchr] # sort within sample blocks and merge when contiguous blocks are the same sample and same call this.order<-order(this.samples,this.locstart) s.gains<-this.gains[this.order] s.losses<-this.losses[this.order] s.locs.start<-this.locstart[this.order] s.locs.stop<-this.locstop[this.order] s.samples<-this.samples[this.order] indices.start<-rep(0,10000) indices.stop<-rep(0,10000) count<-1 index<-1 if(length(s.locs.start)>1){ while((length(s.locs.start)-1) > index){ tstart<-index looped<-F while(s.samples[index]==s.samples[index+1] && s.gains[index]==s.gains[index+1] && s.losses[index]==s.losses[index+1] && (length(s.locs.start)-1) > index){ index<-index+1 looped<-T } if(looped){index<-index-1} indices.start[count]<-tstart indices.stop[count]<-index count<-count+1 index<-index+1 } if(sum(indices.start>0)>0){ s.gains<-s.gains[indices.start] s.losses<-s.losses[indices.start] s.locs.start<-s.locs.start[indices.start] s.locs.stop<-s.locs.stop[indices.stop] } } this.order<-order(s.locs.start) # sort across locations s.gains<-s.gains[this.order] s.losses<-s.losses[this.order] s.locs.start<-s.locs.start[this.order] s.locs.stop<- s.locs.stop[this.order] s.samples<-s.samples[this.order] # find the overlaping gain segments gain.filter<-s.gains==1 if(sum(gain.filter)>0){ sg.locs.start<-s.locs.start[gain.filter] sg.locs.stop<- s.locs.stop[gain.filter] breakType<-c(rep("start",length(sg.locs.start)),rep("stop",length(sg.locs.stop))) breakLocs<-c(sg.locs.start,sg.locs.stop) breakType<-breakType[sort.list(breakLocs)] breakLocs<-breakLocs[sort.list(breakLocs)] segCount<-1 segStop<-vector() segStart<-vector() segScores<-vector() segStart[1]<-breakLocs[1] segScore<-1 lastStart<-segStart[1] lastStop<-0 for(j in 2:length(breakLocs)){ if(breakType[j]=="start"){ if(breakLocs[j] == lastStart){ segScore <- segScore + 1 }else{ if(segScore>0){ segStop[segCount] <- breakLocs[j] lastStop <- segStop[segCount] segScores[segCount] <- segScore segCount <- segCount + 1 } segStart[segCount] <- breakLocs[j] segScore <- segScore + 1 lastStart<-segStart[segCount] } }else{ if(breakLocs[j] == lastStop){ segScore <- segScore - 1 }else{ segStop[segCount] <- breakLocs[j] lastStop <- segStop[segCount] segScores[segCount] <- segScore segCount <- segCount + 1 segScore <- segScore - 1 segStart[segCount] <- breakLocs[j] } } } gain.start<-segStart[-1*length(segStart)] gain.stop<-segStop gain<-segScores }else{ gain.start<-0 gain.stop<-totalChrLengths[totalChrLengths[,1]==tchr,2] gain<-0 } # find the overlaping loss segments loss.filter<- s.losses==1 if(sum(loss.filter)>0){ sl.locs.start<-s.locs.start[loss.filter] sl.locs.stop<- s.locs.stop[loss.filter] breakType<-c(rep("start",length(sl.locs.start)),rep("stop",length(sl.locs.stop))) breakLocs<-c(sl.locs.start,sl.locs.stop) breakType<-breakType[sort.list(breakLocs)] breakLocs<-breakLocs[sort.list(breakLocs)] segCount<-1 segStop<-vector() segStart<-vector() segScores<-vector() segStart[1]<-breakLocs[1] segScore<-1 lastStart<-segStart[1] lastStop<-0 for(j in 2:length(breakLocs)){ if(breakType[j]=="start"){ if(breakLocs[j] == lastStart){ segScore <- segScore + 1 }else{ if(segScore>0){ segStop[segCount] <- breakLocs[j] lastStop <- segStop[segCount] segScores[segCount] <- segScore segCount <- segCount + 1 } segStart[segCount] <- breakLocs[j] segScore <- segScore + 1 lastStart<-segStart[segCount] } }else{ if(breakLocs[j] == lastStop){ segScore <- segScore - 1 }else{ segStop[segCount] <- breakLocs[j] lastStop <- segStop[segCount] segScores[segCount] <- segScore segCount <- segCount + 1 segScore <- segScore - 1 segStart[segCount] <- breakLocs[j] } } } loss.start<-segStart[-1*length(segStart)] loss.stop<-segStop loss<-segScores }else{ loss.start<-0 loss.stop<-totalChrLengths[totalChrLengths[,1]==tchr,2] loss<-0 } # finalize the segments into the entire chrom loss.index<-loss.first if(loss.start[1] != 0){ chr.losses[loss.index]<- 0 chr.losses.length[loss.index]<- loss.start[1] chr.losses.start[loss.index]<-0 chr.losses.stop[loss.index]<-loss.start[1] loss.index<-loss.index+1 } chr.losses[loss.index]<-loss[1] chr.losses.length[loss.index]<-loss.stop[1]-loss.start[1] chr.losses.start[loss.index]<-loss.start[1] chr.losses.stop[loss.index]<-loss.stop[1] loss.index<-loss.index+1 if(length(loss.start)>1){ for(j in 2:length(loss.start)){ if(loss.start[j]!=loss.stop[j-1]){ chr.losses[loss.index]<- 0 chr.losses.length[loss.index]<- loss.start[j] - loss.stop[j-1] chr.losses.start[loss.index]<-loss.stop[j-1] chr.losses.stop[loss.index]<-loss.start[j] loss.index <- loss.index + 1 } chr.losses[loss.index]<-loss[j] chr.losses.length[loss.index]<-loss.stop[j]-loss.start[j] chr.losses.start[loss.index]<-loss.start[j] chr.losses.stop[loss.index]<-loss.stop[j] loss.index <- loss.index + 1 } } if(totalChrLengths[tchr,2]>sum(chr.losses.length[loss.first:(loss.index-1)])){ chr.losses[loss.index]<-0 chr.losses.start[loss.index]<-sum(chr.losses.length[loss.first:(loss.index-1)]) chr.losses.stop[loss.index]<-totalChrLengths[totalChrLengths[,1]==tchr,2] chr.losses.length[loss.index]<-totalChrLengths[totalChrLengths[,1]==tchr,2]-sum(chr.losses.length[loss.first:(loss.index-1)]) loss.index <- loss.index + 1 } if(totalChrLengths[tchr,2]1){ for(j in 2:length(gain.start)){ if(gain.start[j]!=gain.stop[j-1]){ chr.gains[gain.index]<- 0 chr.gains.length[gain.index]<- gain.start[j] - gain.stop[j-1] chr.gains.start[gain.index]<-gain.stop[j-1] chr.gains.stop[gain.index]<-gain.start[j] gain.index <- gain.index + 1 } chr.gains[gain.index]<-gain[j] chr.gains.length[gain.index]<-gain.stop[j]-gain.start[j] chr.gains.start[gain.index]<-gain.start[j] chr.gains.stop[gain.index]<-gain.stop[j] gain.index <- gain.index + 1 } } if(totalChrLengths[totalChrLengths[,1]==tchr,2]>sum(chr.gains.length[gain.first:(gain.index-1)])){ chr.gains[gain.index]<-0 chr.gains.start[gain.index]<-sum(chr.gains.length[gain.first:(gain.index-1)]) chr.gains.stop[gain.index]<-totalChrLengths[totalChrLengths[,1]==tchr,2] chr.gains.length[gain.index]<-totalChrLengths[totalChrLengths[,1]==tchr,2]-sum(chr.gains.length[gain.first:(gain.index-1)]) gain.index <- gain.index + 1 } if(totalChrLengths[tchr,2]1){ chr.pos = sum(as.numeric(totalChrLengths[1:(j-1),2])) } else{ chr.pos = 0 } chr.lengths[count]<- totalChrLengths[j,2]-50 chr.barsup[count]<- 0 chr.barsdown[count]<- 0 count<-count+1 chr.lengths[count]<- 50 chr.barsup[count]<- 1 chr.barsdown[count]<- -1 count<-count+1 abline(v=chr.pos,col="gray",lwd=3) text(chr.pos,-0.9,labels=j,col="black",cex=1.5) } dev.off() outable<-rbind(cbind(chr.gains.chr,chr.gains.start,chr.gains.stop,(chr.gains/nsamples)),cbind(chr.losses.chr,chr.losses.start,chr.losses.stop,(-1*chr.losses/nsamples))) outable<-as.matrix(outable[!(is.na(outable[,4]) | outable[,4]==0),]) if(ncol(outable)==1){ outable <- t(outable) ##this exists because if there is only one region for a summary, then it gets cast to a vector and not the 4-column matrix it is supposed to } if(length(outable)>0){ outable<-rbind(c("Chr","Start","End","Percent Change"),outable) write.table(outable,paste(cls.names[k],"_SWITCHdna_summary.txt",sep=""),sep="\t",col.names=F,row.names=F) } else{ print(paste(cls.names[k],"_SWITCHdna_summary.txt contains no elements!",sep="")) } } else{ print(paste(cls.names[k],"_SWITCHdna_summary.txt contains no elements after filter!",sep="")) } } if(returnSummary==T){ return(outable) } }