Quantcast
Viewing all articles
Browse latest Browse all 27

a simple gene finder

上了几天的课,http://ibw2011.fmmu.edu.cn/schedule.htm
今天就上完了,只完成了project 1,想写gibbs sampling,但是没搞明白,汗。

这个纯属练习用,没啥实用价值。

Course Projects:

Project 1: Implementation of a simple gene finder

GOAL

Build a simple codon-usage based gene finder for finding genes in E.coli.

Procedure

Collect 100 gene sequences from the bacterium E. coli in the genbank (http://www.ncbi.nlm.nihh.gov).
Compute the codon usage table based on these genes (and the translated protein sequences from them);
Build a probabilistic model based on the codon usages;
Implement a random sequence model in which the nucleotide frequency is computed from the 100 E. coli genes.
For a given DNA sequence (and one selected reading frame), compare your model with a random sequence model;
Results that you should submit:

Two FASTA files for the collected 100 genes and 100 translated protein sequences;
The printed codon usage table;
A program named ECgnfinder, running with the syntax as
ECgnfinder –i inputfile

Inputfile stands for the name of input file, which should contain one DNA sequence in FASTA file format; the program should be able to report an error message if the input file is in the wrong format.

The output should be printed to the standard output as (xxx stands for the likelihood)

ORF1: xxx
ORF2: xxx

?View Code RSPLUS
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
###########################################################
##                                                       ##
##  Dragonstar Course Project (IBW 2011)                 ##
##                                                       ##
##  Project 1: Implementation of a simple gene finder    ##
##  Author Guangchuang Yu <guangchuangyu@gmail.com>      ##
##  Xi'an (2011-07-08)                                   ##
##                                                       ##
###########################################################
 
 
 
##' .. content for \description{} (ECgnfinder: An R program to predict prokaryotic genes (e.g. E.coli) using 1st Markov Chain) ..
##'
##' .. content for \details{} ..
##' @title ECgnfinder
##' @param traningSequence : A FASTA file containing multiple sequences for training set
##' @param testingSequence : A FASTA file containing multiple sequences for testing
##' @param backgroundSequence : A FASTA file containing multiple sequences for building the background model
##' @return log likelihood ratio
##' @author Guangchuang Yu
ECgnfinder <- function(trainingSequence, testingSequence, backgroundSequence, print=FALSE) {
    print ("reading training sequence file...")
    trainingFasta <- readFASTA(trainingSequence)
    trainSeq <- sapply(trainingFasta, function(i) i$seq)
 
    print("building Transition Probability Matrix...")
    TPM <- getTPM(trainSeq)
 
    print("building codon usage table...")
    codonUsage <- getCodonUsage(trainSeq)
 
    print("building NucleoTide table...")
    NtUsage <- getNtUsage(backgroundSequence)
 
 
    print("reading testing sequence file...")
    testing <- readFASTA(testingSequence)
    testingSeq <- sapply(testing, function(i) i$seq)
 
    x <- lapply(testingSeq, logLikelihood, transition=TPM, codonusage=codonUsage, background=NtUsage)
    names(x) <- paste("Sequence", seq_along(testingSeq), sep="")
    if (print) {
        x
    }
    return(x)
}
 
 
##' .. content for \description{} (read FASTA file) ..
##'
##' .. content for \details{} ..
##' @title readFASTA
##' @param file: A FASTA file
##' @return a list of list which containing desc and seq.
##' @author Guangchuang Yu
readFASTA <- function(file) {
	x=scan(file, what="", sep="\n")
	idx <- grep(">", x)
	fasta <- list()
	for (i in 1:length(idx)) {
            desc <- x[idx[i]]
            if (i == length(idx)) {
                stop <- length(x)
            } else {
                stop <- idx[i+1]-1
            }
            seq.idx <- seq(idx[i]+1, stop)
            s <- x[seq.idx]
            s <- paste(s, collapse="")
            ss <- list(desc=desc, seq=s)
            fasta[i] <- list(ss)
	}
	return(fasta)
    }
 
 
##' Build a transtion probability matrix from the given Sequence set.
##'
##' @title getTPM
##' @param sequences : A vector of Sequences.
##' @return transition probability matrix
##' @author Guangchuang Yu
getTPM <- function(sequences) {
    codonLength <- 3
    codons <- getCodon()
 
    ## calculate a transition count matrix
    transition <- matrix(1, nrow=length(codons), ncol=length(codons))  ## pseudo count
    colnames(transition) <- codons
    rownames(transition) <- codons
 
    for (x in 1:length(sequences)) {
        s <- sequences[x]
        len <- nchar(s)
        ## drop the last alphabets which is too short (<codonLength) to form a codon.
        ## and -codonLength to eliminate out of boundary.
        len <- len - len %% codonLength - codonLength
 
        idx <- seq(1, len, codonLength)
        for (i in seq_along(idx)) {
            ## codonI--codonJ
            codonI <- substr(s,idx[i], idx[i]+codonLength-1)
            if (i != length(idx)) {
                codonJ <- substr(s,idx[i+1], idx[i+1]+codonLength-1)
            }
            if (codonI %in% codons & codonJ %in% codons) {
                transition[codonI, codonJ] <- transition[codonI, codonJ] +1
            } else {
                warning("codon ", codonI, " or ", codonJ, " which does not defined was detected...")
            }
        }
    }
 
    ## Calculate Transition Probability Matrix..
    rowsum <- rowSums(transition)
    TPM <- transition/rowsum
    return(TPM)
}
 
getCodon <- function() {
    codons <- c("TTT", "TTC", "TTA", "TTG",
                "CTT", "CTC", "CTA", "CTG",
                "ATT", "ATC", "ATA", "ATG",
                "GTT", "GTC", "GTA", "GTG",
                "TCT", "TCC", "TCA", "TCG",
                "CCT", "CCC", "CCA", "CCG",
                "ACT", "ACC", "ACA", "ACG",
                "GCT", "GCC", "GCA", "GCG",
                "TAT", "TAC", "TAA", "TAG",
                "CAT", "CAC", "CAA", "CAG",
                "AAT", "AAC", "AAA", "AAG",
                "GAT", "GAC", "GAA", "GAG",
                "TGT", "TGC", "TGA", "TGG",
                "CGT", "CGC", "CGA", "CGG",
                "AGT", "AGC", "AGA", "AGG",
                "GGT", "GGC", "GGA", "GGG"
                )
    return(codons)
}
 
 
##' .. content for \description{} (estimate codon usage table based on training sequences) ..
##'
##' .. content for \details{} ..
##' @title getCodonUsage
##' @param sequences : A vector of sequence set.
##' @return A data.frame containing two columns of codons and Freq (frequency probabilities)
##' @author Guangchuang Yu
getCodonUsage <- function(sequences) {
    codons <- codons <- getCodon()
    codon.table <- data.frame">data.frame(codons=codons, Freq=rep(0, length(codons)))
    rownames(codon.table) = codons
 
    for (i in 1:length(sequences)) {
        codon <- codonCount(sequences[i])
        rowname.idx <- as.character(codon[,1])
        codon.table[rowname.idx,2] <- codon.table[rowname.idx,2] + codon[,2]
    }
    codon.table[,2] <- codon.table[,2]/sum(codon.table[,2])
                                        #print(codon.table)  #### print result.
    return(codon.table)
}
 
##' .. content for \description{} (no empty lines) ..
##'
##' .. content for \details{} ..
##' @title codonCount
##' @param seqStr : A sequence string
##' @return A data.frame containing two columns of codons and their counts
##' @author Guangchuang Yu
codonCount <- function(seqStr) {
    codonLength=3
    len <- nchar(seqStr)
    ## drop the last alphabets which is too short (<codonLength) to form a codon.
    len <- len - len %% codonLength
 
    idx <- seq(1, len,codonLength)
    codons <- c()
    for ( i in 1:length(idx)) {
        codon <- substr(seqStr, idx[i], idx[i]+codonLength-1)
        codons <- c(codons, codon)
    }
    codons.df <- as.data.frame(table(codons))
    return(codons.df)
}
 
##' .. content for \description{} (reverse complementary sequence) ..
##'
##' .. content for \details{} ..
##' @title revcom
##' @param seqStr : A sequence string
##' @return A reverse complementary sequence
##' @author Guangchuang Yu
revcom <- function(seqStr) {
	seqStr.str <- unlist(strsplit(seqStr, split=""))
	idxT <- which(seqStr.str=="T")
	idxG <- which(seqStr.str=="G")
	idxA <- which(seqStr.str=="A")
	idxC <- which(seqStr.str=="C")
	seqStr.str[idxT] <- "A"
	seqStr.str[idxA] <- "T"
	seqStr.str[idxG] <- "C"
	seqStr.str[idxC] <- "G"
	seqStr.rc <- rev(seqStr.str)
	seqStr.rc <- paste(seqStr.rc, collapse="")
	return(seqStr.rc)
}
 
##' .. content for \description{} (calculate NT frequency probabilities of a given sequence set) ..
##'
##' .. content for \details{use to estimate the background/random NT usage} ..
##' @title getNtUsage
##' @param file : A FASTA file
##' @return A data.frame containing NTs and their corresponding frequency probabilities.
##' @author Guangchuang Yu
getNtUsage <- function(file) {
    print("reading FASTA file")
    fasta <- readFASTA(file)
    sequences <- sapply(fasta, function(i) i$seq)
    x <- sapply(sequences, strsplit, split="")
    x <- unlist(x)
    NTprob <- as.data.frame(table(x))
    NTprob[,2] <- NTprob[,2]/sum(NTprob[,2])
    rownames(NTprob) <- NTprob[,1]
    return(NTprob)
}
 
##' .. content for \description{} (calculate the log likelihood of the six possible reading frame of a given sequence) ..
##'
##' .. content for \details{} ..
##' @title logLikelihood
##' @param sequence : A sequence string
##' @param transition : Transition Probatility Matrix
##' @param codonusage : Codon Usage data.frame
##' @param background : NT Usage data.frame for background/random model
##' @return Log likelihood
##' @author Guangchuang Yu
logLikelihood <- function(sequence, transition, codonusage, background) {
    codonLength <- 3
    orf1 <- sequence
    orf2 <- substr(sequence, 2, nchar(sequence))
    orf3 <- substr(sequence, 3, nchar(sequence))
    seq.rc <- revcom(sequence)
    orf4 <- seq.rc
    orf5 <- substr(seq.rc, 2, nchar(seq.rc))
    orf6 <- substr(seq.rc, 3, nchar(seq.rc))
    ORF <- c(orf1, orf2, orf3, orf4, orf5, orf6)
    names(ORF) <- c("ORF1", "ORF2", "ORF3", "ORF4", "ORF5", "ORF6")
 
    result <- data.frame">data.frame(ORF=names(ORF), logLikelihood=rep(0,6))
    for (j in seq_along(ORF)) {
        firstCodon <- substr(ORF[j], 1, 3)
        if (firstCodon %in% codonusage[,1]) {
            initProb <- log(codonusage[firstCodon,2])
        } else {
            initProb <- log(1/64)
        }
 
        transitionCount <- matrix(0, nrow=nrow(transition), ncol=ncol(transition))
        rownames(transitionCount) <- rownames(transition)
        colnames(transitionCount) <- colnames(transition)
 
        codons <- rownames(transition)
 
        len <- nchar(ORF[j])
        len <- len - len %% codonLength - codonLength
 
        idx <- seq(1, len, codonLength)
        for (i in seq_along(idx)) {
            ## codonI--codonJ
            codonI <- substr(ORF[j],idx[i], idx[i]+codonLength-1)
            if (i != length(idx)) {
                codonJ <- substr(ORF[j],idx[i+1], idx[i+1]+codonLength-1)
            }
            if (codonI %in% codons & codonJ %in% codons) {
                transitionCount[codonI, codonJ] <- transitionCount[codonI, codonJ] +1
            } else {
                warning("codon ", codonI, " or ", codonJ, " which does not defined was detected...")
            }
        }
 
        postTransition <- log(transition) * transitionCount
        posterior <- initProb + sum(postTransition)
 
        NTseq <- unlist(strsplit(ORF[j], split=""))
        orf.len <- nchar(ORF[j]) - nchar(ORF[j]) %% codonLength
        NTseq <- NTseq[1:orf.len]
        randomModel <- sum(log(background[NTseq,2]))
 
        likelihood <- posterior - randomModel
        result[j, 2] <- likelihood
    }
    return(result)
}

Related Posts


Viewing all articles
Browse latest Browse all 27

Trending Articles