我尝试使用R中的cor函数来对多个样本中的基因进行相关。我有两个输入文件:实际表达式和预测表达式。这两个文件都有5行对应于基因和列作为样本。当我使用cor函数时,它给出了一个错误:基本上,对于每个基因,我想得到实际和预测之间的所有样本的cor值。
library(readr)
actual_expr = read.table(file = "actual_expr.txt", header = TRUE, sep = "\t" )
pred_expr = read.table(file = "predict_Expr.txt", header = TRUE, sep = "\t" )
pred_expr=pred_expr[,-1]
pred_expr <- as.data.frame(t(pred_expr))
colnames(pred_expr) <- pred_expr[1,]
pred_expr <- pred_expr[-c(1),]
actual_expr <- as.data.frame(t(actual_expr))
#filter columns of actual based on pred:
#pred_expr <- pred_expr[,colnames(actual_expr)%in%colnames(pred_expr)]
y <- colnames(actual_expr)
z <- which(colnames(pred_expr) %in% y)
pred_expr <- pred_expr[,z]
##convert chr to numeric for cor function
for(i in 1:205){
pred_expr[,i] <- as.numeric((as.character(pred_expr[,i])))
#actual_expr[,i] <- as.numeric((as.character(actual_expr[,i])))
}
cor_vec <- NULL
for(i in 1:5){ #for first-five gene
if(rownames(pred_expr)[i] %in% rownames(actual_expr)=="TRUE"){
z <- which(rownames(actual_expr) %in% rownames(pred_expr)[i])
index3=which(rownames(pred_expr)[i] %in% rownames(actual_expr)[z])
index4=which(rownames(actual_expr) %in% rownames(pred_expr)[i])
data1_1<<- pred_expr[index3,]
data2_1<<- actual_expr[index4,]
cor_vec[i] = cor(data1_1[i,],data2_1[i,])
}
}
错误是:
Warning messages:
1: In cor_vec[i] <- cor(data1_1[i, ], data2_1[i, ]) :
number of items to replace is not a multiple of replacement length
2: In cor_vec[i] <- cor(data1_1[i, ], data2_1[i, ]) :
number of items to replace is not a multiple of replacement length
cor_Vec具有NA NA NA NA NA
值。这些文件是:
dput(data1_1[1, c(1, 5)])
structure(list(`GTEX-1117F` = -1.1059008, `GTEX-1192X` = -1.9839559), row.names = "ENSG00000278558.4", class = "data.frame")
> dput(data2_1[1, c(1, 5)])
structure(list(`GTEX-1117F` = 0.0294928231667479, `GTEX-1192X` = -0.294937186047513), row.names = "ENSG00000278558.4", class = "data.frame")
dput(pred_expr[1:2,c(1:5)])
structure(list(`GTEX-1117F` = c(-1.1059008, -0.7702356), `GTEX-111FC` = c(-1.9839559,
-0.3895127), `GTEX-1128S` = c(-1.9839559, -0.3895127), `GTEX-117XS` = c(-0.9919779,
-0.1947563), `GTEX-1192X` = c(-1.9839559, -0.3895127)), row.names = c("ENSG00000278558.4",
"ENSG00000275793.1"), class = "data.frame")
dput(actual_expr[1:2,c(1:5)])
structure(list(`GTEX-1117F` = c(0.0205417435855037, -0.257328654412256
), `GTEX-111FC` = c(-0.196491626008295, -0.564355420678903),
`GTEX-1128S` = c(-0.259273634395018, -0.53496548376192),
`GTEX-117XS` = c(-0.1764185747562, -0.765050266632035), `GTEX-1192X` = c(-0.185980126179191,
-0.479964998120876)), row.names = c("ENSG00000227232.5",
"ENSG00000268903.1"), class = "data.frame")
有人知道为什么我会得到这个错误吗。谢谢。
1条答案
按热度按时间bjg7j2ky1#
在循环内部,使用
unlist
从data.frame
返回一个vector
作为具有单行的子集,仍然返回数据。所以我们用
循环内