在R中使用cor函数计算相关性时出现"替换长度"错误

zxlwwiss  于 2023-04-03  发布在  其他
关注(0)|答案(1)|浏览(104)

我尝试使用R中的cor函数计算多个样本中基因的相关性。我有两个输入文件:实际表达量和预测表达量。这两个文件都有5行,行对应基因,列对应样本。当我使用cor函数时,它报错了。基本上,对于每个基因,我想得到实际值与预测值之间跨所有样本的cor值。

# Purpose: for each of the first five genes, correlate predicted vs. actual
# expression across samples and collect the coefficients in `cor_vec`.
# NOTE(review): as written the loop does not produce per-gene coefficients;
# see the comments on `index3` and on the `cor()` call below.

# NOTE(review): readr is attached, but the reads below use base read.table();
# no readr function is called in this script.
library(readr)
    actual_expr = read.table(file = "actual_expr.txt", header = TRUE, sep = "\t" )
    pred_expr = read.table(file = "predict_Expr.txt", header = TRUE, sep = "\t" )
    # Drop the first column of the prediction table.
    pred_expr=pred_expr[,-1]
    # Transpose, then promote the first row of the transposed table to column
    # names and remove that row from the data.
    # NOTE(review): t() on a data frame goes through a matrix, so the columns
    # presumably come back as character here -- hence the numeric conversion
    # further down.
    pred_expr <- as.data.frame(t(pred_expr))
    colnames(pred_expr) <- pred_expr[1,]
    pred_expr <- pred_expr[-c(1),]
    actual_expr <- as.data.frame(t(actual_expr))
    
    # Keep only the columns of pred_expr whose names also occur among the
    # column names of actual_expr (actual_expr itself is not filtered).
    #pred_expr <- pred_expr[,colnames(actual_expr)%in%colnames(pred_expr)]
    y <- colnames(actual_expr)
    z <- which(colnames(pred_expr) %in% y)
    pred_expr <- pred_expr[,z]
##convert chr to numeric for cor function
    # NOTE(review): 205 is a hard-coded column count; this assumes pred_expr
    # has at least 205 columns after the filter above.
    # seq_len(ncol(pred_expr)) would track the real width.
    for(i in 1:205){
    pred_expr[,i] <- as.numeric((as.character(pred_expr[,i])))
    #actual_expr[,i] <- as.numeric((as.character(actual_expr[,i])))
    }
    # Result vector, grown one element per loop iteration.
    cor_vec <- NULL
    for(i in 1:5){ #for first-five gene
       # Only process gene i if its row name also exists in actual_expr.
       # NOTE(review): comparing the logical result with the string "TRUE"
       # only works through implicit coercion; the `== "TRUE"` is redundant.
       if(rownames(pred_expr)[i] %in% rownames(actual_expr)=="TRUE"){
           z <- which(rownames(actual_expr) %in% rownames(pred_expr)[i])
           # NOTE(review): the left operand of %in% is a single name, so
           # which() can only return 1 (or nothing) -- never i. data1_1 is
           # therefore always taken from row 1 of pred_expr.
           index3=which(rownames(pred_expr)[i] %in%  rownames(actual_expr)[z])
           # Row(s) of actual_expr whose name equals gene i (same value as z).
           index4=which(rownames(actual_expr) %in% rownames(pred_expr)[i])
           # NOTE(review): `<<-` assigns outside the current scope; at top
           # level a plain `<-` has the same effect and is less surprising.
           data1_1<<- pred_expr[index3,]
           data2_1<<- actual_expr[index4,]
           # NOTE(review): this is the line that raises the warning.
           #  * data1_1 / data2_1 are already row subsets (a single row when
           #    the gene name is unique), so indexing them again with [i,]
           #    selects a non-existent row for i > 1.
           #  * A row subset of a data frame is still a data frame, and cor()
           #    on two data frames returns a correlation matrix rather than
           #    one number, which cannot be stored in the single slot
           #    cor_vec[i].
           # Passing plain numeric vectors, e.g. unlist(data1_1[1, ]), makes
           # cor() return a single coefficient.
           cor_vec[i] = cor(data1_1[i,],data2_1[i,])
           }
       }

错误是:

Warning messages:
1: In cor_vec[i] <- cor(data1_1[i, ], data2_1[i, ]) :
  number of items to replace is not a multiple of replacement length
2: In cor_vec[i] <- cor(data1_1[i, ], data2_1[i, ]) :
  number of items to replace is not a multiple of replacement length

cor_vec的值为NA NA NA NA NA。数据如下:

dput(data1_1[1, c(1, 5)])
structure(list(`GTEX-1117F` = -1.1059008, `GTEX-1192X` = -1.9839559), row.names = "ENSG00000278558.4", class = "data.frame")
> dput(data2_1[1, c(1, 5)])
structure(list(`GTEX-1117F` = 0.0294928231667479, `GTEX-1192X` = -0.294937186047513), row.names = "ENSG00000278558.4", class = "data.frame")
dput(pred_expr[1:2,c(1:5)])
structure(list(`GTEX-1117F` = c(-1.1059008, -0.7702356), `GTEX-111FC` = c(-1.9839559, 
-0.3895127), `GTEX-1128S` = c(-1.9839559, -0.3895127), `GTEX-117XS` = c(-0.9919779, 
-0.1947563), `GTEX-1192X` = c(-1.9839559, -0.3895127)), row.names = c("ENSG00000278558.4", 
"ENSG00000275793.1"), class = "data.frame")
dput(actual_expr[1:2,c(1:5)])
structure(list(`GTEX-1117F` = c(0.0205417435855037, -0.257328654412256
), `GTEX-111FC` = c(-0.196491626008295, -0.564355420678903), 
    `GTEX-1128S` = c(-0.259273634395018, -0.53496548376192), 
    `GTEX-117XS` = c(-0.1764185747562, -0.765050266632035), `GTEX-1192X` = c(-0.185980126179191, 
    -0.479964998120876)), row.names = c("ENSG00000227232.5", 
"ENSG00000268903.1"), class = "data.frame")

有人知道为什么我会得到这个错误吗。谢谢。

bjg7j2ky

bjg7j2ky1#

在循环内部,使用unlist把单行子集转换为vector,因为对data.frame取单行子集后返回的仍然是data.frame。

> str(data1_1)
'data.frame':   1 obs. of  2 variables:
 $ GTEX-1117F: num -1.11
 $ GTEX-1192X: num -1.98
> str(data1_1[1,])
'data.frame':   1 obs. of  2 variables:
 $ GTEX-1117F: num -1.11
 $ GTEX-1192X: num -1.98
> str(unlist(data1_1[1,]))
 Named num [1:2] -1.11 -1.98
 - attr(*, "names")= chr [1:2] "GTEX-1117F" "GTEX-1192X"

所以我们用

# unlist() flattens each one-row data frame into a named numeric vector, so
# cor() receives two vectors and returns a single coefficient.
cor(unlist(data1_1[1,]), unlist(data2_1[1,]))

循环内

相关问题