# Working directory for this analysis. The relative output path passed to
# write.table() in the AUC section resolves against it.
setwd('D:\\生信\\HIV-Cancer\\Whole blood_model_new\\KEGG_all')
library(randomForest)  # randomForest()
library(pROC)          # roc(), auc()
library(caret)         # confusionMatrix()
library(party)         # ctree(), treeresponse()
library(tidyverse)     # %>%, rownames_to_column(), column_to_rownames()
library(glmnet)        # glmnet(), cv.glmnet()
### Modelling data preparation (single run: ART gene list, BLCA) --------------
# Intersect the HIV gene table with the cancer KEGG gene table on column 'X'.
HIV_file <- 'D:/生信/HIV-Cancer/Whole blood_model_new/Whole_blood_ART_array_only.txt'
HIV <- read.delim(HIV_file, sep = '\t')
cancer_file <- 'D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all/ART_mic/BLCA/KEGG_gene.txt'
cancer <- read.delim(cancer_file, sep = '\t')
# Rename the first column so both tables share the merge key 'X'.
colnames(cancer)[1] <- 'X'
gene <- merge(HIV, cancer, by = 'X')
gene <- data.frame(X = gene$X)
gene_path <- 'D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all/ART_mic/BLCA/BLCA_gene.txt'
# Save the shared gene list.
write.table(gene, gene_path, sep = '\t', quote = FALSE, row.names = FALSE)

# Extract paired samples.
tpm_path <- 'D:/生信/HIV-Cancer/TCGA_count_new/raw/50_logtpm_insec/BLCA_50_TPM.txt'
exp.counts <- read.delim(tpm_path, sep = '\t', row.names = 1)
# Keep only the sample columns whose name does not end in ".1".
# drop = FALSE keeps a data frame even when a single column survives;
# without it the result collapses to a vector and the steps below fail.
exp.counts <- exp.counts[, !grepl("\\.1$", colnames(exp.counts)), drop = FALSE]
# The first 12 characters of a column name are used as the patient key
# (presumably the TCGA patient barcode -- confirm against the input file).
samples <- substr(colnames(exp.counts), 1, 12)
sample_counts <- table(samples)
# Keep patients that contribute at least two sample columns.
patients <- names(sample_counts)[sample_counts >= 2]
exp.counts <- exp.counts[, samples %in% patients, drop = FALSE]

# Restrict the expression table to the modelling genes and put samples in rows.
exp.counts <- rownames_to_column(exp.counts, 'X')
model_data <- merge(gene, exp.counts, by = 'X')
model_data <- column_to_rownames(model_data, 'X')
model_data <- data.frame(t(model_data))
# Label from character 14 of the sample name: "0" -> "T", anything else -> "N".
model_data$group <- ifelse(substr(rownames(model_data), 14, 14) == "0", "T", "N")

# Train/test split.
set.seed(123)  # fixed seed so the split is reproducible
# Each row is assigned independently with probability 0.7 / 0.3.
# NOTE(review): the split is per sample, so two samples of one patient can land
# in different sets -- confirm this is intended.
ind <- sample(2, nrow(model_data), replace = TRUE, prob = c(0.7, 0.3))
train <- model_data[ind == 1, ]  # training set
test <- model_data[ind == 2, ]   # test set

# NOTE(review): later sections read BLCA_train_data.txt / BLCA_test_data.txt
# from this directory, while this run writes train_data.txt / test_data.txt.
train_path <- 'D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all/ART_mic/BLCA/train_data.txt'
test_path <- 'D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all/ART_mic/BLCA/test_data.txt'
write.table(train, train_path, sep = '\t', quote = FALSE, col.names = NA)
write.table(test, test_path, sep = '\t', quote = FALSE, col.names = NA)

# Batch run: repeat the data preparation for every cancer type, using the
# nonART HIV gene table and writing under nonART_mic_seq/<cancer>/.
cancers <- c("BLCA", "BRCA", "COAD", "ESCA","HNSC","KICH","KIRC","KIRP",
             "LIHC","LUAD","LUSC","PRAD","READ","STAD","THCA","UCEC") 

# The HIV gene table does not depend on the cancer type, so it is read once
# instead of once per iteration.
HIV_file <- 'D:/生信/HIV-Cancer/Whole blood_model_new/Whole_blood_nonART_array_seq_union.txt'
HIV <- read.delim(HIV_file, sep = '\t')

for (cancer_name in cancers) {
  # Intersect the HIV gene table with this cancer's KEGG gene table.
  # NOTE(review): the KEGG gene list is read from ART_mic/ while every output
  # of this loop goes to nonART_mic_seq/ -- confirm this is intended.
  cancer_file <- paste0('D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all/ART_mic/', cancer_name, '/KEGG_gene.txt')
  cancer <- read.delim(cancer_file, sep = '\t')
  colnames(cancer)[1] <- 'X'  # shared merge key
  gene <- merge(HIV, cancer, by = 'X')
  gene <- data.frame(X = gene$X)
  gene_path <- paste0('D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all/nonART_mic_seq/', cancer_name, '/', cancer_name, '_gene.txt')
  # Save the shared gene list.
  write.table(gene, gene_path, sep = '\t', quote = FALSE, row.names = FALSE)

  # Extract paired samples.
  tpm_path <- paste0('D:/生信/HIV-Cancer/TCGA_count_new/raw/50_logtpm_insec/', cancer_name, '_50_TPM.txt')
  exp.counts <- read.delim(tpm_path, sep = '\t', row.names = 1)
  # Drop columns whose name ends in ".1". drop = FALSE keeps a data frame even
  # if only one column is left (otherwise the result collapses to a vector).
  exp.counts <- exp.counts[, !grepl("\\.1$", colnames(exp.counts)), drop = FALSE]
  # First 12 characters of the column name act as the patient key.
  samples <- substr(colnames(exp.counts), 1, 12)
  sample_counts <- table(samples)
  # Keep patients that contribute at least two sample columns.
  patients <- names(sample_counts)[sample_counts >= 2]
  exp.counts <- exp.counts[, samples %in% patients, drop = FALSE]

  # Restrict to the modelling genes and put samples in rows.
  exp.counts <- rownames_to_column(exp.counts, 'X')
  model_data <- merge(gene, exp.counts, by = 'X')
  model_data <- column_to_rownames(model_data, 'X')
  model_data <- data.frame(t(model_data))
  # Label from character 14 of the sample name: "0" -> "T", otherwise "N".
  model_data$group <- ifelse(substr(rownames(model_data), 14, 14) == "0", "T", "N")

  # Train/test split; the seed is reset per cancer so each split is
  # reproducible on its own.
  set.seed(123)
  ind <- sample(2, nrow(model_data), replace = TRUE, prob = c(0.7, 0.3))
  train <- model_data[ind == 1, ]  # training set
  test <- model_data[ind == 2, ]   # test set

  # Save the training and test sets.
  train_path <- paste0('D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all/nonART_mic_seq/', cancer_name, '/', cancer_name, '_train_data.txt')
  test_path <- paste0('D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all/nonART_mic_seq/', cancer_name, '/', cancer_name, '_test_data.txt')
  write.table(train, train_path, sep = '\t', quote = FALSE, col.names = NA)
  write.table(test, test_path, sep = '\t', quote = FALSE, col.names = NA)
}




### Lasso feature selection (single run: ART gene list, BLCA) -----------------
# NOTE(review): these inputs are named BLCA_train_data.txt / BLCA_test_data.txt,
# whereas the single-run preparation section writes train_data.txt /
# test_data.txt into the same directory -- confirm which files are meant.
train_path <- 'D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all/ART_mic/BLCA/BLCA_train_data.txt'
test_path <- 'D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all/ART_mic/BLCA/BLCA_test_data.txt'
train <- read.delim(train_path, sep = '\t', row.names = 1)
test <- read.delim(test_path, sep = '\t', row.names = 1)

# glmnet needs matrix input: every column except the last is a predictor,
# the last column is used as the response.
x <- as.matrix(train[, -ncol(train)])
y <- as.matrix(train[, ncol(train)])
fit <- glmnet(x, y, family = "binomial", alpha = 1)
plot(fit, xvar = "lambda", label = TRUE)

# 10-fold cross-validation on misclassification error. Fold assignment is
# random, so the seed is fixed to make the selected gene set reproducible.
set.seed(123)
fitCV <- cv.glmnet(x, y,
                   family = "binomial",
                   type.measure = "class",
                   nfolds = 10, alpha = 1)
plot(fitCV)

# lambda.min is the lambda with the best CV error; lambda.1se would give a
# sparser model within one standard error of it.
coef.min <- coef(fitCV, s = "lambda.min")

# Keep the coefficients that are non-zero. The single coefficient column is
# addressed by position because its name is not guaranteed to be "s1".
gene <- as.data.frame(as.matrix(coef.min))
gene <- gene[gene[[1]] != 0, , drop = FALSE]
# Drop the intercept by name rather than by position, so a gene is never
# discarded when the intercept itself is not among the retained rows.
lasso_genes <- setdiff(rownames(gene), "(Intercept)")
# drop = FALSE keeps a data frame (with row names) even if no gene survives.
train_need <- train[, c(lasso_genes, "group"), drop = FALSE]
test_need <- test[, c(lasso_genes, "group"), drop = FALSE]

lasso_gene_path <- 'D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all_lasso/ART_mic/BLCA/lasso_gene.txt'
train_need_path <- 'D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all_lasso/ART_mic/BLCA/train_data.txt'
test_need_path <- 'D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all_lasso/ART_mic/BLCA/test_data.txt'
write.table(gene, lasso_gene_path, sep = '\t', quote = FALSE, col.names = NA)
write.table(train_need, train_need_path, sep = '\t', quote = FALSE, col.names = NA)
write.table(test_need, test_need_path, sep = '\t', quote = FALSE, col.names = NA)

# Batch run: Lasso feature selection for every cancer type (nonART_mic_seq).
cancer_types <- c("BLCA", "BRCA", "COAD", "ESCA","HNSC","KICH","KIRC","KIRP",
             "LIHC","LUAD","LUSC","PRAD","READ","STAD","THCA","UCEC") 
base_path <- 'D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all/nonART_mic_seq/'  
lasso_gene_base_path <- 'D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all_lasso/nonART_mic_seq/'  

for (cancer_type in cancer_types) {
  # Build the input and output paths for this cancer type.
  train_path <- file.path(base_path, cancer_type, paste0(cancer_type, "_train_data.txt"))
  test_path <- file.path(base_path, cancer_type, paste0(cancer_type, "_test_data.txt"))
  lasso_gene_path <- file.path(lasso_gene_base_path, cancer_type, paste0(cancer_type, "_lasso_gene.txt"))
  train_need_path <- file.path(lasso_gene_base_path, cancer_type, paste0(cancer_type, "_train_data.txt"))
  test_need_path <- file.path(lasso_gene_base_path, cancer_type, paste0(cancer_type, "_test_data.txt"))

  # Read the train/test tables (row names in the first column).
  train <- read.delim(train_path, sep = '\t', row.names = 1)
  test <- read.delim(test_path, sep = '\t', row.names = 1)

  # Predictors are every column except the last; the last column is the
  # response.
  x <- as.matrix(train[, -ncol(train)])
  y <- as.matrix(train[, ncol(train)])

  # 10-fold cross-validated Lasso on misclassification error. The seed is
  # reset per cancer type because fold assignment is random; without it the
  # selected genes change from run to run. The separate full-path glmnet()
  # fit was never used in this loop and has been removed.
  set.seed(123)
  fitCV <- cv.glmnet(x, y,
                     family = "binomial",
                     type.measure = "class",
                     nfolds = 10, alpha = 1)

  # Coefficients at the lambda with the best CV error.
  coef.min <- coef(fitCV, s = "lambda.min")

  # Keep non-zero coefficients; the single column is addressed by position
  # because its name is not guaranteed to be "s1".
  gene <- as.data.frame(as.matrix(coef.min))
  gene <- gene[gene[[1]] != 0, , drop = FALSE]
  # Remove the intercept by name so no gene is lost if the intercept is not
  # among the retained rows.
  lasso_genes <- setdiff(rownames(gene), "(Intercept)")

  # Subset to the selected genes plus the label; drop = FALSE keeps a data
  # frame even if no gene was selected.
  train_need <- train[, c(lasso_genes, "group"), drop = FALSE]
  test_need <- test[, c(lasso_genes, "group"), drop = FALSE]

  # Write the coefficient table and the reduced train/test tables.
  write.table(gene, lasso_gene_path, sep = '\t', quote = FALSE, col.names = NA)
  write.table(train_need, train_need_path, sep = '\t', quote = FALSE, col.names = NA)
  write.table(test_need, test_need_path, sep = '\t', quote = FALSE, col.names = NA)

  # Progress message.
  cat(paste("Processing complete for cancer type:", cancer_type, "\n"))
}








###### Modelling (single run: BLCA) ######
# Load the train/test tables; the first column holds the row names and the
# `group` column is the class label.
train_path <- "D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all/ART_mic/BLCA/BLCA_train_data.txt"
test_path <- "D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all/ART_mic/BLCA/BLCA_test_data.txt"
train <- read.delim(train_path, sep = "\t", row.names = 1)
test <- read.delim(test_path, sep = "\t", row.names = 1)
train[["group"]] <- as.factor(train[["group"]])
test[["group"]] <- as.factor(test[["group"]])

### Random forest --------------------------------------------------------------
# Flatten a pROC roc object into a specificity/sensitivity/threshold table.
roc_as_table <- function(roc_obj) {
  data.frame(
    specificities = roc_obj$specificities,
    sensitivities = roc_obj$sensitivities,
    thresholds = roc_obj$thresholds
  )
}

set.seed(123)
rf <- randomForest(group ~ ., data = train, ntree = 500,
                   importance = TRUE, proximity = TRUE,
                   type = "classification")

# Predict on the test set: class probabilities and class labels.
set.seed(123)
prob_rf <- predict(rf, newdata = test, type = "prob")
predict_rf <- predict(rf, newdata = test)

# Confusion matrices with "T" as the positive class. The training matrix
# uses the predictions stored in the fitted forest (rf$predicted).
train_confu <- confusionMatrix(rf$predicted, train$group, positive = "T")
test_confu <- confusionMatrix(predict_rf, test$group, positive = "T")

# ROC curves: observed labels first, then the score taken from column 2 of
# the vote / probability matrix.
roc1 <- roc(train$group, rf$votes[, 2])
roc2 <- roc(test$group, prob_rf[, 2])
rf_ROC_train <- roc_as_table(roc1)
rf_ROC_test <- roc_as_table(roc2)

# Save the model (stored under the object name `rf`), the printed confusion
# matrices and the ROC tables.
save(rf, file = "D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all/ART_mic/BLCA/rf_model.Rdata")
train_confu_path <- "D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all/ART_mic/BLCA/rf_conf_matrix_train.txt"
test_confu_path <- "D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all/ART_mic/BLCA/rf_conf_matrix_test.txt"
writeLines(capture.output(print(train_confu)), con = train_confu_path)
writeLines(capture.output(print(test_confu)), con = test_confu_path)
rf_ROC_train_path <- "D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all/ART_mic/BLCA/rf_ROC_train.txt"
rf_ROC_test_path <- "D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all/ART_mic/BLCA/rf_ROC_test.txt"
write.table(rf_ROC_train, rf_ROC_train_path, sep = "\t", quote = FALSE, row.names = FALSE)
write.table(rf_ROC_test, rf_ROC_test_path, sep = "\t", quote = FALSE, row.names = FALSE)


### Conditional inference tree (party) -----------------------------------------
# Uses the `train` / `test` data frames already loaded by the modelling section.

# Flatten a pROC roc object into a specificity/sensitivity/threshold table.
roc_as_table <- function(roc_obj) {
  data.frame(
    specificities = roc_obj$specificities,
    sensitivities = roc_obj$sensitivities,
    thresholds = roc_obj$thresholds
  )
}

set.seed(123)
model <- ctree(group ~ ., data = train)

# Training set: per-sample response from treeresponse(), reshaped so that
# samples are rows, plus predicted classes and the confusion matrix.
train_prob_p <- t(as.data.frame(treeresponse(model, newdata = train)))
train_pred_p <- predict(model, newdata = train, type = "response")
train_confu_p <- confusionMatrix(train_pred_p, train$group, positive = "T")

# Test set: same three steps.
test_prob_p <- t(as.data.frame(treeresponse(model, newdata = test)))
test_pred_p <- predict(model, newdata = test, type = "response")
test_confu_p <- confusionMatrix(test_pred_p, test$group, positive = "T")

# ROC curves: observed labels first, then column 2 of the response matrix.
roc1_p <- roc(train$group, train_prob_p[, 2])
roc2_p <- roc(test$group, test_prob_p[, 2])
p_ROC_train <- roc_as_table(roc1_p)
p_ROC_test <- roc_as_table(roc2_p)

# Save the model (stored under the object name `model`), the printed
# confusion matrices and the ROC tables.
save(model, file = "D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all/ART_mic/BLCA/party_model.Rdata")
p_train_confu_path <- "D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all/ART_mic/BLCA/party_conf_matrix_train.txt"
p_test_confu_path <- "D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all/ART_mic/BLCA/party_conf_matrix_test.txt"
writeLines(capture.output(print(train_confu_p)), con = p_train_confu_path)
writeLines(capture.output(print(test_confu_p)), con = p_test_confu_path)

party_ROC_train_path <- "D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all/ART_mic/BLCA/party_ROC_train.txt"
party_ROC_test_path <- "D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all/ART_mic/BLCA/party_ROC_test.txt"
write.table(p_ROC_train, party_ROC_train_path, sep = "\t", quote = FALSE, row.names = FALSE)
write.table(p_ROC_test, party_ROC_test_path, sep = "\t", quote = FALSE, row.names = FALSE)



### Modelling loop: random forest + conditional inference tree per cancer ------
base_path <- 'D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all_lasso/nonART_mic_seq'  
cancer_names <- c("BLCA", "BRCA", "COAD", "ESCA","HNSC","KICH","KIRC","KIRP",
             "LIHC","LUAD","LUSC","PRAD","READ","STAD","THCA","UCEC") 

# Flatten a pROC roc object into a specificity/sensitivity/threshold table.
roc_as_table <- function(roc_obj) {
  data.frame(
    specificities = roc_obj$specificities,
    sensitivities = roc_obj$sensitivities,
    thresholds = roc_obj$thresholds
  )
}

# Write the printed form of a caret confusion matrix to a text file.
write_confusion <- function(confusion, path) {
  writeLines(capture.output(print(confusion)), path)
}

for (cancer_name in cancer_names) {
  out_dir <- file.path(base_path, cancer_name)

  # Read this cancer's train/test tables and turn the label into a factor.
  train_path <- file.path(out_dir, paste0(cancer_name, "_train_data.txt"))
  test_path <- file.path(out_dir, paste0(cancer_name, "_test_data.txt"))
  train <- read.delim(train_path, sep = "\t", row.names = 1)
  test <- read.delim(test_path, sep = "\t", row.names = 1)
  train$group <- as.factor(train$group)
  test$group <- as.factor(test$group)

  # One seed per cancer type, set before the first model is fitted.
  set.seed(123)

  ### Random forest
  rf <- randomForest(group ~ ., data = train, ntree = 500,
                     importance = TRUE, proximity = TRUE,
                     type = "classification")
  # Test-set class probabilities and class labels.
  prob_rf <- predict(rf, newdata = test, type = "prob")
  predict_rf <- predict(rf, newdata = test)
  # Confusion matrices ("T" is the positive class); the training matrix uses
  # the predictions stored in the fitted forest.
  rf_train_confu <- confusionMatrix(rf$predicted, train$group, positive = "T")
  rf_test_confu <- confusionMatrix(predict_rf, test$group, positive = "T")
  # ROC curves: observed labels first, score from column 2.
  rf_roc1 <- roc(train$group, rf$votes[, 2])
  rf_roc2 <- roc(test$group, prob_rf[, 2])
  rf_ROC_train <- roc_as_table(rf_roc1)
  rf_ROC_test <- roc_as_table(rf_roc2)

  # Save: confusion matrices, then the model (object name `rf`), then ROC.
  rf_train_confu_path <- file.path(out_dir, "rf_conf_matrix_train.txt")
  write_confusion(rf_train_confu, rf_train_confu_path)
  rf_test_confu_path <- file.path(out_dir, "rf_conf_matrix_test.txt")
  write_confusion(rf_test_confu, rf_test_confu_path)
  rf_model_filename <- file.path(out_dir, "rf_model.Rdata")
  save(rf, file = rf_model_filename)
  rf_ROC_train_path <- file.path(out_dir, "rf_ROC_train.txt")
  write.table(rf_ROC_train, rf_ROC_train_path, sep = "\t", quote = FALSE, row.names = FALSE)
  rf_ROC_test_path <- file.path(out_dir, "rf_ROC_test.txt")
  write.table(rf_ROC_test, rf_ROC_test_path, sep = "\t", quote = FALSE, row.names = FALSE)

  ### Conditional inference tree (party)
  model <- ctree(group ~ ., data = train)
  # Training set: per-sample response reshaped to one row per sample,
  # predicted classes, confusion matrix.
  train_prob_p <- t(as.data.frame(treeresponse(model, newdata = train)))
  train_pred_p <- predict(model, newdata = train, type = "response")
  train_confu_p <- confusionMatrix(train_pred_p, train$group, positive = "T")
  # Test set: same three steps.
  test_prob_p <- t(as.data.frame(treeresponse(model, newdata = test)))
  test_pred_p <- predict(model, newdata = test, type = "response")
  test_confu_p <- confusionMatrix(test_pred_p, test$group, positive = "T")
  # ROC curves: observed labels first, score from column 2.
  roc1_p <- roc(train$group, train_prob_p[, 2])
  roc2_p <- roc(test$group, test_prob_p[, 2])
  p_ROC_train <- roc_as_table(roc1_p)
  p_ROC_test <- roc_as_table(roc2_p)

  # Save: confusion matrices, then the model (object name `model`), then ROC.
  p_train_confu_path <- file.path(out_dir, "party_conf_matrix_train.txt")
  write_confusion(train_confu_p, p_train_confu_path)
  p_test_confu_path <- file.path(out_dir, "party_conf_matrix_test.txt")
  write_confusion(test_confu_p, p_test_confu_path)
  p_model_filename <- file.path(out_dir, "party_model.Rdata")
  save(model, file = p_model_filename)
  p_ROC_train_path <- file.path(out_dir, "party_ROC_train.txt")
  write.table(p_ROC_train, p_ROC_train_path, sep = "\t", quote = FALSE, row.names = FALSE)
  p_ROC_test_path <- file.path(out_dir, "party_ROC_test.txt")
  write.table(p_ROC_test, p_ROC_test_path, sep = "\t", quote = FALSE, row.names = FALSE)

  # Progress message.
  cat(paste("Finished processing cancer:", cancer_name, "\n"))
}


### AUC summary: reload the saved models and tabulate train/test AUC ----------
base_path <- 'D:/生信/HIV-Cancer/Whole blood_model_new/KEGG_all_lasso/ART_mic'  
cancer_names <- c("BLCA", "BRCA", "COAD", "ESCA","HNSC","KICH","KIRC","KIRP",
                  "LIHC","LUAD","LUSC","PRAD","READ","STAD","THCA","UCEC") 

# Empty template that fixes the column names and types of the result table.
auc_results <- data.frame(
  cancer = character(),
  model = character(),
  dataset = character(),
  AUC = numeric(),
  stringsAsFactors = FALSE  # keep the text columns as character
)

# Load one named object from an .Rdata file. Loading into a private
# environment and checking the name prevents a stale object from a previous
# iteration being reused silently when the file does not contain it.
load_saved_object <- function(path, name) {
  env <- new.env()
  load(path, envir = env)
  if (!exists(name, envir = env, inherits = FALSE)) {
    stop("Object '", name, "' not found in ", path, call. = FALSE)
  }
  get(name, envir = env, inherits = FALSE)
}

# Rows are collected per cancer type and bound once after the loop instead
# of growing the data frame with rbind() on every iteration.
auc_rows <- vector("list", length(cancer_names))
names(auc_rows) <- cancer_names

for (cancer_name in cancer_names) {
  # Read this cancer's train/test tables and turn the label into a factor.
  train_path <- file.path(base_path, cancer_name, paste0(cancer_name, "_train_data.txt"))
  test_path <- file.path(base_path, cancer_name, paste0(cancer_name, "_test_data.txt"))
  train <- read.delim(train_path, sep = '\t', row.names = 1)
  test <- read.delim(test_path, sep = '\t', row.names = 1)
  train$group <- as.factor(train$group)
  test$group <- as.factor(test$group)

  # Random forest: the file is expected to hold an object named `rf`.
  # Train AUC uses the votes stored in the forest, test AUC uses predicted
  # class probabilities; the score is column 2 in both cases.
  rf_path <- file.path(base_path, cancer_name, "rf_model.Rdata")
  rf <- load_saved_object(rf_path, "rf")
  prob_rf <- predict(rf, newdata = test, type = "prob")
  rf_train_roc_obj <- roc(train$group, rf$votes[, 2])
  rf_train_auc_value <- round(as.numeric(auc(rf_train_roc_obj)), 4)
  rf_test_roc_obj <- roc(test$group, prob_rf[, 2])
  rf_test_auc_value <- round(as.numeric(auc(rf_test_roc_obj)), 4)

  # party tree: the file is expected to hold an object named `model`.
  # Only the treeresponse() output is needed for the AUC; the unused class
  # predictions were removed.
  party_path <- file.path(base_path, cancer_name, "party_model.Rdata")
  model <- load_saved_object(party_path, "model")
  train_prob_p <- treeresponse(model, newdata = train) %>% as.data.frame() %>% t()
  test_prob_p <- treeresponse(model, newdata = test) %>% as.data.frame() %>% t()
  p_train_roc_obj <- roc(train$group, train_prob_p[, 2])
  p_train_auc_value <- round(as.numeric(auc(p_train_roc_obj)), 4)
  p_test_roc_obj <- roc(test$group, test_prob_p[, 2])
  p_test_auc_value <- round(as.numeric(auc(p_test_roc_obj)), 4)

  # Four rows per cancer type, in the order RF train, RF test, party train,
  # party test.
  auc_rows[[cancer_name]] <- data.frame(
    cancer = cancer_name,
    model = c("RF", "RF", "party", "party"),
    dataset = c("train", "test", "train", "test"),
    AUC = c(rf_train_auc_value, rf_test_auc_value,
            p_train_auc_value, p_test_auc_value),
    stringsAsFactors = FALSE
  )
}

# Bind everything onto the template in one step.
auc_results <- do.call(rbind, c(list(auc_results), unname(auc_rows)))
# Relative path: the file is written into the current working directory.
write.table(auc_results, "all_lasso-ART_mic-AUC.txt", sep = '\t', quote = FALSE, row.names = FALSE)
