# ---- Setup: packages, working directory and dataset configuration ----
# NOTE: this clears the global environment and changes the working directory;
# every relative path below is resolved against 'H:/project'.
remove(list = ls())
library(sva)
setwd('H:/project')
tiss_type <- 'PBMC'
type <- 'HIV_ART'
GEO_id2 <- 'GSE10924' # GEO series ID of the HIV samples
GEO_id1 <- 'GSE46480' # GEO series ID of the HIV-free samples
GEO_id3 <- 'GSE2171' # GEO series ID of the relay (reference) samples
platform_id2 <- 'GPL201' # platform ID of the HIV samples
platform_id1 <- 'GPL570' # platform ID of the HIV-free samples
platform_id3 <- 'GPL201' # platform ID of the relay samples

# ---- Read the expression matrices ----
# The commented-out calls are alternative, platform-specific file names kept
# for reference.
# matrix_free=read.table(file = paste0('results/results_PBMC_whole_blood/',tiss_type,'/',GEO_id1,'/',GEO_id1,'_HIVfree_',platform_id1,'_matrix.txt'),sep = '\t',header = T)
matrix_free <- read.table(
  file = file.path(
    'results/results_PBMC_whole_blood', tiss_type, GEO_id1,
    paste0(GEO_id1, '_HIV_free_matrix.txt')
  ),
  sep = '\t',
  header = TRUE
)

# matrix_HIV=read.table(file = paste0('results/results_PBMC_whole_blood/',tiss_type,'/',GEO_id2,'/',GEO_id2,'_',type,'_',platform_id2,'_matrix.txt'),sep = '\t',header = T)
matrix_HIV <- read.table(
  file = file.path(
    'results/results_PBMC_whole_blood', tiss_type, GEO_id2,
    paste0(GEO_id2, '_', type, '_matrix.txt')
  ),
  sep = '\t',
  header = TRUE
)
matrix_reference <- read.table(
  file = file.path(
    'results/results_PBMC_whole_blood', tiss_type, GEO_id3,
    paste0(GEO_id3, '_expression_matrix_all_group.txt')
  ),
  sep = '\t',
  header = TRUE
)
# matrix_reference=read.table(file = paste0('results/results_PBMC_whole_blood/',tiss_type,'/',GEO_id3,'/',GEO_id3,'_',platform_id3,'_expression_matrix_all_group_using_lumi_newpipeline_3_15.txt'),sep = '\t',header = T)

# Drop the annotation columns so that only sample columns remain.
# drop = FALSE keeps a data frame even if a single sample column is left.
# NOTE(review): the row names are used as gene keys further down; presumably
# read.table() takes them from the first column of each file - confirm.
annotation_cols <- c('gene_symbol', 'ENSEMBL_id')
matrix_HIV <- matrix_HIV[, !colnames(matrix_HIV) %in% annotation_cols, drop = FALSE]
matrix_free <- matrix_free[, !colnames(matrix_free) %in% annotation_cols, drop = FALSE]
matrix_reference <- matrix_reference[, !colnames(matrix_reference) %in% annotation_cols, drop = FALSE]

# If there are more HIV-free samples than HIV samples, randomly keep only as
# many HIV-free samples as there are HIV samples (fixed seed for
# reproducibility). The sample size is capped at the number of available
# HIV-free samples: sample() raises an error when asked for more elements than
# the population holds, so with fewer free than HIV samples all free samples
# are simply kept.
set.seed(123)
n_free_keep <- min(ncol(matrix_free), ncol(matrix_HIV))
free_selected <- sample(colnames(matrix_free), n_free_keep)
# drop = FALSE keeps a data frame even when a single column is selected.
matrix_free <- matrix_free[, colnames(matrix_free) %in% free_selected, drop = FALSE]
#--------------

# Batch effects are removed separately for the HIV-free and the HIV samples.
# First build a combined data set holding the genes present in both the HIV
# and the HIV-free matrix; it is used further down.
# The 'symbol' helper column is deliberately left on matrix_free / matrix_HIV:
# the batch vector built for the first PCA discounts it with ncol(...) - 1.
matrix_free$symbol <- rownames(matrix_free)
matrix_HIV$symbol <- rownames(matrix_HIV)
library(dplyr)
# keep = TRUE retains both key columns (symbol.x / symbol.y) in the result.
combined_matrix <- inner_join(matrix_HIV, matrix_free, by = 'symbol', keep = TRUE)
rownames(combined_matrix) <- combined_matrix$symbol.y
combined_matrix$symbol.y <- NULL
combined_matrix$symbol.x <- NULL
# Keep only the genes present in both the reference data set and the data set
# to be processed. drop = FALSE keeps a data frame even if a matrix has a
# single sample column.
rowname_same <- intersect(rownames(combined_matrix), rownames(matrix_reference))
combined_matrix <- combined_matrix[rownames(combined_matrix) %in% rowname_same, , drop = FALSE]
matrix_reference <- matrix_reference[rownames(matrix_reference) %in% rowname_same, , drop = FALSE]
# combined_matrix=combined_matrix[rownames(matrix_reference),]
# PCA of the samples to be corrected, before batch correction.
# combined_matrix holds the HIV columns first and the HIV-free columns second;
# the "- 1" discounts the 'symbol' helper column that was added to matrix_HIV
# and matrix_free when the combined matrix was built.
batch0 <- c(rep(1, ncol(matrix_HIV) - 1), rep(0, ncol(matrix_free) - 1))
# Guard against a silently recycled batch vector.
stopifnot(length(batch0) == ncol(combined_matrix))
batch1 <- ifelse(batch0 == 1, paste0(type, '_', GEO_id2), paste0('free_', GEO_id1))
# Sample id -> batch table of the data set to be processed (reused for the
# PCA after correction).
id0 <- data.frame(id = colnames(combined_matrix), batch = batch0)
pca1 <- prcomp(t(combined_matrix), center = TRUE, scale. = TRUE)
score1 <- as.data.frame(pca1$x) # PC scores, one row per sample
score1$batch1 <- batch1
summ1 <- summary(pca1)
# Row 2 of summ1$importance is used as the percentage shown in the axis labels.
xlab1 <- paste0("PC1(", round(summ1$importance[2, 1] * 100, 2), "%)")
ylab1 <- paste0("PC2(", round(summ1$importance[2, 2] * 100, 2), "%)")
library(ggplot2)
# The points are mapped to the colour aesthetic, so the manual palette has to
# be a colour scale; a fill scale has no effect on this plot.
p.pca1 <- ggplot(data = score1, aes(x = PC1, y = PC2, color = batch1)) +
  geom_point(size = 2.5) +
  labs(x = xlab1, y = ylab1, title = "PCA Scores Plot") +
  stat_ellipse(level = 0.99) +
  scale_color_manual(values = c("purple", "orange"))
pdf(paste0('results/', GEO_id1, '_', GEO_id2, '_', type, '_before_combine.pdf'))
# Explicit print(): a ggplot object is only drawn when printed, and
# auto-printing does not happen when the script is run through source().
print(p.pca1)
dev.off()
#-----------------
# Build the sample id -> batch tables. The samples originally come from three
# data sets, but the two data sets to be corrected were merged into one data
# frame, so they count as a single batch (1) and the reference data set as
# another batch (2).
id_combined <- data.frame(id = colnames(combined_matrix), batch = 1)
id_reference <- data.frame(id = colnames(matrix_reference), batch = 2)
id_all <- rbind(id_combined, id_reference)

# Column-name pattern that identifies the HIV group selected through `type`.
hiv_pattern <- if (type == 'HIV_nonART') 'nonART' else '_ART'

# Split the reference data set into its HIV-free and HIV samples.
# drop = FALSE keeps a data frame when only one column matches; without it the
# result collapses to a plain vector and ncol() returns NULL further down.
mat_ref_free <- matrix_reference[, grep('free', colnames(matrix_reference)), drop = FALSE]
mat_ref_HIV <- matrix_reference[, grep(hiv_pattern, colnames(matrix_reference)), drop = FALSE]

# Split the data set to be processed in the same way.
mat_deal_free <- combined_matrix[, grep('free', colnames(combined_matrix)), drop = FALSE]
mat_deal_HIV <- combined_matrix[, grep(hiv_pattern, colnames(combined_matrix)), drop = FALSE]

# Fail early with a readable message when a group has no samples; the chunked
# correction below divides by the number of reference samples per group.
group_sizes <- c(
  mat_ref_free = ncol(mat_ref_free),
  mat_ref_HIV = ncol(mat_ref_HIV),
  mat_deal_free = ncol(mat_deal_free),
  mat_deal_HIV = ncol(mat_deal_HIV)
)
if (any(group_sizes == 0)) {
  stop(
    'No sample columns matched for: ',
    paste(names(group_sizes)[group_sizes == 0], collapse = ', '),
    call. = FALSE
  )
}

# ---- Chunked ComBat correction against the reference data set ----
# The samples to be corrected are fed into the matching reference group
# (HIV-free or HIV) in small chunks and ComBat is run after every chunk.

#' Feed samples into a reference matrix chunk by chunk and run ComBat each time
#'
#' The chunk size equals the number of reference samples. In every round the
#' next chunk of `mat_deal` is merged (by row name) with the accumulated
#' matrix, `sva::ComBat()` is run with the new chunk as batch 1 and the
#' accumulated matrix as batch 2, and the ComBat output becomes the
#' accumulated matrix for the next round.
#'
#' @param mat_deal Data frame (rows = genes, columns = samples) holding the
#'   samples to be corrected; row names are the gene keys.
#' @param mat_ref Data frame with the reference samples, same layout.
#' @return Data frame with the corrected values of the `mat_deal` samples
#'   only (columns whose names occur in `mat_deal`).
combat_in_chunks <- function(mat_deal, mat_ref) {
  chunk_size <- ncol(mat_ref)
  n_deal <- ncol(mat_deal)
  # ncol() is NULL when an earlier single-column subset collapsed to a vector.
  if (is.null(chunk_size) || is.null(n_deal) || chunk_size < 1 || n_deal < 1) {
    stop(
      'combat_in_chunks() needs at least one sample column in both the ',
      'matrix to correct and the reference matrix',
      call. = FALSE
    )
  }
  num_loop <- ceiling(n_deal / chunk_size)
  adjusted <- mat_ref
  for (i in seq_len(num_loop)) {
    # Columns of the samples fed in during this round.
    start1 <- (i - 1) * chunk_size + 1
    end1 <- min(i * chunk_size, n_deal)
    # drop = FALSE: the last chunk may hold a single sample and must stay a
    # data frame.
    chunk <- mat_deal[, start1:end1, drop = FALSE]
    n_chunk <- ncol(chunk)
    n_adjusted <- ncol(adjusted)
    # merge() discards row names, so carry the gene keys in a 'symbol' column
    # and restore them afterwards.
    chunk$symbol <- rownames(chunk)
    adjusted$symbol <- rownames(adjusted)
    merged <- merge(chunk, adjusted, by = 'symbol')
    rownames(merged) <- merged$symbol
    merged$symbol <- NULL
    # merge() places the chunk columns first and the accumulated columns
    # second: the newly fed samples are batch 1, the accumulated ones batch 2.
    batch_inside <- c(rep(1, n_chunk), rep(2, n_adjusted))
    adjusted <- as.data.frame(sva::ComBat(dat = merged, batch = batch_inside))
  }
  # Return only the samples that were to be corrected, not the reference ones.
  adjusted[, colnames(adjusted) %in% colnames(mat_deal), drop = FALSE]
}

# HIV-free samples are fed into the HIV-free reference samples.
result_free <- combat_in_chunks(mat_deal_free, mat_ref_free)

# HIV samples are handled the same way with the HIV reference samples.
result_HIV <- combat_in_chunks(mat_deal_HIV, mat_ref_HIV)

# Combine the batch-corrected HIV and HIV-free samples into the final matrix.
# merge() discards row names, so the gene keys travel in a 'symbol' column and
# are put back as row names once the merge is done.
result_HIV[['symbol']] <- rownames(result_HIV)
result_free[['symbol']] <- rownames(result_free)
result_after_batch_removal <- local({
  merged <- merge(x = result_HIV, y = result_free, by = 'symbol')
  rownames(merged) <- merged[['symbol']]
  merged[['symbol']] <- NULL
  merged
})

# PCA plot after batch correction.
# Look up the batch of every corrected sample in id0 (the id -> batch table
# built before correction); match() gives NA for an id that is not in id0.
batch0 <- id0$batch[match(colnames(result_after_batch_removal), id0$id)]
if (anyNA(batch0)) {
  warning(
    'Some corrected samples have no entry in id0 and are plotted without a batch label',
    call. = FALSE
  )
}
# batch0=c(rep(1,ncol(matrix_HIV)-1),rep(0,ncol(matrix_free)-1))
batch1 <- ifelse(batch0 == 1, paste0(type, '_', GEO_id2), paste0('free_', GEO_id1))
pca1 <- prcomp(t(result_after_batch_removal), center = TRUE, scale. = TRUE)
score1 <- as.data.frame(pca1$x) # PC scores, one row per sample
score1$batch1 <- batch1
summ1 <- summary(pca1)
# Row 2 of summ1$importance is used as the percentage shown in the axis labels.
xlab1 <- paste0("PC1(", round(summ1$importance[2, 1] * 100, 2), "%)")
ylab1 <- paste0("PC2(", round(summ1$importance[2, 2] * 100, 2), "%)")
library(ggplot2)
# The points are mapped to the colour aesthetic, so the manual palette has to
# be a colour scale; a fill scale has no effect on this plot.
p.pca1 <- ggplot(data = score1, aes(x = PC1, y = PC2, color = batch1)) +
  geom_point(size = 2.5) +
  labs(x = xlab1, y = ylab1, title = "PCA Scores Plot") +
  stat_ellipse(level = 0.99) +
  scale_color_manual(values = c("purple", "orange"))
pdf(paste0('results/', GEO_id1, '_', GEO_id2, '_', type, '_after_combine.pdf'))
# Explicit print(): a ggplot object is only drawn when printed, and
# auto-printing does not happen when the script is run through source().
print(p.pca1)
dev.off()

# Write the batch-corrected expression matrix as a tab-separated text file.
write.table(
  result_after_batch_removal,
  file = paste0(
    'results/',
    paste(GEO_id1, GEO_id2, tiss_type, type, sep = '_'),
    '_combined.txt'
  ),
  sep = '\t',
  quote = FALSE
)
