# Remove every (non-hidden) object from the current environment before running.
rm(list = ls())

# Dataset configuration used throughout the script.
GEO_id <- "GSE2171"
platform_id <- "GPL201"
tiss_type <- "PBMC"
# Set to TRUE when the dataset is a batch-corrected one.
special <- FALSE

# Install `here` on demand, then make the directory reported by here::here()
# the working directory so the relative ./data and ./results paths resolve.
if (!requireNamespace("here", quietly = TRUE)) {
  install.packages("here")
}
library(here)
setwd(here::here())


library(limma)

#------------------------------------
# Run a limma differential-expression analysis of every non-HIVfree sample
# group against the HIVfree group and write the results to ./results/.
#
# Arguments:
#   input_df  Expression table with genes in rows and samples in columns.
#             Column names are expected to look like "GSM<digits>_<group>";
#             the group label is whatever remains after removing the
#             "GSM<digits>_" prefix. Columns whose name contains "NA" are
#             dropped.
#   FC        Fold-change cutoff; a gene is "up"/"down" when its logFC is
#             at least log2(FC) away from zero. Also used in the file name.
#   p_cutoff  Genes with a raw P.Value above this are labelled "stable".
#             Note: the raw P.Value is used, not the BH-adjusted value.
#   geo_id, platform, marker
#             Labels used to build the output file names. They default to
#             the script-level variables GEO_id, platform_id and gene_type,
#             which are only looked up when a file is actually written.
#
# Side effects: for each group, writes the full topTable and the DEG-only
#   table as tab-separated files under ./results/, and prints the table of
#   gene categories.
# Returns: (invisibly) a list named by group; every element is NULL.
# Errors: stops when input_df has no dimensions or contains no HIVfree sample.
DEG_module <- function(input_df, FC = 1.5, p_cutoff = 0.05,
                       geo_id = GEO_id, platform = platform_id,
                       marker = gene_type) {
  if (is.null(dim(input_df))) {
    stop("input_df must be a matrix or data frame with samples in columns")
  }

  # Label each gene and keep only the up-/down-regulated ones.
  #   P.Value > p_cutoff          -> "stable"
  #   logFC  >=  log2(FC)         -> "up"
  #   logFC  <= -log2(FC)         -> "down"
  #   otherwise                   -> "no_significant"
  get_diffgene <- function(df, FC) {
    df_output <- df
    df_output$g <- ifelse(
      df_output$P.Value > p_cutoff, "stable",
      ifelse(
        df_output$logFC >= log2(FC), "up",
        ifelse(df_output$logFC <= -log2(FC), "down", "no_significant")
      )
    )
    print(table(df_output$g))
    df_output[which(df_output$g %in% c("up", "down")), ]
  }

  # Derive the group of every sample from its column name.
  expr_mat <- input_df[, !grepl("NA", colnames(input_df)), drop = FALSE]
  sample_type <- gsub("GSM[0-9]*_", "", colnames(expr_mat))
  if (!"HIVfree" %in% sample_type) {
    stop('No HIVfree sample in this dataset, try another one!')
  }
  group <- setdiff(unique(sample_type), "HIVfree")

  # Compare one group against HIVfree and write both result tables.
  sepreate_differential_analysis <- function(input_group, input_matrix) {
    # Exact group matching: substring matching would also pull in samples of
    # any group whose label merely contains input_group.
    all_groups <- gsub("GSM[0-9]*_", "", colnames(input_matrix))
    in_contrast <- all_groups %in% c("HIVfree", input_group)
    input_matrix <- input_matrix[, in_contrast, drop = FALSE]
    sample_group <- all_groups[in_contrast]

    # Cell-means design: one column per group, named by the group label.
    design <- model.matrix(~ 0 + sample_group)
    colnames(design) <- levels(factor(sample_group))

    # limma workflow: fit, contrast (group - HIVfree), moderated statistics.
    fit <- lmFit(input_matrix, design)
    contrast1 <- paste0(input_group, "-HIVfree")
    contrast.matrix <- makeContrasts(
      contrasts = contrast1,
      levels = colnames(coef(fit))
    )
    fit2 <- eBayes(contrasts.fit(fit, contrast.matrix))

    results <- topTable(
      fit2,
      adjust.method = "BH", sort.by = "B", coef = 1, number = Inf
    )
    # gene_symbol is only carried into the DEG-only table; it is removed
    # from the full table to keep that output format uniform.
    results$gene_symbol <- rownames(results)
    results_DEG_only <- get_diffgene(results, FC)
    results$gene_symbol <- NULL

    file_stem <- paste0(
      "./results/", geo_id, "_", platform, "_", input_group, "_", marker,
      "_vs_HIVfree_limma_results"
    )
    write.table(
      results,
      file = paste0(file_stem, ".txt"),
      sep = "\t", quote = FALSE, col.names = NA
    )
    write.table(
      results_DEG_only,
      file = paste0(file_stem, "_DEG_only_FC", FC, ".txt"),
      sep = "\t", quote = FALSE, col.names = NA
    )
  }

  per_group <- lapply(
    group,
    function(x) sepreate_differential_analysis(x, expr_mat)
  )
  names(per_group) <- group
  invisible(per_group)
}
#------------------------------------






# Step 1: load the expression matrix and derive the sample groups.
#
# Reads one tab-separated expression table (chosen from GEO_id, tiss_type and
# `special`), removes annotation columns and columns whose name contains "NA",
# normalises the "HIV_free" label to "HIVfree", and leaves these objects for
# the rest of the script:
#   expr_mat      expression table, samples in columns
#   sample_type1  group label of every column ("GSM<digits>_" prefix removed)
#   group1        the distinct group labels other than "HIVfree"
expr_root <- file.path("./data/expression_mat", tiss_type)
if (GEO_id == "GSE4124") {
  # This dataset is stored without log2 transformation.
  expr_file <- file.path(
    expr_root, GEO_id,
    paste0(GEO_id, "_expression_matrix_all_group_withoutlog2.txt")
  )
} else if (isTRUE(special)) {
  # The combined-dataset folder is named "<GEO_id>-<type1>-combined";
  # `type1` is not defined in this script, so it must be supplied beforehand.
  if (!exists("type1")) {
    stop(
      "`special` is TRUE but `type1` is not defined; it is needed to locate ",
      "the '<GEO_id>-<type1>-combined' folder",
      call. = FALSE
    )
  }
  path0 <- paste0(
    "./data/expression_mat/", tiss_type, "/", GEO_id, "-", type1, "-combined"
  )
  expr_file <- list.files(path0, full.names = TRUE, pattern = "\\.txt$")
  if (length(expr_file) != 1) {
    stop(
      "Expected exactly one .txt file in ", path0, " but found ",
      length(expr_file),
      call. = FALSE
    )
  }
} else {
  expr_file <- file.path(
    expr_root, GEO_id,
    paste0(GEO_id, "_expression_matrix_all_group.txt")
  )
}
expr_mat <- read.table(file = expr_file, sep = "\t", header = TRUE)

# Drop annotation columns and columns whose name contains "NA";
# drop = FALSE keeps a data frame even if a single column remains.
annotation_cols <- c("gene_symbol", "ENSEMBL_id")
expr_mat <- expr_mat[, !colnames(expr_mat) %in% annotation_cols, drop = FALSE]
expr_mat <- expr_mat[, !grepl(pattern = "NA", colnames(expr_mat)), drop = FALSE]

# Use a single spelling for the control group label.
colnames(expr_mat) <- gsub("HIV_free", "HIVfree", colnames(expr_mat))
sample_type1 <- gsub("GSM[0-9]*_", "", colnames(expr_mat))
print(table(sample_type1))

# Without a control group nothing can be compared. The previous
# `group1[-which(group1 == "HIVfree")]` returned an empty vector in that case,
# so the script finished silently without producing any output.
if (!"HIVfree" %in% sample_type1) {
  stop("No HIVfree sample in this dataset, try another one!", call. = FALSE)
}
group1 <- setdiff(unique(sample_type1), "HIVfree")
# Sample selection and differential expression.
#
# A dataset may contain more than one HIV-related group, so each group is
# processed separately together with the HIVfree samples. For every marker
# definition in type_list the samples are filtered by marker expression
# relative to the median over the HIVfree + current-group samples, the
# selection is written to ./results/, and DEG_module() is run on the
# retained samples.
dir.create("./results", showWarnings = FALSE, recursive = TRUE)

# Direction a marker must take in HIVfree samples relative to the median:
#   +1 -> HIVfree samples must be strictly above, HIV samples strictly below
#   -1 -> HIVfree samples must be strictly below, HIV samples strictly above
# Samples exactly at the median are never retained.
marker_direction <- c(CD4 = 1, CD8A = -1, CD8B = -1)

# Marker definitions to evaluate; "A-B" means both markers must agree.
type_list <- c("CD4", "CD8A", "CD4-CD8A")

all_sample_types <- gsub("GSM[0-9]*_", "", colnames(expr_mat))

for (group_tmp in group1) {
  # Exact label matching: substring matching would also select samples of
  # any group whose label merely contains group_tmp.
  in_pair <- all_sample_types %in% c("HIVfree", group_tmp)
  expr_mat_tmp <- expr_mat[, in_pair, drop = FALSE]
  sample_type <- all_sample_types[in_pair]
  is_free <- sample_type == "HIVfree"

  # gene_type must stay a script-level variable: DEG_module() reads it when
  # building its output file names.
  for (gene_type in type_list) {
    markers <- strsplit(gene_type, "-", fixed = TRUE)[[1]]

    unknown <- setdiff(markers, names(marker_direction))
    if (length(unknown) > 0) {
      stop(
        "No selection rule defined for marker(s): ",
        paste(unknown, collapse = ", "),
        call. = FALSE
      )
    }
    unavailable <- setdiff(markers, rownames(expr_mat_tmp))
    if (length(unavailable) > 0) {
      message(
        "Skipping ", gene_type, " for ", group_tmp, ": marker(s) not in the ",
        "expression matrix: ", paste(unavailable, collapse = ", ")
      )
      next
    }

    # Step 2: classify samples. A sample is retained only when every marker
    # of the definition is on the expected side of that marker's median.
    passes <- rep(TRUE, length(sample_type))
    for (marker_gene in markers) {
      row_idx <- match(marker_gene, rownames(expr_mat_tmp))
      values <- as.numeric(
        unlist(expr_mat_tmp[row_idx, , drop = FALSE], use.names = FALSE)
      )
      cutoff <- median(values)
      above <- values > cutoff
      below <- values < cutoff
      if (marker_direction[[marker_gene]] > 0) {
        marker_ok <- ifelse(is_free, above, below)
      } else {
        marker_ok <- ifelse(is_free, below, above)
      }
      passes <- passes & marker_ok
    }
    # A missing expression value cannot satisfy the rule.
    passes[is.na(passes)] <- FALSE
    sample_keep <- as.numeric(passes)

    # Record the per-sample decision and its per-group summary.
    type_output <- data.frame(
      sample_type = sample_type,
      sample_keep = sample_keep,
      GEO_id = colnames(expr_mat_tmp)
    )
    keep_summary <- as.data.frame(table(type_output[, 1:2]))
    out_stem <- paste0(
      "./results/", tiss_type, "_", GEO_id, "_", gene_type, "_", group_tmp
    )
    write.table(
      type_output,
      file = paste0(out_stem, "_sample_type.txt"),
      sep = "\t", quote = FALSE
    )
    write.table(
      keep_summary,
      file = paste0(out_stem, "_sample_type_summary.txt"),
      sep = "\t", quote = FALSE
    )

    # Both sides of the contrast need at least one retained sample.
    kept_types <- sample_type[passes]
    if (!("HIVfree" %in% kept_types) || !(group_tmp %in% kept_types)) {
      message(
        "Skipping differential analysis for ", group_tmp, " / ", gene_type,
        ": no retained samples in one of the two groups"
      )
      next
    }

    expr_mat_inside <- expr_mat_tmp
    if (GEO_id == "GSE4124") {
      # This dataset is stored without log2 transformation.
      expr_mat_inside <- log2(expr_mat_inside)
    }
    expr_mat_inside <- expr_mat_inside[, passes, drop = FALSE]

    # Step 3: differential expression on the retained samples.
    # Print the value range as a quick check of the expression scale.
    print(paste0(max(expr_mat_inside), "_", min(expr_mat_inside)))
    tryCatch(
      DEG_module(input_df = expr_mat_inside),
      error = function(e) {
        # Report the failure and continue with the next iteration.
        print(paste0(
          "Error in iteration ", group_tmp, " / ", gene_type, ": ",
          conditionMessage(e)
        ))
      }
    )
  }
}










