# ---- Session setup ----
# Clear the global environment so objects left over from an earlier
# interactive run cannot leak into this analysis.
rm(list = ls())

# Switch to the author's local project folder only when it actually exists.
# An unconditional setwd() to this path aborts the script on every other
# machine before the portable here::here() fallback below is reached.
local({
  author_dir <- 'E:/important-HIV-cancer/analysis'
  if (dir.exists(author_dir)) {
    setwd(author_dir)
  }
})

# Resolve the project root with `here` (installed on demand) and work from it.
if (!requireNamespace("here", quietly = TRUE)) install.packages("here")
library(here)
setwd(here::here())
library(dplyr)
library(stringr)
# ---- Build the miRNA-target reference table ----

# Collapse the values of one column within one (miRNA, Target.Gene) group into
# a single string: a lone distinct value is returned as-is (as character),
# several distinct values are joined with ",".
collapse_unique <- function(values) {
  distinct_values <- unique(values)
  if (length(distinct_values) == 1) {
    as.character(distinct_values)
  } else {
    paste(distinct_values, collapse = ",")
  }
}

# Normalise one Experiments string: split on any run of "," or "/", drop
# duplicated entries (first-seen order is kept, nothing is sorted) and re-join
# the remaining entries with "//".
normalise_experiments <- function(experiment_string) {
  entries <- str_split(experiment_string, "[,//]+")[[1]]
  paste(unique(entries), collapse = "//")
}

# Both tables are comma separated; keep only rows where the miRNA and the
# target gene are both annotated as 'hsa'. filter() drops rows whose species
# is NA instead of turning them into all-NA rows, as logical `[` would.
mir_tar_df <- read.csv('data/TF-Target-info/hsa_MTI.csv') %>%
  filter(Species..miRNA. == 'hsa', Species..Target.Gene. == 'hsa')
filter_ref <- read.csv('data/TF-Target-info/miRTarBase_SE_WR.csv') %>%
  filter(Species..miRNA. == 'hsa', Species..Target.Gene. == 'hsa')

# Every row of filter_ref is kept; from hsa_MTI only the rows whose
# Experiments field mentions 'qRT-PCR' are added.
qpcr_rows <- mir_tar_df[grepl('qRT-PCR', mir_tar_df$Experiments), ]

# One row per (miRNA, Target.Gene) pair; every other column is collapsed to a
# single character value per pair.
mir_tar_df1 <- rbind(filter_ref, qpcr_rows) %>%
  distinct() %>%
  group_by(miRNA, Target.Gene) %>%
  summarise(across(everything(), collapse_unique), .groups = "drop")

# vapply() guarantees a plain, unnamed character column even when the table is
# empty (sapply() would return a list there and otherwise attach names).
mir_tar_df1 <- mir_tar_df1 %>%
  mutate(
    Experiments = vapply(
      Experiments, normalise_experiments, character(1),
      USE.NAMES = FALSE
    )
  )

df_database <- mir_tar_df1


# ---- dNADC: tag miRNA-target pairs whose target gene is in a gene list ----
df_input <- read.delim('results/dNADC_comma-seperate.txt')

# Annotation columns, one per type. They start as NA and receive a ", "
# separated list of condition labels. The iPredict section further down
# relies on these two columns being present on df_database.
df_database$ART <- rep(NA, nrow(df_database))
df_database$nonART <- rep(NA, nrow(df_database))
df_database1 <- df_database

# One summary row per (type, cancer) combination, bound together after the loop.
summary_parts <- list()
for (type in c('ART', 'nonART')) {
  print(type)

  for (cancer in unique(df_input$cancer)) {
    print(cancer)

    # Comma-separated gene lists of the input rows for this type and cancer.
    # which() skips rows whose type or cancer is NA.
    input_rows <- which(df_input$type == type & df_input$cancer == cancer)
    gene_all <- unlist(strsplit(df_input$All_list[input_rows], ','))
    gene_lasso <- unlist(strsplit(df_input$LASSO_list[input_rows], ','))

    valid_rows <- df_database1
    in_all <- valid_rows$Target.Gene %in% gene_all
    in_lasso <- valid_rows$Target.Gene %in% gene_lasso
    hit <- in_all | in_lasso

    if (any(hit)) {
      # NOTE(review): a gene present in both lists only receives the "_L"
      # label; the "_A" label is overwritten. This mirrors the behaviour of
      # the previous row-by-row implementation - confirm it is intended.
      new_label <- ifelse(
        in_lasso[hit], paste0(cancer, '_L'), paste0(cancer, '_A')
      )

      # Append to labels already stored for this row, otherwise start fresh.
      annotation <- valid_rows[[type]]
      previous <- annotation[hit]
      has_previous <- !is.na(previous)
      new_label[has_previous] <- paste(
        previous[has_previous], new_label[has_previous],
        sep = ", "
      )
      annotation[hit] <- new_label
      valid_rows[[type]] <- annotation
    }
    df_database1 <- valid_rows

    summary_parts[[length(summary_parts) + 1]] <- data.frame(
      Number_pairs_of_all = sum(in_all),
      Number_pairs_of_lasso = sum(in_lasso),
      condition = paste0(cancer, '_', type)
    )
  }
}
summary_df <- do.call(rbind, summary_parts)

# Keep only the pairs that received at least one label in either column.
# The result files of this section are currently not written (calls disabled).
is_annotated <- !(is.na(df_database1$ART) & is.na(df_database1$nonART))
# write.table(summary_df,'results/dNADC_db-with-filter_result_node.txt',sep = '\t',quote = F,row.names = F)
mir_list <- unique(df_database1$miRNA[is_annotated])
# write.table(mir_list,'results/dNADC_db-with-filter_result_miRNA_list.txt',sep = '\t',quote = F)
df_database1 <- df_database1[is_annotated, ]
# write.table(df_database1,file = 'results/dNADC_mir_table_with-filter.txt',sep = '\t',quote = F,row.names = F)

# ---- iPredict: tag miRNA-target pairs per HIV type, group and cancer ----
df_input <- read.delim('results/iPredict_comma-seperate.txt')

# Start again from the unannotated table; df_database is expected to already
# carry the all-NA ART / nonART columns created in the dNADC section.
df_database1 <- df_database

# One summary row per (type, group, cancer), bound together after the loops.
summary_parts <- list()
for (type in c('ART', 'nonART')) {
  print(type)

  for (group in c('CD4', 'CD8A', 'CD4-CD8A')) {
    print(group)

    for (cancer in unique(df_input$cancer)) {
      print(cancer)

      # Label written into the annotation column, e.g. "<group>_<cancer>".
      condition_id <- paste(group, cancer, sep = '_')

      # Comma-separated gene list of the input rows for this combination.
      # Note that `HIV_type` holds ART/nonART while the `type` column of this
      # input holds the group. which() skips rows with NA in any of the keys.
      input_rows <- which(
        df_input$HIV_type == type &
          df_input$type == group &
          df_input$cancer == cancer
      )
      gene_all <- unlist(strsplit(df_input$Gene_list[input_rows], ','))

      valid_rows <- df_database1
      hit <- valid_rows$Target.Gene %in% gene_all

      if (any(hit)) {
        # Append to labels already stored for this row, otherwise start fresh.
        annotation <- valid_rows[[type]]
        previous <- annotation[hit]
        new_label <- rep(condition_id, length(previous))
        has_previous <- !is.na(previous)
        new_label[has_previous] <- paste(
          previous[has_previous], new_label[has_previous],
          sep = ", "
        )
        annotation[hit] <- new_label
        valid_rows[[type]] <- annotation
      }
      df_database1 <- valid_rows

      summary_parts[[length(summary_parts) + 1]] <- data.frame(
        Number_pairs_of_all = sum(hit),
        condition = paste0(condition_id, '_', type)
      )
    }
  }
}
summary_df <- do.call(rbind, summary_parts)

# Keep only the pairs that received at least one label in either column.
# The result files of this section are currently not written (calls disabled).
is_annotated <- !(is.na(df_database1$ART) & is.na(df_database1$nonART))
# write.table(summary_df,'results/ipredict_db-with-filter_result_node.txt',sep = '\t',quote = F,row.names = F)
mir_list <- unique(df_database1$miRNA[is_annotated])
# write.table(mir_list,'results/ipredict_db-with-filter_result_miRNA_list.txt',sep = '\t',quote = F)
df_database1 <- df_database1[is_annotated, ]
# write.table(df_database1,file = 'results/ipredict_mir_table_with-filter.txt',sep = '\t',quote = F,row.names = F)



# ---- rNADC: tag miRNA-target pairs per cancer and write the results ----
df_input <- read.delim('results/rNADC_comma-seperate.txt')

# This section uses a single `condition` column instead of ART / nonART.
df_database$ART <- NULL
df_database$nonART <- NULL
df_database$condition <- rep(NA, nrow(df_database))
df_database1 <- df_database

# One summary row per cancer, bound together after the loop.
summary_parts <- list()
for (cancer in unique(df_input$cancer)) {
  print(cancer)

  # The cancer name itself is the label written into `condition`.
  condition_id <- paste0(cancer)

  # Comma-separated gene list of the input rows for this cancer.
  # which() skips rows whose cancer is NA.
  input_rows <- which(df_input$cancer == cancer)
  gene_all <- unlist(strsplit(df_input$gene_need[input_rows], ','))

  valid_rows <- df_database1
  hit <- valid_rows$Target.Gene %in% gene_all

  if (any(hit)) {
    # Append to labels already stored for this row, otherwise start fresh.
    annotation <- valid_rows$condition
    previous <- annotation[hit]
    new_label <- rep(condition_id, length(previous))
    has_previous <- !is.na(previous)
    new_label[has_previous] <- paste(
      previous[has_previous], new_label[has_previous],
      sep = ", "
    )
    annotation[hit] <- new_label
    valid_rows$condition <- annotation
  }
  df_database1 <- valid_rows

  summary_parts[[length(summary_parts) + 1]] <- data.frame(
    Number_pairs_of_all = sum(hit),
    condition = condition_id
  )
}
summary_df <- do.call(rbind, summary_parts)

# Write the miRNAs with at least one labelled pair, the labelled pairs
# themselves, and the per-cancer counts.
is_annotated <- !is.na(df_database1$condition)
mir_list <- unique(df_database1$miRNA[is_annotated])
write.table(mir_list, 'results/rNADC_db-with-filter_result_miRNA_list.txt', sep = '\t', quote = FALSE)
df_database2 <- df_database1[is_annotated, ]
write.table(df_database2, file = 'results/rNADC_mir_table_with_filter.txt', sep = '\t', quote = FALSE, row.names = FALSE)
write.table(summary_df, file = 'results/rNADC_nodes_with_filter.txt', sep = '\t', quote = FALSE, row.names = FALSE)
