library(readr)
library(dplyr)
library(DT)
Read and map LabeledIn
# Load the LabeledIn indications and map them to DO diseases and DrugBank
# compounds. Join keys are spelled out so the joins cannot silently change
# if any of the tables later gains another shared column name.
labin_df <-
  file.path('labeledin', 'data', 'indications.tsv') %>%
  # rxnorm_id is read as character and converted explicitly below
  readr::read_tsv(col_types = list(rxnorm_id = readr::col_character())) %>%
  # combo drugs do not convert to integer: they become NA (with a coercion
  # warning) and are removed by the filter that follows
  dplyr::mutate(rxnorm_id = as.integer(rxnorm_id)) %>%
  dplyr::filter(!is.na(rxnorm_id)) %>%
  # map umls diseases to DO
  dplyr::inner_join(umls_df, by = 'disease_cui') %>%
  # map rxnorm compounds to drugbank
  dplyr::inner_join(rxnorm_df, by = 'rxnorm_id')
## Warning in mutate_impl(.data, dots): NAs introduced by coercion
## Joining by: "disease_cui"
## Joining by: "rxnorm_id"
 
Read and map MEDI
# Load the MEDI indications and keep only the rows whose rxnorm_id is present
# in rxnorm_df. The join key is explicit rather than inferred from shared
# column names.
medi_df <-
  file.path('medi', 'data', 'medi-umls.tsv') %>%
  readr::read_tsv() %>%
  dplyr::inner_join(rxnorm_df, by = 'rxnorm_id')
## Joining by: "rxnorm_id"
# Map MEDI diseases through two independent routes and stack the results:
# first via the UMLS table (keyed on disease_cui), then via the ICD9 table
# (keyed on disease_icd9). A MEDI row matched by both routes appears twice.
medi_df <- dplyr::bind_rows(
  umls_df %>%
    dplyr::inner_join(medi_df, by = 'disease_cui'),
  icd9_df %>%
    dplyr::inner_join(medi_df, by = 'disease_icd9')
)
## Joining by: "disease_cui"
## Joining by: "disease_icd9"
 
Read and map PREDICT
# Load the PREDICT indications and rename the disease identifier columns to
# the names used as join keys below.
predict_df <-
  file.path('msb-predict', 'data', 'indications-umls.tsv') %>%
  readr::read_tsv() %>%
  dplyr::rename(disease_cui = umls_cui, disease_omim = omim_id)

# Map PREDICT diseases through two independent routes and stack the results:
# via the UMLS table (keyed on disease_cui) and via the OMIM table (keyed on
# disease_omim). Keys are explicit rather than inferred from shared columns.
predict_df <- dplyr::bind_rows(
  umls_df %>%
    dplyr::inner_join(predict_df, by = 'disease_cui'),
  omim_df %>%
    dplyr::inner_join(predict_df, by = 'disease_omim')
)
## Joining by: "disease_cui"
## Joining by: "disease_omim"
 
Read and map ehrlink
# Load the ehrlink indications, rename the ingredient identifier to rxnorm_id
# so it lines up with rxnorm_df, and keep only the rows that match on that
# explicitly named key.
ehrlink_df <-
  file.path('ehrlink', 'data', 'indications.tsv') %>%
  readr::read_tsv() %>%
  dplyr::rename(rxnorm_id = ingredient_rxcui) %>%
  dplyr::inner_join(rxnorm_df, by = 'rxnorm_id')
## Joining by: "rxnorm_id"
# ehrlink.df <- ehrlink.df %>%
#   dplyr::rename(subsumed_id = doid_code, subsumed_name = doid_name) %>%
#   dplyr::inner_join(
#     doslim.df %>% dplyr::transmute(subsumed_id, doid_code = slim_id, doid_name = slim_name)
#   ) %>%
#   dplyr::inner_join(rxnorm.df %>% dplyr::rename(ingredient_rxcui = rxnorm_id))
 
Join resources
# Reduce a mapped resource to its unique (doid_code, drugbank_id) pairs and
# tag every pair with the name of the resource it came from.
#   df            data frame containing doid_code and drugbank_id columns
#   resource_name string stored in the new `resource` column
distinct_indication_pairs <- function(df, resource_name) {
  df %>%
    dplyr::select(doid_code, drugbank_id) %>%
    dplyr::distinct() %>%
    dplyr::mutate(resource = resource_name)
}

# Stack the disease-compound pairs of all four resources into one long table
# with one row per (pair, resource), sorted for stable output.
indication_df <- dplyr::bind_rows(
  # LabeledIn
  distinct_indication_pairs(labin_df, 'labeledin'),
  # MEDI: one row per pair, labelled 'medi_hps' when the largest hps value
  # among the pair's rows is non-zero and 'medi_lps' otherwise
  medi_df %>%
    dplyr::group_by(doid_code, drugbank_id) %>%
    dplyr::summarize(
      resource = ifelse(max(hps) != 0, 'medi_hps', 'medi_lps')
    ) %>%
    dplyr::ungroup(),
  # PREDICT
  distinct_indication_pairs(predict_df, 'predict'),
  # ehrlink
  distinct_indication_pairs(ehrlink_df, 'ehrlink')
) %>%
  dplyr::arrange(doid_code, drugbank_id, resource)
# Add compound and disease names. The compound join is an inner join, so
# pairs whose drugbank_id is missing from drugbank_df are dropped; the disease
# join is a left join, so pairs without a match in do_df are kept.
# Join keys are explicit rather than inferred from shared column names.
indication_df <- indication_df %>%
  dplyr::inner_join(drugbank_df, by = 'drugbank_id') %>%
  dplyr::left_join(do_df, by = 'doid_code')
## Joining by: "drugbank_id"
## Joining by: "doid_code"
# save
# Write the full indication table to disk, then render it as an interactive
# table. `FALSE` is spelled out because `F` is an ordinary, reassignable name.
indication_df %>% readr::write_tsv(file.path('data', 'indications-verbose.tsv'))
indication_df %>% DT::datatable(rownames = FALSE)
# Restrict the indications to the slim compound and disease sets, then attach
# the slim disease term for each (subsumed) disease.
indication_slim_df <- indication_df %>%
  dplyr::filter(drugbank_id %in% drugbank_slim) %>%
  dplyr::filter(doid_code %in% do_slim) %>%
  # the original disease becomes the "subsumed" term of the slim mapping
  dplyr::rename(do_subsumed_id = doid_code, do_subsumed_name = doid_name) %>%
  dplyr::inner_join(
    doslim_map_df %>%
      dplyr::transmute(
        do_slim_id = slim_id,
        do_slim_name = slim_name,
        do_subsumed_id = subsumed_id
      ),
    # explicit key instead of relying on the shared column name
    by = 'do_subsumed_id'
  ) %>%
  dplyr::select(
    drugbank_id, drugbank_name,
    do_slim_id, do_slim_name,
    do_subsumed_id, do_subsumed_name,
    resource
  ) %>%
  dplyr::arrange(drugbank_id, do_slim_id, do_subsumed_id, resource)
## Joining by: "do_subsumed_id"
# Persist the slim indications, one row per compound / disease / resource
readr::write_tsv(
  indication_slim_df,
  file.path('data', 'indications-slim-verbose.tsv')
)
 
Convert to a single row per compound-disease pair, including only high-confidence indications.
# Resources that count as high confidence; 'medi_lps' is not listed, so
# rows carrying that label are excluded below.
hc_sources <- c('medi_hps', 'ehrlink', 'labeledin', 'predict')

# Keep the high-confidence rows and expose the slim disease as the disease.
collapsed_df <- dplyr::filter(indication_slim_df, resource %in% hc_sources)
collapsed_df <- dplyr::rename(
  collapsed_df,
  doid_id = do_slim_id,
  doid_name = do_slim_name
)

# One row per compound-disease pair, with the number of distinct supporting
# resources and the total number of supporting rows.
collapsed_df <- collapsed_df %>%
  dplyr::group_by(drugbank_id, drugbank_name, doid_id, doid_name) %>%
  dplyr::summarize(
    distinct_resources = n_distinct(resource),
    total_resources = n()
  ) %>%
  dplyr::ungroup()
# save
# Write the collapsed indications to disk, then render them as an interactive
# table. `FALSE` is spelled out because `F` is an ordinary, reassignable name.
collapsed_df %>% readr::write_tsv(file.path('data', 'indications-slim-collapsed.tsv'))
collapsed_df %>% DT::datatable(rownames = FALSE)
 
Indications per disease
# Count the collapsed indications for each disease and show the counts as an
# interactive table (`FALSE` spelled out; `F` can be reassigned).
collapsed_df %>%
  dplyr::group_by(doid_id, doid_name) %>%
  dplyr::summarize(
    n_indications = n()
  ) %>%
  DT::datatable(rownames = FALSE)
 
Indications per compound
# Count the collapsed indications for each compound and show the counts as an
# interactive table (`FALSE` spelled out; `F` can be reassigned).
collapsed_df %>%
  dplyr::group_by(drugbank_id, drugbank_name) %>%
  dplyr::summarize(
    n_indications = n()
  ) %>%
  DT::datatable(rownames = FALSE)
 
Create a dataset for the curators
# Build the curation sheet: one row per collapsed indication, with empty
# classification / notes columns and a lookup URL for the drug and the disease.
curation_df <- dplyr::transmute(
  collapsed_df,
  drug = drugbank_name,
  disease = doid_name,
  classification = '',
  notes = '',
  drug_url = paste0('http://www.drugbank.ca/drugs/', drugbank_id),
  # the first ':' of the disease id is replaced by its percent-encoding
  disease_url = paste0('http://www.disease-ontology.org/term/', sub(':', '%3A', doid_id))
)
curation_df <- dplyr::arrange(curation_df, drug, disease)

# Write the complete sheet
readr::write_tsv(curation_df, file.path('data', 'curation.tsv'))
# Write a random subset of the curation sheet. The seed is fixed so the same
# rows are drawn on every run.
set.seed(0)
curation_df %>%
  # at most 50 rows: sample_n() raises an error when asked for more rows than
  # the table holds, so the size is capped at the number of available rows
  dplyr::sample_n(min(50, nrow(curation_df))) %>%
  dplyr::arrange(drug, disease) %>%
  readr::write_tsv(file.path('data', 'curation-subset.tsv'))