library(readr)
library(dplyr)
library(DT)

Read input for compound mapping

# All DrugBank-derived inputs come from one pinned commit of the
# dhimmel/drugbank repository; the shared URL prefix is defined once so the
# three downloads below cannot drift apart.
drugbank_repo_url <- paste0(
  'https://raw.githubusercontent.com/dhimmel/drugbank/',
  '3e87872db5fca5ac427ce27464ab945c0ceb4ec6/data'
)

# Read DrugBank, keeping only the compound identifier and its name
drugbank_df <- paste0(drugbank_repo_url, '/drugbank.tsv') %>%
  readr::read_tsv() %>%
  dplyr::transmute(drugbank_id, drugbank_name = name)

# Retrieve the identifiers of the DrugBank Slim compounds
drugbank_slim <- readr::read_tsv(
  paste0(drugbank_repo_url, '/drugbank-slim.tsv')
)$drugbank_id

# Construct a rxnorm to drugbank mapping (through FDA-SRS UNII).
# The local rxcui-unii map and the remote FDA-SRS mapping are joined on their
# shared `fdasrs_id` column; the key is spelled out so that a column added to
# either table later cannot silently change the join.
rxnorm_df <- dplyr::inner_join(
  readr::read_tsv(file.path('data', 'rxcui-unii-map.tsv')),
  readr::read_tsv(paste0(drugbank_repo_url, '/mapping/fdasrs.tsv')),
  by = 'fdasrs_id'
)

Read input for disease mapping

# All Disease Ontology inputs come from one pinned commit of the
# dhimmel/disease-ontology repository; the shared URL prefix is defined once.
do_repo_url <- paste0(
  'https://raw.githubusercontent.com/dhimmel/disease-ontology/',
  '72614ade9f1cc5a5317b8f6836e1e464b31d5587/data'
)

# Read the file of Disease Ontology terms, keeping only rows of type 'name'
do_df <- paste0(do_repo_url, '/term-names.tsv') %>%
  readr::read_tsv() %>%
  dplyr::filter(type == 'name') %>%
  dplyr::transmute(doid_code = doid, doid_name = name)

# Read the Disease Ontology slim term propagations. The file is downloaded
# once here and reused for `do_slim` below instead of being fetched twice.
doslim_map_df <- paste0(do_repo_url, '/slim-terms-prop.tsv') %>%
  readr::read_tsv()

# Retrieve the DO Slim diseases: the distinct `subsumed_id` values
do_slim <- unique(doslim_map_df$subsumed_id)

# Read the Disease Ontology cross-reference mappings
domap_df <- paste0(do_repo_url, '/xrefs.tsv') %>%
  readr::read_tsv()

# Extract the DO to UMLS mapping
umls_df <- domap_df %>%
  dplyr::filter(resource == 'UMLS') %>%
  dplyr::select(-resource) %>%
  dplyr::rename(disease_cui = resource_id)

# Extract the DO to ICD9 mapping
icd9_df <- domap_df %>%
  dplyr::filter(resource == 'ICD9') %>%
  dplyr::select(-resource) %>%
  dplyr::rename(disease_icd9 = resource_id)

# Extract the DO to OMIM mapping. The identifier is converted to integer;
# as.integer() yields NA for any value that is not a plain integer string.
omim_df <- domap_df %>%
  dplyr::filter(resource == 'OMIM') %>%
  dplyr::select(-resource) %>%
  dplyr::rename(disease_omim = resource_id) %>%
  dplyr::mutate(disease_omim = as.integer(disease_omim))

Read and map LabeledIn

# LabeledIn indications mapped to Disease Ontology terms (via UMLS) and to
# DrugBank compounds (via rxnorm). Join keys are given explicitly so the
# result does not depend on which columns the tables happen to share.
labin_df <-
  # read labeledin data, keeping rxnorm_id as text for the conversion below
  file.path('labeledin', 'data', 'indications.tsv') %>%
  readr::read_tsv(col_types = list(rxnorm_id = readr::col_character())) %>%
  # removes combo drugs, whose rxnorm_id does not convert to integer:
  # as.integer() turns them into NA (emitting an "NAs introduced by coercion"
  # warning) and the NA rows are then dropped
  dplyr::mutate(rxnorm_id = as.integer(rxnorm_id)) %>%
  dplyr::filter(!is.na(rxnorm_id)) %>%
  # map umls diseases to DO
  dplyr::inner_join(umls_df, by = 'disease_cui') %>%
  # map rxnorm compounds to drugbank
  dplyr::inner_join(rxnorm_df, by = 'rxnorm_id')

Read and map MEDI

# Read MEDI and map its rxnorm compounds to drugbank
medi_df <-
  file.path('medi', 'data', 'medi-umls.tsv') %>%
  readr::read_tsv() %>%
  dplyr::inner_join(rxnorm_df, by = 'rxnorm_id')

# Map MEDI diseases to DO through two independent routes (UMLS and ICD9) and
# stack the results. A row that maps through both routes appears twice here.
# Join keys are explicit so that only the intended identifier is matched.
medi_df <- dplyr::bind_rows(
  dplyr::inner_join(umls_df, medi_df, by = 'disease_cui'),
  dplyr::inner_join(icd9_df, medi_df, by = 'disease_icd9')
)

Read and map PREDICT

# Read PREDICT and rename its disease identifier columns to match the
# column names used by the DO mapping tables
predict_df <-
  file.path('msb-predict', 'data', 'indications-umls.tsv') %>%
  readr::read_tsv() %>%
  dplyr::rename(disease_cui = umls_cui, disease_omim = omim_id)

# Map PREDICT diseases to DO through two independent routes (UMLS and OMIM)
# and stack the results. Join keys are explicit so that only the intended
# identifier is matched in each route.
predict_df <- dplyr::bind_rows(
  dplyr::inner_join(umls_df, predict_df, by = 'disease_cui'),
  dplyr::inner_join(omim_df, predict_df, by = 'disease_omim')
)

Join resources

# Stack the indications from every resource into one table with one row per
# (disease, compound, resource) combination.
indication_df <- dplyr::bind_rows(
  # LabeledIn
  labin_df %>%
    dplyr::select(doid_code, drugbank_id) %>%
    dplyr::distinct() %>%
    dplyr::mutate(resource = 'labeledin'),
  # MEDI: one row per pair. max(hps) is the ifelse() test, so a pair is
  # labeled 'medi_hps' when the largest hps value in its group is non-zero
  # and 'medi_lps' otherwise.
  medi_df %>%
    dplyr::group_by(doid_code, drugbank_id) %>%
    dplyr::summarize(
      resource = ifelse(max(hps), 'medi_hps', 'medi_lps')
    ) %>%
    dplyr::ungroup(),
  # PREDICT
  predict_df %>%
    dplyr::select(doid_code, drugbank_id) %>%
    dplyr::distinct() %>%
    dplyr::mutate(resource = 'predict'),
  # ehrlink
  # NOTE(review): ehrlink_df is not created anywhere in the visible part of
  # this file; confirm it is read and mapped before this point, otherwise
  # this call fails with "object 'ehrlink_df' not found".
  ehrlink_df %>%
    dplyr::select(doid_code, drugbank_id) %>%
    dplyr::distinct() %>%
    dplyr::mutate(resource = 'ehrlink')
) %>%
  dplyr::arrange(doid_code, drugbank_id, resource)

# Add compound and disease names. The inner join drops indications whose
# compound is absent from drugbank_df; the left join keeps indications whose
# disease has no name in do_df (doid_name is then NA).
indication_df <- indication_df %>%
  dplyr::inner_join(drugbank_df, by = 'drugbank_id') %>%
  dplyr::left_join(do_df, by = 'doid_code')

# save
indication_df %>% readr::write_tsv(file.path('data', 'indications-verbose.tsv'))

indication_df %>% DT::datatable(rownames = FALSE)

# Restrict the indications to DrugBank Slim compounds and DO Slim diseases,
# then attach the slim term that subsumes each disease.
indication_slim_df <- indication_df %>%
  dplyr::filter(drugbank_id %in% drugbank_slim) %>%
  dplyr::filter(doid_code %in% do_slim) %>%
  # the original disease becomes the "subsumed" term of the slim mapping
  dplyr::rename(do_subsumed_id = doid_code, do_subsumed_name = doid_name) %>%
  dplyr::inner_join(
    doslim_map_df %>%
      dplyr::transmute(
        do_slim_id = slim_id,
        do_slim_name = slim_name,
        do_subsumed_id = subsumed_id
      ),
    # explicit key: only the subsumed term identifier is matched
    by = 'do_subsumed_id'
  ) %>%
  dplyr::select(
    drugbank_id, drugbank_name,
    do_slim_id, do_slim_name,
    do_subsumed_id, do_subsumed_name,
    resource
  ) %>%
  dplyr::arrange(drugbank_id, do_slim_id, do_subsumed_id, resource)

# save sourced indications
indication_slim_df %>% readr::write_tsv(file.path('data', 'indications-slim-verbose.tsv'))

Convert to a single row per compound-disease pair, including only high-confidence indications.

# Resources treated as high confidence; any other resource value
# (for example 'medi_lps') is excluded from the collapsed table.
hc_sources <- c('medi_hps', 'ehrlink', 'labeledin', 'predict')

# Collapse to one row per (compound, slim disease) pair.
#   distinct_resources: number of different resources supporting the pair
#   total_resources:    number of rows in the group; it can exceed
#                       distinct_resources when one resource supports the
#                       pair through more than one subsumed term
# n() and n_distinct() are left unqualified on purpose: older dplyr releases
# reject `dplyr::n()` inside summarize().
collapsed_df <- indication_slim_df %>%
  dplyr::filter(resource %in% hc_sources) %>%
  dplyr::rename(doid_id = do_slim_id, doid_name = do_slim_name) %>%
  dplyr::group_by(drugbank_id, drugbank_name, doid_id, doid_name) %>%
  dplyr::summarize(
    distinct_resources = n_distinct(resource),
    total_resources = n()
  ) %>%
  dplyr::ungroup()

# save
collapsed_df %>% readr::write_tsv(file.path('data', 'indications-slim-collapsed.tsv'))

collapsed_df %>% DT::datatable(rownames = FALSE)

Indications per disease

# Count the collapsed indications for each disease and show them as an
# interactive table. The grouping is removed before display so the widget
# receives a plain data frame.
collapsed_df %>%
  dplyr::group_by(doid_id, doid_name) %>%
  dplyr::summarize(
    n_indications = n()
  ) %>%
  dplyr::ungroup() %>%
  DT::datatable(rownames = FALSE)

Indications per compound

# Count the collapsed indications for each compound and show them as an
# interactive table. The grouping is removed before display so the widget
# receives a plain data frame.
collapsed_df %>%
  dplyr::group_by(drugbank_id, drugbank_name) %>%
  dplyr::summarize(
    n_indications = n()
  ) %>%
  dplyr::ungroup() %>%
  DT::datatable(rownames = FALSE)

Create a dataset for the curators

# Build the sheet handed to curators: one row per collapsed indication with
# empty `classification` and `notes` columns for them to fill in, plus links
# to the compound and disease pages.
curation_df <- collapsed_df %>%
  dplyr::transmute(
    drug = drugbank_name,
    disease = doid_name,
    classification = '',
    notes = '',
    drug_url = paste0('http://www.drugbank.ca/drugs/', drugbank_id),
    # percent-encode the first colon of the DO identifier for use in the URL
    disease_url = paste0('http://www.disease-ontology.org/term/', sub(':', '%3A', doid_id))
  ) %>%
  dplyr::arrange(drug, disease)

curation_df %>% readr::write_tsv(file.path('data', 'curation.tsv'))

# Write a random subset of the sheet. The seed is fixed so the same rows are
# drawn on every run. The sample size is capped at the number of available
# rows because sample_n() errors when asked for more rows than exist; with 50
# or more rows the draw is unchanged.
set.seed(0)
curation_df %>%
  dplyr::sample_n(min(50, nrow(.))) %>%
  dplyr::arrange(drug, disease) %>%
  readr::write_tsv(file.path('data', 'curation-subset.tsv'))