library(dplyr)
library(DT)
library(ggplot2)

# Keep character columns as plain strings when data frames are built or read.
options(stringsAsFactors = FALSE)

# Symptoms from the HSDN (Human symptoms–disease network)

# Load the Disease Ontology cross-reference table and keep only the rows whose
# resource is MeSH ('MSH'). The cross-referenced identifier is exposed as
# `disease_id`, and the now-constant `resource` column is dropped.
mesh.df <- read.delim(
  'http://git.dhimmel.com/disease-ontology/data/xrefs-prop-slim.tsv'
) %>%
  dplyr::filter(resource == 'MSH') %>%
  dplyr::rename(disease_id = resource_id) %>%
  dplyr::select(-resource)

# Load the HSDN symptom-disease table. `check.names = FALSE` keeps the original
# headers (which contain spaces) intact, and the first column is consumed as
# row names. The verbose headers are then replaced with short snake_case names.
hsdn.df <- read.delim(
  'https://raw.githubusercontent.com/LABrueggs/HSDN/master/Combined-Output.tsv',
  row.names = 1,
  check.names = FALSE
) %>%
  dplyr::rename(
    # disease side of each association
    disease_id = `MeSH Disease ID`,
    disease_name = `MeSH Disease Term`,
    # symptom side of each association
    symptom_id = `MeSH Symptom ID`,
    symptom_name = `MeSH Symptom Term`,
    # association measures
    cooccurs = `PubMed occurrence`,
    tfidf_score = `TFIDF score`
  )

# Attach the Disease Ontology mapping to the HSDN associations by matching on
# the MeSH disease identifier. Rows without a match on either side are dropped
# (inner join). The key is named explicitly so the join cannot silently start
# using any other column that happens to share a name in both tables.
hsdn.df <- hsdn.df %>%
  dplyr::inner_join(mesh.df, by = 'disease_id')

# Save the mapped associations as a tab-separated file. The output directory is
# created first so the write does not fail when 'data' does not exist yet.
path <- file.path('data', 'symptoms-DO.tsv')
dir.create(dirname(path), showWarnings = FALSE, recursive = TRUE)
write.table(hsdn.df, path, sep = '\t', row.names = FALSE, quote = FALSE)

# Interactive table of the associations with a TF-IDF score of at least 25.
hsdn.df %>%
  dplyr::filter(tfidf_score >= 25) %>%
  DT::datatable()

# Distribution of TF-IDF scores on a log10 x-axis.
# No binwidth is supplied, so stat_bin falls back to its default binning and
# reports that in a message when the plot is drawn.
ggplot(hsdn.df, aes(x = tfidf_score)) +
  geom_histogram(alpha = 0.6) +
  scale_x_log10() +
  theme_bw()

# Distribution of TF-IDF scores on a linear x-axis restricted to 0-50, using
# bins of width 1.
ggplot(hsdn.df, aes(x = tfidf_score)) +
  geom_histogram(binwidth = 1, alpha = 0.6) +
  xlim(0, 50) +
  theme_bw()