rebuild the website to fix some formatting issues

grunwaldlab · Nov 2, 2023 · 2b63494 · 2b63494
1 parent 863dfdd
commit 2b63494
Show file tree

Hide file tree

Showing 20 changed files with 2,269 additions and 2,584 deletions.
diff --git a/misc_analyses/convert_new_seqs_to_old_format/convert_new_seqs_to_old_format.Rmd b/misc_analyses/convert_new_seqs_to_old_format/convert_new_seqs_to_old_format.Rmd
@@ -0,0 +1,73 @@
+
+
+
+```{r}
+library(tidyverse)
+library(readxl)
+library(metacoder)
+```
+
+Martha sent us this data (email "RPS10 database" on sep 6, 2022) and Ricardo needs it in the format for the old database.
+
+
+## Read in input data
+
+Data from Martha: 
+
+```{r}
+raw_data <- read_xlsx('rps10database_updates_july22_mas.xlsx')
+```
+
+The first version of the database:
+
+```{r}
+old_seqs <- read_fasta('../../data/releases/release_1.fa')
+old_data <- map_dfr(names(old_seqs), function(header) {
+  parts <- strsplit(header, split = '|', fixed = TRUE)[[1]]
+  parts <- strsplit(parts, split = '=', fixed = TRUE)
+  map_dfc(parts, function(p) tibble(!!p[1] := p[2]))
+})
+```
+
+add column for genus
+
+```{r}
+raw_data$genus <- str_match(raw_data$name, "^([^ ]+) ")[, 2]
+old_data$genus <- str_match(old_data$name, "^([^_]+)_")[, 2]
+```
+
+check that all genera are in the old database so that we can use their taxonomy:
+
+```{r}
+stopifnot(all(raw_data$genus %in% old_data$genus))
+```
+
+Get below-genus taxonomy from old format:
+
+```{r}
+parts <- old_data$taxonomy[!duplicated(old_data$genus)] %>%
+  str_match(pattern = '(.+;.+;.+;.+;.+;.+);(.[^_]+)')
+tax_key <- setNames(parts[,2], parts[,3])
+```
+
+Apply taxonomy to new data
+
+```{r}
+raw_data$taxonomy <- paste0(tax_key[raw_data$genus], ';', raw_data$name) %>%
+  gsub(pattern = ' ', replacement = '_')
+```
+
+make fasta file output
+
+```{r}
+raw_data <- mutate(raw_data, header = paste0('name=', gsub(name, pattern = ' ', replacement = '_'),
+                                             '|strain=', "na",
+                                             '|ncbi_acc=', "na",
+                                             '|ncbi_taxid=', "na",
+                                             '|oodb_id=', 9000 + seq_len(nrow(raw_data)),
+                                             '|taxonomy=', taxonomy))
+paste0('>', raw_data$header, '\n', raw_data$sequence) %>%
+  write_lines(file = 'new_seqs_formatted.fasta')
+```
+
+
diff --git a/misc_analyses/convert_new_seqs_to_old_format/new_seqs_formatted.fasta b/misc_analyses/convert_new_seqs_to_old_format/new_seqs_formatted.fasta
diff --git a/misc_analyses/convert_new_seqs_to_old_format/rps10database_updates_july22_mas.xlsx b/misc_analyses/convert_new_seqs_to_old_format/rps10database_updates_july22_mas.xlsx
diff --git a/notes/database_format_and_update_ideas.Rmd b/notes/database_format_and_update_ideas.Rmd
@@ -1,3 +1,9 @@
+---
+output:
+  pdf_document: default
+  html_document: default
+  word_document: default
+---
 
 ## Google Drive folder structure
 

diff --git a/website/2020-10-22_release_1_rps10.fasta b/website/2020-10-22_release_1_rps10.fasta