-
Notifications
You must be signed in to change notification settings - Fork 34
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
GL-229: Add .loom support to Optimus (#253)
* add loom to optimus * add loom test * update optimus diagram * update documentation
- Loading branch information
Showing
13 changed files
with
371 additions
and
47 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# Image providing the ZARR-to-loom conversion tools on top of Python 3.7.4.
FROM python:3.7.4

# Install the pinned Python dependencies. --no-cache-dir keeps pip's download
# cache out of the image layer, which is never reused after the build.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Place the conversion scripts in /tools and expose them on PATH so they can
# be invoked by name.
RUN mkdir /tools
COPY optimus_zarr_to_loom.py /tools/
COPY unpackZARR.sh /tools/
ENV PATH=${PATH}:/tools/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
#!/usr/bin/env bash
#
# Build the zarr-to-loom Docker image from the current directory and tag it.
#
# Usage: bash build_docker.sh TAG

tag=$1

# Quote the expansion so the test always sees exactly one word, even when the
# tag is empty or contains whitespace.
if [ -z "$tag" ]; then
    echo -e "\nYou must provide a tag"
    echo -e "\nUsage: bash build_docker.sh TAG\n"
    exit 1
fi

# Stop with docker's exit status if the build fails; otherwise the push hint
# below would be printed (and the script would exit 0) for a missing image.
docker build . -t "quay.io/humancellatlas/zarr-to-loom:$tag" || exit $?
echo "You can now push with docker push quay.io/humancellatlas/zarr-to-loom:$tag"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
#!/usr/bin/env python3 | ||
import os | ||
import sys | ||
|
||
import zarr | ||
import loompy | ||
import scipy as sc | ||
from scipy.sparse import coo_matrix | ||
import argparse | ||
import numpy as np | ||
|
||
|
||
# Custom Exception | ||
class UnexpectedInputFormat(Exception):
    """Signal that the input ZARR content is not laid out as this script expects."""
|
||
|
||
def _read_expression_transposed(expression, chunk_row_size=10000, chunk_col_size=10000):
    """Read a dense 2-D array in rectangular chunks into a transposed COO matrix.

    Only one chunk of at most ``chunk_row_size`` x ``chunk_col_size`` values is
    held densely at a time; the non-zero entries of every chunk are collected
    and assembled into a single sparse matrix.

    Args:
        expression: 2-D array-like exposing ``shape``, ``dtype`` and 2-D slice
            indexing that returns a dense array.
        chunk_row_size: Maximum number of rows read per chunk.
        chunk_col_size: Maximum number of columns read per chunk.

    Returns:
        A ``scipy.sparse.coo_matrix`` of shape ``(ncols, nrows)``, i.e. the
        transpose of ``expression``.
    """
    nrows, ncols = expression.shape[0], expression.shape[1]

    row_parts = []
    col_parts = []
    value_parts = []
    for i in range(0, nrows, chunk_row_size):
        # Clip the final chunk to the array bounds so no row is lost.
        row_end = min(i + chunk_row_size, nrows)
        for j in range(0, ncols, chunk_col_size):
            col_end = min(j + chunk_col_size, ncols)
            block = sc.sparse.coo_matrix(expression[i:row_end, j:col_end])
            # Chunk-local indices are shifted back to whole-matrix coordinates.
            row_parts.append(block.row.astype(np.int64) + i)
            col_parts.append(block.col.astype(np.int64) + j)
            value_parts.append(block.data)

    if not value_parts:
        # Zero rows or zero columns: nothing was read, return an empty matrix.
        return sc.sparse.coo_matrix((ncols, nrows), dtype=expression.dtype)

    rows = np.concatenate(row_parts)
    cols = np.concatenate(col_parts)
    values = np.concatenate(value_parts)

    # Swapping the coordinate arrays yields the transposed matrix directly.
    return sc.sparse.coo_matrix((values, (cols, rows)), shape=(ncols, nrows))


def _named_columns(root, names_path, values_path):
    """Map each field name stored at ``names_path`` to its column at ``values_path``.

    The value array is read once, and only when at least one name is present.

    Args:
        root: Opened ZARR hierarchy supporting ``root[path][:]``.
        names_path: Path of the 1-D array holding the field names.
        values_path: Path of the 2-D array whose i-th column belongs to the
            i-th field name.

    Returns:
        A dict from field name to the matching column, in field order.
    """
    names = root[names_path][:]
    if names.shape[0] == 0:
        return {}
    values = root[values_path][:]
    return {names[i]: values[:, i] for i in range(names.shape[0])}


def main():
    """Convert the ZARR output for one sample into a loom file.

    Command-line arguments:
        --input-zarr: Path to the input ZARR directory.
        --output-loom: Path of the loom file to create; it must not exist yet.
        --sample-id: Sample identifier; arrays are read from
            ``/<sample-id>.zarr/...`` inside the store.

    Raises:
        SystemExit: If the input path is not a directory or the output path
            already exists.
        UnexpectedInputFormat: If the first gene metadata string field is not
            ``gene_name``.
    """
    # Parse the arguments
    description = """This script converts ZARR optimus output to loom format"""
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--input-zarr', dest="input_zarr_path", required=True,
                        help="Path to input ZARR file")
    parser.add_argument('--output-loom', dest="output_loom_path", required=True,
                        help="Path to output loom file")
    parser.add_argument('--sample-id', dest="sample_id", required=True,
                        help="Sample identifier")
    args = parser.parse_args()

    input_zarr_path = args.input_zarr_path
    output_loom_path = args.output_loom_path
    sample_id = args.sample_id

    # Checks on inputs
    if not os.path.isdir(input_zarr_path):
        sys.exit("Error: the input ZARR path is not a directory.")
    if os.path.exists(output_loom_path):
        sys.exit("Error: The output loom file exists!")

    # Open the ZARR
    store = zarr.DirectoryStore(input_zarr_path)
    root = zarr.open(store)
    prefix = f"/{sample_id}.zarr"

    # Expression matrix, read chunk by chunk to bound memory use and
    # transposed so that rows carry the gene attributes and columns the
    # cell attributes passed to loompy below.
    expr_sp_t = _read_expression_transposed(root[f"{prefix}/expression"])

    # ROW/GENE Metadata

    # Check that the first gene metadata column is the gene name as expected
    gene_string_names = root[f"{prefix}/gene_metadata_string_name"][:]
    if not (gene_string_names == 'gene_name')[0]:
        raise UnexpectedInputFormat("The first gene metadata item is not the gene_name")

    # Prepare row attributes (gene attributes), following loom file conventions
    row_attrs = {
        "Gene": root[f"{prefix}/gene_metadata_string"][:][0,],
        "Accession": root[f"{prefix}/gene_id"][:],
    }
    row_attrs.update(_named_columns(
        root,
        f"{prefix}/gene_metadata_numeric_name",
        f"{prefix}/gene_metadata_numeric",
    ))

    # COLUMN/CELL Metadata
    col_attrs = {"CellID": root[f"{prefix}/cell_id"][:]}
    col_attrs.update(_named_columns(
        root,
        f"{prefix}/cell_metadata_bool_name",
        f"{prefix}/cell_metadata_bool",
    ))
    col_attrs.update(_named_columns(
        root,
        f"{prefix}/cell_metadata_float_name",
        f"{prefix}/cell_metadata_float",
    ))

    # Generate the loom file
    loompy.create(output_loom_path, expr_sp_t, row_attrs, col_attrs)


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
asciitree==0.3.3 | ||
fasteners==0.15 | ||
h5py==2.9.0 | ||
loompy==2.0.17 | ||
monotonic==1.5 | ||
numcodecs==0.6.3 | ||
numpy==1.17.1 | ||
pandas==0.25.1 | ||
python-dateutil==2.8.0 | ||
pytz==2019.2 | ||
scipy==1.3.1 | ||
six==1.12.0 | ||
zarr==2.3.2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
#!/usr/bin/env bash
#
# Copy every file found under the input directory into the output directory,
# turning each '!' in the file's base name into a '/' so that flat file names
# are expanded into a nested directory tree.

# Usage function
function usage() {
    echo "Usage: $0 -i input_directory -o output_directory"
}

# Read the CLI arguments
while getopts ":i:o:h" opt; do
    case "${opt}" in
        i)
            inputDir=$OPTARG
            ;;
        o)
            outputDir=$OPTARG
            ;;
        h)
            usage
            exit 0;
            ;;
        *)
            # Unknown option or missing option argument
            usage
            exit 1;
            ;;
    esac
done
shift $((OPTIND -1))

# Check the inputs
if [ -z "$inputDir" ]; then
    echo "Error: input directory (-i) not specified"
    exit 1;
fi
if [ -z "$outputDir" ]; then
    echo "Error: output directory (-o) not specified"
    exit 1;
fi
if [ ! -d "$inputDir" ]; then
    echo "Error the input directory path is not a directory";
    exit 1;
fi
if [ ! -d "$outputDir" ]; then
    echo "Error; the output directory path is not a directory";
    # A bare `exit` would return the status of the echo above (0) and report
    # this failure as success, so exit non-zero like the other checks.
    exit 1;
fi

# Do the conversion
# NUL-delimited find output keeps file names with spaces or newlines intact.
while IFS= read -r -d '' f
do
    bname1=$(basename "${f}")
    # '!' in the flat file name stands for a path separator
    rel_final_path=$(echo "${bname1}" | tr '!' '/')
    rel_final_dirname=$(dirname "${rel_final_path}")
    rel_final_bname=$(basename "${rel_final_path}")
    mkdir -p "${outputDir}/${rel_final_dirname}"
    cp "${f}" "${outputDir}/${rel_final_dirname}/${rel_final_bname}"
done < <(find "${inputDir}" -type f -print0 )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.