-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsubsample.py
More file actions
173 lines (150 loc) · 6.08 KB
/
subsample.py
File metadata and controls
173 lines (150 loc) · 6.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import os
import sys
import subprocess as sp
import shutil
import h5py
import csv
import json
import pyarrow as pa
import pyarrow.parquet as pq
import scanpy as sc
# Read pipeline parameters from the CSV file given on the command line.
# Expected layout: a fixed header row followed by one data row with
# input_path, output_path, hd_resolution, xmin, xmax, ymin, ymax.
if len(sys.argv) != 2:
    print("Usage: python subsample.py <params.csv>")
    sys.exit(1)
params_arg = sys.argv[1]
with open(params_arg, 'r', newline='') as param_file:
    param_read = csv.reader(param_file, delimiter=',')
    header = next(param_read)
    # Validate explicitly rather than with `assert`, which is stripped under -O.
    if header != ['input_path', 'output_path', 'hd_resolution', 'xmin', 'xmax', 'ymin', 'ymax']:
        raise ValueError(f"Unexpected header in {params_arg}: {header}")
    params = next(param_read)
input_path = params[0]
output_path = params[1]
resolution = params[2]
# A non-empty hd_resolution (e.g. 'square_008um') selects the Visium HD layout.
hd = bool(resolution)
xmin = int(params[3])
xmax = int(params[4])
ymin = int(params[5])
ymax = int(params[6])
# Files the downstream pipeline needs, relative to the (possibly binned) input folder.
files = ['filtered_feature_bc_matrix.h5',
         'spatial/tissue_positions_list.csv',
         'spatial/scalefactors_json.json',
         'spatial/tissue_hires_image.png',
         'spatial/tissue_lowres_image.png']
if hd:
    # Visium HD stores positions as parquet inside binned_outputs/<resolution>.
    files[1] = 'spatial/tissue_positions.parquet'
    base_path = input_path
    input_path = input_path + '/binned_outputs/' + resolution
# Fail early with a clear error if any required input file is missing
# (a bare assert would disappear under `python -O`).
for file in files:
    print(f"Checking for {file} in {input_path}")
    required = os.path.join(input_path, file)
    if not os.path.exists(required):
        raise FileNotFoundError(f"Required input file not found: {required}")
# Create the output folder tree and, for HD runs, a slimmed-down
# feature_slice.h5 placeholder that downstream tooling expects to exist.
os.makedirs(output_path, exist_ok=True)
if hd:
    os.makedirs(output_path + '/binned_outputs/' + resolution + '/spatial')
    slice_src = os.path.join(base_path, 'feature_slice.h5')
    slice_dst = os.path.join(output_path, 'feature_slice.h5')
    shutil.copyfile(slice_src, slice_dst)
    # Delete every top-level object so only the HDF5 skeleton (and root
    # attributes) survives, keeping the dummy file as small as possible.
    # The with-block guarantees the file is closed even if a delete fails.
    with h5py.File(slice_dst, 'r+') as f:
        for obj in list(f.keys()):
            del f[obj]
    # h5repack reclaims the space freed by the deletions; check=True turns a
    # missing or failing h5repack into an immediate error instead of a
    # silent no-op followed by a confusing move failure.
    temp_slice = os.path.join(output_path, 'temp_feature_slice.h5')
    sp.run(['h5repack', slice_dst, temp_slice], check=True)
    shutil.move(temp_slice, slice_dst)
    output_path = output_path + '/binned_outputs/' + resolution
else:
    os.makedirs(output_path + '/spatial')
# Mirror the required files into the output layout.
for file in files:
    src = os.path.join(input_path, file)
    dst = os.path.join(output_path, file)
    print(f'Copying {src} to {dst}')
    shutil.copyfile(src, dst)
os.chdir(output_path)
# Keep only in-tissue spots whose array coordinates fall inside the requested
# [xmin, xmax] x [ymin, ymax] window; collect their barcodes in `codes` so the
# count matrix can be filtered to the same spots later.
codes = []
if hd:
    # Parquet columns are addressed by position here; the table written back
    # below assumes the order: barcode, in_tissue, array_row, array_col,
    # pxl_row_in_fullres, pxl_col_in_fullres (standard Visium HD layout —
    # NOTE(review): verify against the actual Space Ranger output schema).
    positions = pq.read_table(files[1])
    print(f"Original dataset: {positions.num_rows} spots")
    spot_table = [[] for _ in range(positions.num_columns)]
    for i in range(positions.num_rows):
        if positions[1][i].as_py() == 1:  # in_tissue flag
            # x is compared against array_col, y against array_row.
            if xmin <= positions[3][i].as_py() <= xmax and ymin <= positions[2][i].as_py() <= ymax:
                for j in range(positions.num_columns):
                    spot_table[j].append(positions[j][i].as_py())
                codes.append(positions[0][i].as_py())
    table = pa.table({'barcode': spot_table[0],
                      'in_tissue': spot_table[1],
                      'array_row': spot_table[2],
                      'array_col': spot_table[3],
                      'pxl_row_in_fullres': spot_table[4],
                      'pxl_col_in_fullres': spot_table[5]})
    os.remove(files[1])
    pq.write_table(table, files[1])
    print(f'Parquet with {len(codes)} spots created')
else:
    os.remove(files[1])
    # Context managers guarantee both CSVs are closed even if a row is malformed.
    # We read the original from input_path and write the filtered copy in the
    # current directory (output_path, after the chdir above).
    with open(files[1], 'w', newline='') as new_spots, \
         open(os.path.join(input_path, files[1]), 'r', newline='') as old_spots:
        writer = csv.writer(new_spots, delimiter=',')
        reader = csv.reader(old_spots, delimiter=',')
        xvals = []
        yvals = []
        for row in reader:
            if row[1] == '1':  # Only keep spots in tissue
                xvals.append(int(row[3]))
                yvals.append(int(row[2]))
                if xmin <= int(row[3]) <= xmax and ymin <= int(row[2]) <= ymax:
                    writer.writerow(row)
                    codes.append(row[0])
    print(f"Original dataset: {max(xvals)} by {max(yvals)} spots")
    print("CSV file processed")
# Filter the h5 matrix to the selected spots and the top 2000 highly variable
# genes, then rewrite the file in the 10x Genomics h5 layout in place.
adata = sc.read_10x_h5(files[0])
# Preserve raw counts: HVG selection needs log-normalized data, but the
# output matrix must contain the original counts.
adata.layers["counts"] = adata.X.copy()
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=2000, subset=True)
adata = adata[adata.obs_names.isin(codes), :].copy()
# Restore raw counts as X. The layer was already subset together with adata
# on the line above, so re-applying the barcode mask (as the original code
# did) is redundant — every remaining obs_name is in `codes`.
adata.X = adata.layers["counts"].copy()
adata.layers.clear()
# Rewrite the matrix datasets; the with-block guarantees the file is closed
# even if a dataset write fails.
with h5py.File(files[0], 'r+') as filt_mat:
    del filt_mat['matrix/barcodes']
    del filt_mat['matrix/data']
    del filt_mat['matrix/indices']
    del filt_mat['matrix/indptr']
    del filt_mat['matrix/shape']
    del filt_mat['matrix/features']
    # Assumes adata.X is CSR over (cells x genes) — scanpy reads 10x h5 that
    # way — whose data/indices/indptr arrays coincide with CSC over the
    # transposed (genes x cells) layout the 10x format stores; hence the
    # shape is written reversed. TODO(review): confirm X stays CSR here.
    filt_mat['matrix'].create_dataset('barcodes', data=adata.obs_names.tolist())
    filt_mat['matrix'].create_dataset('indptr', data=adata.X.indptr)
    filt_mat['matrix'].create_dataset('data', data=adata.X.data, compression='gzip')
    filt_mat['matrix'].create_dataset('indices', data=adata.X.indices, compression='gzip')
    filt_mat['matrix'].create_dataset('shape', data=adata.shape[::-1])
    filt_mat['matrix'].create_dataset('features/name', data=adata.var_names.tolist())
    filt_mat['matrix'].create_dataset('features/id', data=adata.var['gene_ids'].tolist())
    filt_mat['matrix'].create_dataset('features/feature_type', data=adata.var['feature_types'].tolist())
    filt_mat['matrix'].create_dataset('features/genome', data=adata.var['genome'].tolist())
# Repack to reclaim the space freed by the deleted datasets; check=True makes
# a failing h5repack an immediate error rather than a missing temp file later.
sp.run(['h5repack', files[0], 'temp_' + files[0]], check=True)
shutil.move('temp_' + files[0], files[0])
# The subsample ships only the low-resolution image, so present it as the
# high-resolution one and make the scale factors agree with that swap.
shutil.copyfile('spatial/tissue_lowres_image.png', 'spatial/tissue_hires_image.png')
scalefactor_path = 'spatial/scalefactors_json.json'
with open(scalefactor_path, 'r') as fh:
    scalefactors = json.load(fh)
scalefactors['tissue_hires_scalef'] = scalefactors['tissue_lowres_scalef']
with open(scalefactor_path, 'w') as fh:
    json.dump(scalefactors, fh, indent=4)
# Sanity check: the rewritten matrix must still load as a 10x h5 file.
test = sc.read_10x_h5(files[0])
print(f"Subsampled dataset: {test}")