Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions Config/configure_PRONTO.ini
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ inpred_node = OUS
data_path = /data/sample_data/analysis_results/
;Please modify this if you need to specify file encoding as "ISO-8859-1" or other encoding standards in your local environment to read special Norwegian characters.
encoding_sys = utf-8
;Specify the number of top filter sections with highest priority; these sections will be named starting with FILTER0 in this configuration. (NB: the other filter sections will use the combined results from all FILTER0):
top_filter = 3
;Specify the number of filter sections with low priority. (NB: this will also make the script generate that number of output tables):
filter_col_nu = 4
;Specify the number of max rows of the table per slide starting from the 8th slide in report. This is used to split long tables.
Expand Down
131 changes: 32 additions & 99 deletions Script/PRONTO.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,32 +229,6 @@ def read_tsv_col(data_file,filter_column,key_word,columns,MTB_format):
return data


def add_filter_column_into_table(data_config, filter_column_add):
    """Append an extra column to a tab-separated row table.

    Each row's cells are stripped of newlines and a trailing empty cell is
    dropped; the header row (index 0) then gets the new column name and every
    data row gets the literal value "Yes", both newline-terminated so the row
    stays line-final.
    """
    out_rows = []
    for row_idx, raw_row in enumerate(data_config):
        cells = [cell.replace('\n', '') for cell in raw_row]
        # A trailing empty cell is a leftover from the original line's newline.
        if cells[-1] == '':
            del cells[-1]
        cells.append(filter_column_add + '\n' if row_idx == 0 else 'Yes\n')
        out_rows.append(cells)
    return out_rows


def filter_depth_tumor_all_col(data_config, depth_tumor_DNA):
    """Keep the header plus every row whose tumor-DNA depth meets the threshold.

    The column is located by the exact header cell 'Depth_tumor_DNA\\t'; rows
    whose depth cell is empty (or is the header cell itself) are skipped, and
    the numeric part before the tab is compared against *depth_tumor_DNA*.
    """
    header = data_config[0]
    depth_idx = header.index('Depth_tumor_DNA\t')
    kept = [header]
    for row in data_config:
        cell = row[depth_idx]
        # Skip the header row and rows with no depth value.
        if cell == 'Depth_tumor_DNA\t' or cell == '':
            continue
        if int(cell.split('\t')[0]) >= depth_tumor_DNA:
            kept.append(row)
    return kept


def filter_depth_tumor_cols(data_config,depth_tumor_DNA):
data = [[] for n in range(len(data_config))]
data[0] = data_config[0]
Expand Down Expand Up @@ -1373,80 +1347,39 @@ def main(argv):
for i in range(0,filter_col_nu_config+1):
filter_section = str(i)
if(filter_section == "0"):
all_data_filter = []
top_filter = int(cfg.get("INPUT", "top_filter")) + 1
for top_filter_num in range(1,top_filter):
filter_column = cfg.get("FILTER0-"+str(top_filter_num), "filter_column")
key_word = cfg.get("FILTER0-"+str(top_filter_num), "key_word")
columns = cfg.get("FILTER0-"+str(top_filter_num), "columns")
try:
filter_column_add = cfg.get("FILTER0-"+str(top_filter_num), "filter_column_add")
except:
filter_column_add = ""
try:
filter_min_depth_tumor_DNA = int(cfg.get("FILTER0-"+str(top_filter_num), "min_depth_tumor_DNA"))
except:
filter_min_depth_tumor_DNA = ""
output_table = cfg.get("FILTER0-"+str(top_filter_num), "output_table")
output_table_file_config_pre = output_file_preMTB_table_path + "_" + output_table + "_pre.txt"
output_table_file_config = output_file_preMTB_table_path + "_" + output_table + ".txt"
if(',' in filter_column):
for column in filter_column.split(','):
all_data = read_tsv(data_file_small_variant_table,column,key_word)

else:
all_data = read_tsv(data_file_small_variant_table,filter_column,key_word)
if(filter_column_add != ""):
all_data = add_filter_column_into_table(all_data,filter_column_add)
if(filter_min_depth_tumor_DNA != ""):
all_data = filter_depth_tumor_all_col(all_data,filter_min_depth_tumor_DNA)
write_exl(output_table_file_config_pre,all_data)
clear_blank_line(output_table_file_config_pre,output_table_file_config)
all_data_filter.append(all_data)
concat_data = pandas.DataFrame()
topfilters = pronto.parse_topfilter(cfg, output_file_preMTB_table_path)
for filter in topfilters:
for filter_column in filter['filter_columns']:
small_variant_data = pandas.read.csv(data_file_small_variant_table, sep='\t')
try:
small_variant_data = pronto.filter_small_variant_data(small_variant_data, DNA_sampleID, filter_column, filter['key_word'])
except ValueError:
sys.exit(1)
if filter['filter_column_add']:
small_variant_data[filter['filter_column_add']] = 'Yes'
if filter['filter_depth_tumor_dna']:
small_variant_data = small_variant_data[small_variant_data['Depth_tumor_DNA'] >= filter['filter_min_depth_tumor_dna']]
small_variant_data.write_csv(filter['table_output_path'], sep='\t', index=False)
concat_data = pandas.concat([concat_data, small_variant_data])

all_data_filter = sum(all_data_filter, [])
for i in range(len(all_data_filter)):
if(i == 0):
header_length = len(all_data_filter[i])
else:
if(len(all_data_filter[i]) < header_length):
count = header_length - len(all_data_filter[i])
all_data_filter[i] = [[item.replace('\n', '') for item in cell] for cell in all_data_filter[i]]
all_data_filter[i].pop()
for j in range(1, count):
all_data_filter[i].append(' \t')
all_data_filter[i].append('\n')

unique_data = []
for current in all_data_filter:
if(current[-1] == '\n'):
current.pop()
if(current[-1].endswith('\n\t')):
current[-1] = current[-1].replace('\n\t', '\n')
is_appear = False
for existing in unique_data:
if(len(existing) > len(current)):
current_content = current.copy()
current_content[-1] = current[-1].replace('\n', '\t')
for i in range(len(existing) - len(current_content) + 1):
if(existing[i:i+len(current_content)] == current_content):
is_appear = True
# Only print last column for the rescued variants which are not include in Filter0-3.
if(existing[-1] == 'Yes\n'):
existing[-1] = '\n'
break
if is_appear:
break
if(existing == current):
is_appear = True
break
if not is_appear:
unique_data.append(current)

top_filter_output_file_pre = output_file_preMTB_table_path + "_preMTB_workingTable_pre.txt"
top_filter_output_file = output_file_preMTB_table_path + "_preMTB_workingTable.txt"
write_exl(top_filter_output_file_pre,unique_data)
clear_blank_line(top_filter_output_file_pre,top_filter_output_file)
concat_data = concat_data.drop_duplicates()
for i, irow in concat_data.iterrows():
for j, jrow in concat_data.iterrows():
# avoid comparing the same rows twice
if i >= j:
continue

# get the difference between the two rows and check if the only different is the filter column with NA value
diff = irow.compare(jrow)
if len(diff) == 1 and diff.isna().any().any():
# if yes, set the filter column with NA value to empty string in both rows
concat_data.at[i, diff.index.tolist()[0]] = ''
concat_data.at[j, diff.index.tolist()[0]] = ''
# remove duplicates
concat_data = concat_data.drop_duplicates()
top_filter_output_file = "{}_preMTB_workingTable.txt".format(output_file_preMTB_table_path)
concat_data.write_csv(top_filter_output_file, sep='\t', index=False)
continue

if(filter_section == "1"):
Expand Down
46 changes: 46 additions & 0 deletions pronto/pronto.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import configparser
import glob
import logging
import os
Expand Down Expand Up @@ -78,3 +79,48 @@ def add_table_name(shapes: pptx.shapes.shapetree.SlideShapes, table_name: str, l
paragraph.font.size = pptx.util.Pt(font_size)
paragraph.font.bold = True
paragraph.alignment = pptx.enum.text.PP_ALIGN.CENTER

# parse topfilter sections in config file and construct list of topfilter dictionaries
def parse_topfilter(cfg: configparser.ConfigParser, output_dir: str) -> list:
    """Parse all "FILTER0-*" sections of *cfg* into a list of filter dicts.

    Each dict contains every raw option of the section (configparser
    lowercases option names, so "min_depth_tumor_DNA" arrives as
    "min_depth_tumor_dna"), plus derived entries:

    - "filter_columns": the comma-split list from "filter_column"
    - "filter_column_add": optional extra column name, or None
    - "min_depth_tumor_dna": the depth threshold as int, or None
    - "filter_depth_tumor_dna" / "filter_min_depth_tumor_dna": the flag and
      int threshold under the key names the caller in PRONTO.py actually
      reads — previously absent, which made the caller raise KeyError
    - "pre_table_output_path" / "table_output_path": output file paths built
      from "output_table" under *output_dir*
      (NOTE(review): the caller passes a file-name prefix, not a directory —
      confirm os.path.join is the intended composition)
    """
    top_filters = []
    for section in cfg.sections():
        if not section.startswith("FILTER0-"):
            continue
        spec = dict(cfg[section])
        spec["filter_columns"] = spec["filter_column"].split(",")
        if "filter_column_add" not in spec:
            spec["filter_column_add"] = None
        # Convert the threshold to int once here; the caller compares it
        # against a numeric DataFrame column.
        raw_depth = spec.get("min_depth_tumor_dna")
        spec["min_depth_tumor_dna"] = int(raw_depth) if raw_depth is not None else None
        spec["filter_depth_tumor_dna"] = spec["min_depth_tumor_dna"] is not None
        spec["filter_min_depth_tumor_dna"] = spec["min_depth_tumor_dna"]
        spec["pre_table_output_path"] = os.path.join(output_dir, "{}_pre.txt".format(spec["output_table"]))
        spec["table_output_path"] = os.path.join(output_dir, "{}.txt".format(spec["output_table"]))
        top_filters.append(spec)
    return top_filters

# Filter small variant data based on keyword being present in filter column
def filter_small_variant_data(data: pandas.DataFrame, sample_id: str, filter_column: str, keyword: str) -> pandas.DataFrame:
    """Return the rows of *data* for *sample_id* matching *keyword* in *filter_column*.

    A keyword starting with '!' negates the match: the ' && '-separated keys
    are each removed from the result. Otherwise a comma-separated keyword list
    is treated as an OR (converted to a regex alternation for str.contains).

    Raises:
        ValueError: if a required column is missing from *data*, or if any
            row of the selected sample has IGV_QC == 'Not OK' without
            Class_judgement == 'exclude'.
    """

    # check if required columns are present in data
    for column_name in [filter_column, "IGV_QC", "Class_judgement", "SampleID"]:
        if column_name not in data.columns:
            logging.error("Column {} not found in data".format(column_name))
            raise ValueError

    # only consider data for the specified sample_id
    data = data[data['SampleID'] == sample_id]

    # check if IGV_QC is "Not OK" but Class_judgement is not "exclude"
    if data[(data['IGV_QC'] == "Not OK") & (data['Class_judgement'] != "exclude")].shape[0] > 0:
        logging.error("""Dataset error:
    IGV_QC is 'Not OK', but Class_judgement is not 'exclude'. Please check the QC Excel file and fix the mistake before run this script again!
    """)
        raise ValueError

    # filter according to keyword being present or not in filter column
    if keyword.startswith('!'):
        keys = keyword.replace('!', '').split(' && ')
        for key in keys:
            # na=False: a NaN cell cannot contain the key, so keep the row
            # after negation; without it, NA propagates into the boolean
            # mask and row selection breaks on missing values.
            data = data[~data[filter_column].str.contains(key, na=False)]  # remove rows that contain the key in filter column
    else:
        data = data[data[filter_column].str.contains(keyword.replace(',', '|'), na=False)]  # only keep rows that contain the keyword in filter column

    return data
Loading