Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 20 additions & 20 deletions vcf/src/headers.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
use std::collections::HashMap;

#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Header<'src> {
pub key: &'src str,
pub value: HeaderValue<'src>,
pub struct Header {
pub key: String,
pub value: HeaderValue,
}

#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HeaderValue<'src> {
Flat(&'src str),
Nested(HashMap<&'src str, &'src str>),
pub enum HeaderValue {
Flat(String),
Nested(HashMap<String, String>),
}

#[cfg(test)]
Expand All @@ -34,15 +34,15 @@ mod tests {
headers,
vec![
Header {
key: "fileformat",
value: HeaderValue::Flat("VCFv1.4"),
key: "fileformat".to_string(),
value: HeaderValue::Flat("VCFv1.4".to_string()),
},
Header {
key: "INFO",
key: "INFO".to_string(),
value: HeaderValue::Nested(HashMap::from([
("abc", "123"),
("xyz", "3125"),
("sfh", "574"),
("abc".to_string(), "123".to_string()),
("xyz".to_string(), "3125".to_string()),
("sfh".to_string(), "574".to_string()),
])),
},
],
Expand All @@ -58,11 +58,11 @@ mod tests {
header,
Ok(
Header {
key: "FORMAT",
key: "FORMAT".to_string(),
value: HeaderValue::Nested(HashMap::from([
("abc", "123"),
("xyz", "3125"),
("sfh", "1,574"),
("abc".to_string(), "123".to_string()),
("xyz".to_string(), "3125".to_string()),
("sfh".to_string(), "1,574".to_string()),
])),
}
)
Expand All @@ -78,11 +78,11 @@ mod tests {
header,
Ok(
Header {
key: "FORMAT",
key: "FORMAT".to_string(),
value: HeaderValue::Nested(HashMap::from([
("abc", "1,233"),
("xyz", "3125"),
("sfh", "157"),
("abc".to_string(), "1,233".to_string()),
("xyz".to_string(), "3125".to_string()),
("sfh".to_string(), "157".to_string()),
])),
}
)
Expand Down
22 changes: 16 additions & 6 deletions vcf/src/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,30 @@ lazy_static! {
static ref HEADER_VALUE_REGEX: Regex = Regex::new(r#"(?:[^,"]+|(?:"[^"]*"))+"#).unwrap();
}

impl<'src> Header<'src> {
pub fn parse(input: &'src str) -> Result<Self, ParseError> {
/// Turns a map of borrowed string slices into a map of owned `String`s.
///
/// Every key and every value of `hm` is copied into a freshly allocated
/// `String`; the input map is consumed.
pub fn convert_to_string(hm: HashMap<&str, &str>) -> HashMap<String, String> {
    let mut owned = HashMap::with_capacity(hm.len());
    for (name, text) in hm {
        owned.insert(String::from(name), String::from(text));
    }
    owned
}


impl Header {
pub fn parse(input: &str) -> Result<Self, ParseError> {
println!("Parsing header input: {}", input);
let line = input.trim();
let (key, value) = line.strip_prefix("##")
.and_then(|line| line.split_once('='))
.ok_or(ParseError)?;
let value = HeaderValue::parse(value)?;
Ok(Self { key, value })
Ok(Self { key: key.to_string(), value: value })
}
}

impl<'src> HeaderValue<'src> {
pub fn parse(input: &'src str) -> Result<Self, ParseError> {
impl HeaderValue {
pub fn parse(input: &str) -> Result<Self, ParseError> {
match input.strip_prefix('<').and_then(|input| input.strip_suffix('>')) {
None => Ok(Self::Flat(input)),
None => Ok(Self::Flat(input.to_string())),
Some(pairs) => {
HEADER_VALUE_REGEX.captures_iter(pairs)
.map(|c| c.get(0).unwrap().as_str())
Expand All @@ -38,6 +47,7 @@ impl<'src> HeaderValue<'src> {
}
)
.collect::<Result<HashMap<_, _>, _>>()
.map(convert_to_string)
.map(HeaderValue::Nested)
}
}
Expand Down
9 changes: 6 additions & 3 deletions vcf/src/validate_fileformat.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,19 +25,22 @@ mod tests {

#[test]
fn is_valid_if_key_is_fileformat() {
let header = Header {key: "fileformat", value: Flat("VCFv4.4")};
let header = Header {key: "fileformat".to_string(), value: Flat("VCFv4.4".to_string())};
assert!(is_valid_file_format(&header));
}

#[test]
fn is_invalid_if_key_is_not_fileformat() {
let header = Header {key: "gileformat", value: Flat("VCFv4.4")};
let header = Header {key: "gileformat".to_string(), value: Flat("VCFv4.4".to_string())};
assert!(!is_valid_file_format(&header));
}

#[test]
fn is_invalid_if_header_value_nested() {
let header = Header {key: "fileformat", value: Nested(HashMap::from([("another_key", "VCFv4.4")])) };
let header = Header {
key: "fileformat".to_string(),
value: Nested(HashMap::from([("another_key".to_string(), "VCFv4.4".to_string())]))
};
assert!(!is_valid_file_format(&header));
}
}
100 changes: 91 additions & 9 deletions vcf/src/vcf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use crate::parse;

pub struct VCF {
pub file_format: String,
pub format: Vec<Header>,
}

#[derive(Debug)]
Expand Down Expand Up @@ -61,7 +62,10 @@ impl From<parse::ParseError> for VCFError {
/// 20 1234567 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
/// "#;
///# use vcf::vcf::VCFError;
/// let vcf = parse_vcf(&vcf_source[..])?;
/// let vcf = match parse_vcf(&vcf_source[..]) {
/// Ok(vcf) => vcf,
/// Err(_) => panic!("Error when we should be ok")
/// };
/// assert_eq!(vcf.file_format, "VCFv4.4");
///# Ok::<(), VCFError>(())
/// ```
Expand Down Expand Up @@ -101,15 +105,93 @@ impl From<parse::ParseError> for VCFError {
/// _ => assert!(false),
/// };
/// ```
///
/// Similarly, the parsed `FORMAT` header lines of a file are available through the `format` field of the returned `VCF`.
///
/// ```
/// use std::collections::HashMap;
/// use vcf::vcf::parse_vcf;
/// use vcf::{Header, HeaderValue};
/// let vcf_source = br#"##fileformat=VCFv4.4
/// ###fileDate=20090805
/// ###source=myImputationProgramV3.1
/// ###reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta
/// ###contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="Homo sapiens",taxonomy=x>
/// ###phasing=partial
/// ###INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
/// ###INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
/// ###INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
/// ###INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
/// ###INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
/// ###INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
/// ###FILTER=<ID=q10,Description="Quality below 10">
/// ###FILTER=<ID=s50,Description="Less than 50% of samples have data">
/// ###FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
/// ###FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
/// ###FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
/// ###FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
/// ##CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
/// 20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
/// 20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3
/// 20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4
/// 20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2
/// 20 1234567 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
/// "#;
///# use vcf::vcf::VCFError;
/// let vcf = parse_vcf(&vcf_source[..])?;
/// let hq = vcf.format
/// .iter()
/// .find(
/// |item| match &item.value {
/// HeaderValue::Nested(d) => match d.get("ID") {Some(v) => v == "HQ", _ => false},
/// _ => false
/// }
/// ).unwrap();
/// assert_eq!(
/// *hq,
/// Header {
/// key: "FORMAT".to_string(),
/// value: HeaderValue::Nested(HashMap::from([
/// ("ID".to_string(), "HQ".to_string()),
/// ("Number".to_string(), "2".to_string()),
/// ("Type".to_string(), "Integer".to_string()),
/// ("Description".to_string(), "Haplotype Quality".to_string()),
/// ]))
/// }
/// );
///# Ok::<(), VCFError>(())
/// ```
pub fn parse_vcf(source: impl BufRead) -> Result<VCF, VCFError> {
let first_line = source.lines().next().ok_or(VCFError::ParseError)??;
let mut lines = source.lines();
let first_line = lines.next().ok_or(VCFError::ParseError)??;
let parsed = Header::parse(&first_line)?;
if is_valid_file_format(&parsed) {
match parsed.value {
Flat(s) => Ok(VCF {file_format: s.to_string()}),
_ => panic!(),
}
} else {
Err(VCFError::ParseError)
if !is_valid_file_format(&parsed) {
return Err(VCFError::ParseError)
}
let file_format = match parsed.value {
Flat(s) => s.to_string(),
_ => panic!(),
};
let formats = lines
.take_while(|s| match s { Ok(s) => s.starts_with("##"), _ => true})
.map(
|result| match result {
Ok(ref line) => Header::parse(line).map_err(VCFError::from),
Err(e) => Err(VCFError::IoError(e)),
}
)
.filter(
|result| match result {
Ok(header) if header.key == "FORMAT" => true,
Err(_) => true,
_ => false,
}
)
.collect::<Result<Vec<_>, _>>()?;
Ok(
VCF {
file_format: file_format.to_string(),
format: formats,
}
)
}