Skip to content

Commit

Permalink
pairs: check each pair
Browse files Browse the repository at this point in the history
  • Loading branch information
kdm9 committed Jul 11, 2024
1 parent 4750278 commit 838ce31
Showing 1 changed file with 34 additions and 21 deletions.
55 changes: 34 additions & 21 deletions blsl/pairs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,33 +6,38 @@



def pairslash(header):
def parse_header(header):
name, *comment = header.split(" ", 1)
if name.endswith("/1") or name.endswith("/2"):
return header

i = None
if name.endswith("/1"):
i = 1
if name.endswith("/2"):
i = 2
m = re.match(r"@([^ ]+)/([12])", name)
if m:
return header, m[1], m[2], ""
if comment:
if comment[0].startswith("1:"):
i = 1
return header, name, "1", comment[0]
if comment[0].startswith("2:"):
i = 2
if i is None:
raise ValueError(f"Unable to guess pair from header '{header}'")
return f"{name}/{i} {comment[0]}"
return header, name, "2", comment[0]
raise ValueError(f"Unable to parse fastq header '{header}'")

def pairslash(header):
_, name, i, comment = parse_header(header)
if comment:
return f"{name}/{i} {comment}"
return f"{name}/{i}"

def check_pair(r1, r2):
_, n1, p1, c1 = parse_header(r1[0])
_, n2, p2, c2 = parse_header(r2[0])
if n1 != n2:
raise ValueError(f"Bad fastq pair: {r1}, {r2}")

def read_fileset(readfiles):
res = defaultdict(dict)
for readfile in readfiles:
stem, read = re.match(r"(.+?)[_.]?(R[12]|SE|IL|INTERLEAVED)?(_001)?\.(FQ|FASTQ)(\.GZ)?", readfile).groups()[:2]
read = read.upper()
if read == "INTERLEAVED":
read = "IL"
stem, read = re.match(r"(.+?)[_.]?(R[12]|SE|IL|INTERLEAVED)?(_001)?\.(FQ|FASTQ)(\.GZ)?", readfile, re.I).groups()[:2]
if read is not None:
read = read.upper()
if read == "INTERLEAVED":
read = "IL"
if read in res[stem]:
raise ValueError("Two files with same stem and read index: {readfiles}")
res[stem][read] = readfile
Expand All @@ -47,14 +52,22 @@ def output_read(args, read):

def handle_fileset(args, stem, fileset):
if "R1" in fileset and "R2" in fileset:
for r1, r2 in tqdm(zip(fqparse(fileset["R1"]), fqparse(fileset["R2"])), desc=stem):
r1s = fqparse(fileset["R1"])
r2s = fqparse(fileset["R2"])
for r1, r2 in tqdm(zip(r1s, r2s), desc=f"{stem} R1/R2"):
check_pair(r1, r2)
output_read(args, r1)
output_read(args, r2)
if not (next(r1s, None) is None and next(r2s, None) is None):
raise ValueError(f"Uneven number of R1/R2 reads in {fileset['R1']} and {fileset['R2']}")
if "IL" in fileset:
for r in tqdm(fqparse(fileset("IL")), desc=stem):
for r in tqdm(fqparse(fileset["IL"]), desc=f"{stem} IL"):
output_read(args, r)
if "SE" in fileset:
for r in tqdm(fqparse(fileset("SE")), desc=stem):
for r in tqdm(fqparse(fileset["SE"]), desc=f"{stem} SE"):
output_read(args, r)
if None in fileset:
for r in tqdm(fqparse(fileset[None]), desc=stem):
output_read(args, r)


Expand Down

0 comments on commit 838ce31

Please sign in to comment.