# -*- coding: utf-8 -*-
"""Contains generic functions used by FRAG"""
import gzip
from multiprocessing import Pool
from functools import partial
from collections import defaultdict
import os
import sys
import ahocorasick
from frags import read
def get_reference(input_file):
    """ Get the reference genome as a single string.
    Only the first sequence of the file is used.
    Can be fasta or fastq, gzipped or not.

    :param input_file: fasta/fastq file to use as reference
    :type input_file: str
    """
    # The reference that will be returned
    ref = ""
    # Is it a GZIP file? Check the magic number
    with open(input_file, "rb") as test_file:
        magic = test_file.read(2)
    # Open the file, GZIP or not
    with (gzip.open(input_file, "rt") if magic == b"\x1f\x8b"
          else open(input_file)) as in_file:
        # Get the first char to determine the type of file
        first_char = in_file.read(1)
        # Skip the rest of the header line
        next(in_file)
        # Fasta file, can be multi-line
        if first_char == ">":
            # First line of the sequence
            tmp_line = in_file.readline().strip()
            while tmp_line and not tmp_line.startswith(">"):
                ref += tmp_line
                tmp_line = in_file.readline().strip()
        # Fastq file
        elif first_char == "@":
            # The second line holds the whole sequence
            ref = in_file.readline().strip()
        else:
            print("File error: unable to determine the type of file {} "
                  "({})".format(input_file, first_char))
            sys.exit(1)
    # Return the ref as a single string
    return ref
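
# Usage sketch for get_reference (illustrative only: "reference.fasta" is a
# hypothetical path, not shipped with this module):
#
#     ref = get_reference("reference.fasta")
#     print(len(ref))  # length of the first sequence of the file
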
def next_read(file, offset_start, offset_end):
    """ Return each sequence whose header starts within an offset range
    of a file, as a (header, seq) tuple, using a generator.
    Can be fasta or fastq, gzipped or not.
    WARNING: spaces in headers are replaced by _

    :param file: fasta/fastq file to read
    :param offset_start: offset in the file from which to read
    :param offset_end: offset in the file up to which to read
    :type file: str
    :type offset_start: int
    :type offset_end: int
    """
    # Is it a GZIP file? Check the magic number
    with open(file, "rb") as test_file:
        magic = test_file.read(2)
    # Open the file, GZIP or not (binary mode, so tell()/seek() work)
    with (gzip.open(file, "rb") if magic == b"\x1f\x8b"
          else open(file, "rb")) as in_file:
        first_line = in_file.readline().decode("utf-8")
        # FASTQ file
        if first_line.startswith("@"):
            # Go to the starting offset
            in_file.seek(offset_start)
            # Offset of the beginning of the current line
            beg_line_offset = offset_start
            # Read each line from this point
            for line in in_file:
                # Consider this line as a header
                header = line.decode("utf-8").strip()
                # It is a proper fastq header
                if header.startswith("@"):
                    # The beginning of the header is in the offset range
                    if beg_line_offset < offset_end:
                        # Get the sequence
                        sequence = in_file.readline().decode("utf-8").strip()
                        # Skip the next two lines ("+" separator and qualities)
                        in_file.readline()
                        in_file.readline()
                        # Return header and sequence and wait for the next one
                        # WARNING: replace spaces in header by _
                        yield (header.replace(" ", "_"), sequence.upper())
                    # Out of the offset range, stop this loop
                    else:
                        break
                # Offset of the beginning of the next line
                beg_line_offset = in_file.tell()
        # (multi?)FASTA file
        elif first_line.startswith(">"):
            # Go to the starting offset
            in_file.seek(offset_start)
            # Offset of the beginning of the current line
            beg_line_offset = offset_start
            # Read each line from this point
            for line in in_file:
                # Consider this line as a header
                header = line.decode("utf-8").strip()
                # It is a proper fasta header
                if header.startswith(">"):
                    # The beginning of the header is in the offset range
                    if beg_line_offset < offset_end:
                        # Get the sequence
                        sequence = in_file.readline().decode("utf-8").strip()
                        # Get current offset
                        current_offset = in_file.tell()
                        # Get next line
                        next_l = in_file.readline().decode("utf-8").strip()
                        # While this next line is not a fasta header...
                        while next_l and not next_l.startswith(">"):
                            # Append it to the sequence
                            sequence += next_l
                            # Get current offset
                            current_offset = in_file.tell()
                            # Get next line
                            next_l = in_file.readline().decode("utf-8").strip()
                        # Next line is a fasta header, go back to its beginning
                        in_file.seek(current_offset)
                        # Return header and sequence and wait for the next one
                        # WARNING: replace spaces in header by _
                        yield (header.replace(" ", "_"), sequence.upper())
                    # Out of the offset range, stop this loop
                    else:
                        break
                # Offset of the beginning of the next line
                beg_line_offset = in_file.tell()
        # Not a valid file
        else:
            print("File error: unable to determine the type of file {} "
                  "({})".format(file, first_line[0]))
            sys.exit(1)
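
# Usage sketch for next_read (illustrative only: "reads.fastq.gz" is a
# hypothetical path; offsets 0 and os.path.getsize(...) cover the whole file):
#
#     for header, seq in next_read("reads.fastq.gz", 0,
#                                  os.path.getsize("reads.fastq.gz")):
#         print(header, len(seq))
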
def build_graph(ref, k):
    """ Index each k-mer of a genome.
    Aho-Corasick implementation, requires the pypi package pyahocorasick

    :param ref: the reference to index
    :param k: k-mer size
    :type ref: str
    :type k: int
    """
    # Create the automaton
    graph = ahocorasick.Automaton()
    # Add each k-mer of ref to the automaton
    for i in range(len(ref) - k + 1):
        # The current k-mer
        pattern = ref[i:i+k].upper()
        # Add the k-mer, keyed by its sequence, valued by (position, k-mer)
        added = graph.add_word(pattern, (i, pattern))
        # If this k-mer was already added
        if not added:
            # Print a warning
            print("Warning: kmer {} at position {} is present more "
                  "than once in the reference, only the last occurrence "
                  "will be used".format(pattern, i))
    # Construct the automaton (finalize the Aho-Corasick links)
    graph.make_automaton()
    # Return the graph
    return graph
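
# Minimal sketch of build_graph on a made-up reference; "ACGTTGCA" has five
# distinct 4-mers, so no duplicate-kmer warning is printed:
#
#     graph = build_graph("ACGTTGCA", 4)
#     # graph now maps e.g. "ACGT" -> (0, "ACGT") and "TGCA" -> (4, "TGCA")
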
def find_hits(graph, a_read):
    """ Find all k-mers of the reference present in a read.
    All hits are of the form:
    start_pos_read: start_pos_ref

    :param graph: the graph to parse
    :param a_read: the read to search in the index
    :type graph: :py:class:`pyahocorasick`
    :type a_read: str
    """
    # Results
    hits = {}
    # Search the hits; iter() yields (end_pos_read, (start_pos_ref, kmer))
    for end_pos, (start_pos_ref, kmer) in graph.iter(a_read):
        # Key the hit by the starting position of the k-mer in the read
        hits[end_pos - len(kmer) + 1] = start_pos_ref
    # Return the hits
    return hits
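
# Minimal sketch of find_hits, continuing the made-up example above:
#
#     graph = build_graph("ACGTTGCA", 4)
#     find_hits(graph, "AACGTT")
#     # -> {1: 0, 2: 1}: "ACGT" starts at read pos 1 / ref pos 0,
#     #    "CGTT" starts at read pos 2 / ref pos 1
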
def reverse_complement(seq):
    """ Take an input sequence and return its reverse complement.

    :param seq: the sequence to reverse-complement
    :type seq: str
    """
    # Resulting sequence
    ret = ""
    # Complement of each nucleotide
    rc_val = {"A": "T", "T": "A", "C": "G", "G": "C", "N": "N"}
    # For each nucleotide, from the end of the sequence
    for nucl in seq[::-1]:
        try:
            # Add its complement to ret
            ret += rc_val[nucl.upper()]
        except KeyError:
            print("Ignoring character '{}' found in {}".format(nucl, seq))
    # Return ret
    return ret
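
# For instance:
#
#     reverse_complement("AAGC")  # -> "GCTT"
#     reverse_complement("ACGT")  # -> "ACGT" (its own reverse complement)
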
def get_recombinations(offset_start, offset_end, file, k, gap, graph1, graph2=None):
    """ Main parallelized function: retrieve each read of an offset
    range and find its matches and breakpoints.

    :param offset_start: where to start taking sequences in the file
    :param offset_end: where to stop taking sequences in the file
    :param file: the filename of the file where to take sequences from
    :param k: size of k-mers
    :param gap: maximum allowed gap size for continuous hits
    :param graph1: the graph to parse for genome1
    :param graph2: the graph to parse for genome2
    :type offset_start: int
    :type offset_end: int
    :type file: str
    :type k: int
    :type gap: int
    :type graph1: :py:class:`pyahocorasick`
    :type graph2: :py:class:`pyahocorasick` or None
    """
    # Resulting Reads of the current offset range
    all_queries = []
    # Query each read, one by one, in the offset range
    for header, sequence in next_read(file, offset_start, offset_end):
        # Construct the Read
        query = read.Read(header, sequence)
        # Find hits for this query
        hits = find_hits(graph1, query.sequence)
        # Transform hits into matches
        query.get_matches(hits, gap, k, 0, 1)
        # Find hits for this query (reverse complement)
        hits_rc = find_hits(graph1, reverse_complement(query.sequence))
        # Transform hits into matches
        query.get_matches(hits_rc, gap, k, 1, 1)
        # Two reference files
        if graph2:
            # Find hits for this query
            hits = find_hits(graph2, query.sequence)
            # Transform hits into matches
            query.get_matches(hits, gap, k, 0, 2)
            # Find hits for this query (reverse complement)
            hits_rc = find_hits(graph2, reverse_complement(query.sequence))
            # Transform hits into matches
            query.get_matches(hits_rc, gap, k, 1, 2)
        # Create the breakpoints
        query.get_breakpoints()
        # Add this query to the result
        all_queries.append(query)
    # Return the result of this offset range
    return all_queries
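
# Usage sketch for get_recombinations on a whole file at once, i.e. a single
# chunk (hypothetical file names and k/gap values):
#
#     graph = build_graph(get_reference("genome1.fasta"), 20)
#     size = os.path.getsize("reads.fastq")
#     queries = get_recombinations(0, size, "reads.fastq", k=20, gap=2,
#                                  graph1=graph)
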
def get_all_queries(file, nb_proc, k, gap, graph1, graph2=None):
    """ Launch all parallel processes to get all queries from a file.

    :param file: the filename of the file where to take sequences from
    :param nb_proc: number of processes to run in parallel
    :param k: size of k-mers
    :param gap: maximum allowed gap size for continuous hits
    :param graph1: the graph to parse for genome1
    :param graph2: the graph to parse for genome2
    :type file: str
    :type nb_proc: int
    :type k: int
    :type gap: int
    :type graph1: :py:class:`pyahocorasick`
    :type graph2: :py:class:`pyahocorasick` or None
    """
    # Get the size of the file
    total_size = os.path.getsize(file)
    # Size of each chunk to read
    chunk_size = total_size // nb_proc
    # Starting offset
    offset_start = 0
    # Create the pool of processes
    pool = Pool(processes=nb_proc)
    # Partial function to fix all arguments but the offsets
    prod_recomb = partial(get_recombinations, file=file, k=k, gap=gap,
                          graph1=graph1, graph2=graph2)
    # All tuples of (offset_start, offset_end)
    all_offsets = []
    # For each process/chunk
    for _ in range(nb_proc - 1):
        # Compute the ending offset for this chunk
        offset_end = offset_start + chunk_size
        # Add this pair of start/end offsets
        all_offsets.append((offset_start, offset_end))
        # The next chunk starts where this one stops
        offset_start = offset_start + chunk_size
    # Add the last chunk, up to the end of the file
    all_offsets.append((offset_start, total_size))
    # Launch all processes (results is a list of lists)
    results = pool.starmap(prod_recomb, all_offsets)
    # Release the workers
    pool.close()
    pool.join()
    # Flatten the list of lists
    all_queries = []
    for res in results:
        all_queries += res
    # Return all queries
    return all_queries
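
# Typical pipeline sketch up to this point (hypothetical file names and
# parameter values):
#
#     ref1 = get_reference("genome1.fasta")
#     graph1 = build_graph(ref1, 20)
#     all_queries = get_all_queries("reads.fastq", nb_proc=4, k=20, gap=2,
#                                   graph1=graph1)
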
def prepare_blast_file(breakpoint_file, all_queries, minsizeblast):
    """ Prepare a fasta file to be Blasted, containing all breakpoints
    of at least minsizeblast nucleotides.
    WARNING: this writes a FASTA file, regardless of the format of
    the original file
    WARNING: headers of the original file are modified to record which
    breakpoint(s) of a specific read are Blasted: original_header_#bp

    :param breakpoint_file: the filename of the file to be written
    :param all_queries: all queries that may contain breakpoints
    :param minsizeblast: minimal size of breakpoint accepted
    :type breakpoint_file: str
    :type all_queries: list(:py:class:`Read`)
    :type minsizeblast: int
    """
    # This dict will contain the headers (minus the first '>') of the
    # breakpoints to Blast, as they are written, associated with their
    # index in all_queries.
    all_breakpoints = {}
    # Open the file to be Blasted
    with open(breakpoint_file, "w") as out_brp:
        # Parse each query and extract potential breakpoints
        for index, query in enumerate(all_queries):
            # Only the matched ones
            if query.get_ref() != 0:
                # Get all breakpoints and their id
                for nb_brp, brp in enumerate(query.breakpoints):
                    # Keep only breakpoints big enough to Blast
                    # (their size can be negative, hence abs())
                    if abs(brp.size) >= minsizeblast:
                        # Forge the header: originalheader_idBreakpoint;
                        # the breakpoint count starts at 1
                        header = "{}_{}".format(query.header[1:], nb_brp + 1)
                        # Associate it with its index in all_queries
                        all_breakpoints[header] = index
                        # Compute start/stop positions (size can be negative)
                        beg = min(brp.beg_pos_read, brp.beg_pos_read + brp.size)
                        end = max(brp.beg_pos_read, brp.beg_pos_read + brp.size)
                        # Write it to the file
                        out_brp.write(">{}\n{}\n".format(
                            header, query.sequence[beg:end]))
    # Return the main dict of breakpoints/index
    return all_breakpoints
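
# Usage sketch for prepare_blast_file (hypothetical output name and threshold):
#
#     all_breakpoints = prepare_blast_file("breakpoints.fasta", all_queries,
#                                          minsizeblast=25)
#     # "breakpoints.fasta" can now be given to Blast; all_breakpoints maps
#     # each written header to the index of its read in all_queries
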
def process_blast_res(compressed_file, res_blast_file, sep, all_breakpoints):
    """ Compress a Blast result to show only the best hits, and output
    the result in a fasta-like file. Each header is the original header
    with the breakpoint id, the e-value and the bit-score.
    WARNING: this writes a FASTA file, regardless of the format of
    the original file

    :param compressed_file: the filename of the file to be written
    :param res_blast_file: Blast result file
    :param sep: separators to use in the result file
    :param all_breakpoints: dict of breakpoints/index created before the Blast
    :type compressed_file: str
    :type res_blast_file: str
    :type sep: list(char)
    :type all_breakpoints: dict
    """
    # Dict that will contain each index in all_queries associated with
    # the list of breakpoints that have a Blast result
    all_bp_query = defaultdict(list)
    # Open the output file and the Blast result file
    with open(compressed_file, "w") as compressed, \
            open(res_blast_file) as blast_out:
        # Get the four fields of the first line
        cur_header, cur_matches, cur_evalue, cur_bitscore = \
            blast_out.readline().split()
        # Key is the index in all_queries; the Blasted breakpoint id is added
        all_bp_query[all_breakpoints[cur_header]].append(
            cur_header.split("_")[-1])
        # Forge the header for this first line
        fasta_header = ">{0}{1}{2}{1}{3}\n".format(cur_header, sep[1],
                                                   cur_evalue, cur_bitscore)
        # Remove some Blast info
        small_match = cur_matches.split("location")[0][:-2]
        # Set of matches for this header
        fasta_line = {small_match}
        # For each other line of the Blast result file
        for line in blast_out:
            # Info from this line
            header, matches, evalue, bitscore = line.split()
            # Same header as before?
            if header == cur_header:
                # Same bit-score/e-value?
                if evalue == cur_evalue and bitscore == cur_bitscore:
                    # Remove some Blast info
                    small_match = matches.split("location")[0][:-2]
                    # Add this Blast result
                    fasta_line.add(small_match)
            # New header
            else:
                # Write the previous header
                compressed.write(fasta_header)
                # Write the corresponding results
                compressed.write(sep[2].join(fasta_line) + "\n")
                # Get the new header, match, e-value and bit-score
                cur_header = header
                cur_matches = matches
                cur_evalue = evalue
                cur_bitscore = bitscore
                # Associate the current breakpoint to its index
                # in all_queries
                all_bp_query[all_breakpoints[cur_header]].append(
                    cur_header.split("_")[-1])
                # Forge the new header
                fasta_header = ">{0}{1}{2}{1}{3}\n".format(cur_header, sep[1],
                                                           cur_evalue,
                                                           cur_bitscore)
                # Remove some Blast info
                small_match = cur_matches.split("location")[0][:-2]
                # Set of matches for this header
                fasta_line = {small_match}
        # Write the last header
        compressed.write(fasta_header)
        # Write the corresponding results
        compressed.write(sep[2].join(fasta_line) + "\n")
    # Return Blasted breakpoints and associated indexes in all_queries
    return all_bp_query
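
# Usage sketch for process_blast_res (hypothetical file names; sep is a list
# of separator characters, of which only sep[1] and sep[2] are used here):
#
#     all_bp_query = process_blast_res("blast_compressed.fasta",
#                                      "blast_result.txt",
#                                      [",", "|", ";"], all_breakpoints)
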