Source code for frags.read

# -*- coding: utf-8 -*-

"""Contains class and functions related to read definition and use"""
from operator import attrgetter

[docs]class Read:
    """Define a read.

    :param header: header of the read
    :param sequence: sequence of the read
    :type header: str
    :type sequence: str
    """
    def __init__(self, header, sequence):
        self.header = header
        self.sequence = sequence
        self.matches = []
        self.breakpoints = []
        self.all_strand = []
        self.all_ref = []
        self.bp_blasted = [] # Breakpoints that have Blast hit

    # Equality between two Read
    def __eq__(self, other):
        if isinstance(self, other.__class__):
            return self.__dict__ == other.__dict__
        return False

    # self representation for print
    def __repr__(self):
        return repr((self.header, self.sequence, self.matches,
             self.breakpoints, self.get_strand(), self.get_ref(),
             self.bp_blasted))

[docs]    def get_strand(self):
        """ Compute the strand of this Read
            (-1=nothing / 0=normal / 1=revcomp / 2=normal AND revcomp)
        """
        # Nothing
        strand = -1
        # If it has matches
        if self.all_strand:
            # Get all uniq strand
            all_strand_uniq = set(self.all_strand)
            # A single strand
            if len(all_strand_uniq) == 1:
                strand = self.all_strand[0]
            # Two strand
            else:
                strand = 2
        return strand

[docs]    def get_ref(self):
        """ Compute the ref of this Read
            (0=nothing / 1=ref1 / 2=ref2 / 3=ref1 AND ref2) """
        # Nothing
        ref = 0
        # If it has matches
        if self.all_ref:
            # Get all uniq ref
            all_ref_uniq = set(self.all_ref)
            # A single ref
            if len(all_ref_uniq) == 1:
                ref = self.all_ref[0]
            # Two ref
            else:
                ref = 3
        return ref

[docs]    def output(self, sep):
        """ Proper output of a line of the result file

        :param sep: Separator to use in CSV
        :type sep: list(char)
        """
        # Output the first info
        ret = self.header + sep[2] + str(self.get_strand()) + sep[2] + \
              str(len(self.breakpoints)) + sep[2]

        # Output all breakpoints
        if len(self.breakpoints) > 0:
            ret += sep[1].join([x.output(sep) for x in self.breakpoints])
        # No breakpoint
        else:
            ret += "None"
        # Tab before next column
        ret += sep[2]

        # Sort matches by position on the read
        matches = sorted(self.matches, key=attrgetter('beg_pos_read'))

        read_info = []
        ref_info = []
        sizes = []
        inserts = []
        # For each matches in position order
        for i in matches:
            read_info.append(i.output_read(sep[0]))
            ref_info.append(i.output_ref(sep[0]))
            sizes.append(i.size)
            inserts.append(sep[0].join(map(str, i.inserts)))

        # Clean output for matches read
        ret += sep[1].join(read_info) + sep[2]
        # Clean output for matches ref
        ret += sep[1].join(ref_info) + sep[2]
        # Clean output for sizes
        ret += sep[1].join(map(str, sizes)) + sep[2]
        # Clean output for insertions
        ret += sep[1].join(map(str, inserts)) + sep[2]
        # Output ids of all breakpoints that get successfully Blasted
        if len(self.bp_blasted) > 0:
            ret += sep[1].join(map(str, self.bp_blasted))
        # Nothing blasted
        else:
            ret += "None"
        # New line
        ret += "\n"

        return ret

[docs]    def add_a_match(self, match):
        """ Test if this match should be added or not.
            It must be added if it is not a subpart of an already added
            other match. In some case, some already added matches are
            subparts of the match to add. If so, they are removed.

        :param match: the match to add
        :type match: :py:class:`Match`
        """
        # Is this match a subpart of a match already added?
        to_add = True
        for i in self.matches:
            if match.is_include_in(i):
                to_add = False
                break
        # If we need to add this match
        if to_add:
            # Matches to be removed
            matches_to_remove = []
            # Is some matches included in the tested one?
            for i in self.matches:
                if i.is_include_in(match):
                    # Add it to be removed
                    matches_to_remove.append(i)
            # Remove them
            for i in matches_to_remove:
                # Remove it
                self.matches.remove(i)
                # Remove the corresponding ref/strand
                self.all_ref.remove(i.ref)
                self.all_strand.remove(i.strand)
            # Add the match
            self.matches.append(match)
            # Add ref/strand
            self.all_ref.append(match.ref)
            self.all_strand.append(match.strand)

[docs]    def get_matches(self, hits, gap, k, strand, ref):
        """ Populate matches list from all hits, for one strand

        :param hits: matching position on read and ref
        :param gap: maximum authorized gap size for continuous hits
        :param k: k-mer size
        :param strand: the strand of this hit
        :param ref: the reference index of this hit
        :type hits: dict
        :type gap: int
        :type k: int
        :type strand: int
        :type ref: int
        """
        # If there is some hits
        if hits:
            insert = []
            beg_pos_read = sorted(hits.keys())[0]
            beg_pos_ref = hits[beg_pos_read]
            prev_pos_read = beg_pos_read
            prev_pos_ref = beg_pos_ref

            for i in sorted(hits.keys()):
                # Is the read or ref NOT continuous (modulo gap)?
                if i >= prev_pos_read+gap+k+1 or \
                   hits[i] >= prev_pos_ref+gap+k+1 or \
                   hits[i] < prev_pos_ref:
                    # Add the previous good and complete match
                    size = prev_pos_read - beg_pos_read + k
                    match = Match(beg_pos_read, beg_pos_ref, strand, ref, size,
                                  insert, len(self.sequence))
                    # Add the match
                    self.add_a_match(match)
                    # Reinit insertions
                    insert = []
                    # New match
                    beg_pos_read = i
                    beg_pos_ref = hits[i]
                # Not completely contiguous
                else:
                    # Compute the size of insertion
                    insert_tmp = max(i - (prev_pos_read + k),
                                     hits[i] - (prev_pos_ref + k))
                    if insert_tmp > 0:
                        # Add this insert for this hit
                        insert.append(insert_tmp)
                # Set both prev_pos to current_pos
                prev_pos_read = i
                prev_pos_ref = hits[i]
            # Add the last one
            size = prev_pos_read - beg_pos_read + k
            match = Match(beg_pos_read, beg_pos_ref, strand, ref, size, insert,
                          len(self.sequence))
            # Add the match
            self.add_a_match(match)

[docs]    def get_breakpoints(self):
        """ Populate breakpoints list using all hits, for both strands """
        # Sort matches by position on the read
        matches = sorted(self.matches, key=attrgetter('beg_pos_read'))
        # If there is more than one match on this read
        if len(matches) > 1:
            # breakpoints are everything between two matches
            for i in range(1, len(matches)):
                # Starting  pos of bp is the ending pos of previous match
                # Warning, end_pos_read can be greater than beg_pos of
                # current match (overlapping reads). Breakpoint is then
                # of negative size.
                #start_bp = min(matches[i-1].end_pos_read,
                #               matches[i].beg_pos_read)
                start_bp = matches[i-1].end_pos_read
                # Size of bp is position of current match minus
                # ending pos of previous match
                # Warning, it can be negative (overlapping reads)
                size_bp = matches[i].beg_pos_read - matches[i-1].end_pos_read
                # Create the breakpoint
                break_point = Breakpoint(start_bp, size_bp)
                # Add it to breakpoint list for this read
                if break_point not in self.breakpoints:
                    self.breakpoints.append(break_point)


[docs]class Match:
    """Define a match.

    :param beg_pos_read: starting position in the read of this match
    :param beg_pos_ref: starting position in the ref of this match
    :param strand: strand of this match
    :param ref: the ref index for this match
    :param size: size of the match
    :param inserts: size of potential insertions (possible to have several insertions in ONE match)
    :param seq_l: size of the read (needed for rev comp computation)
    :type beg_pos_read: int
    :type beg_pos_ref: int
    :type strand: int
    :type ref: int
    :type size: int
    :type inserts: list(int)
    :type seq_l: int
    """
    def __init__(self, beg_pos_read, beg_pos_ref, strand, ref, size, inserts, \
                 seq_l):
        self.ref = ref
        self.strand = strand # 0 -> normal / 1 -> rc
        # Compute ending position on the read (not that useful anymore)
        # Normal strand, direct
        if self.strand == 0:
            self.beg_pos_read = beg_pos_read
            self.end_pos_read = beg_pos_read + size
        # Reverse complement: use total length of the read
        else:
            self.beg_pos_read = seq_l - size - beg_pos_read
            self.end_pos_read = seq_l - beg_pos_read
        # Positions on the ref
        self.beg_pos_ref = beg_pos_ref
        self.end_pos_ref = beg_pos_ref + size
        # Size of the match
        self.size = size
        # Potential insertions
        if not inserts:
            # No insertion
            self.inserts = [0]
        else:
            self.inserts = inserts

    # self representation for print
    def __repr__(self):
        return repr((self.strand, self.beg_pos_read, self.end_pos_read,
                     self.beg_pos_ref, self.end_pos_ref, self.size,
                     self.inserts, self.ref))

    # Equality between two Matches
    def __eq__(self, other):
        if isinstance(self, other.__class__):
            return self.__dict__ == other.__dict__
        return False

[docs]    def is_include_in(self, other):
        """ Check if this match is included in another match

        :param other: the match to compare with
        :type other: :py:class:`Match`
        """
        ret = False
        if self.beg_pos_read >= other.beg_pos_read and\
           self.end_pos_read <= other.end_pos_read:
            ret = True
        return ret

[docs]    def output_read(self, sep):
        """ Correct output of read infos

        :param sep: Separator to use in CSV
        :type sep: list(char)
        """
        # Reverse complement
        if self.strand:
            # Add ref
            ret = "(-)"
        else:
            ret = "(+)"
        # Match info
        ret += "({}){}{}{}".format(self.ref, self.beg_pos_read,
                                   sep[0], self.size)
        return ret

[docs]    def output_ref(self, sep):
        """ Correct output of ref infos

        :param sep: Separator to use in CSV
        :type sep: list(char)
        """
        return "({}){}{}{}".format(self.ref, self.beg_pos_ref,
                                   sep[0], self.size)


[docs]class Breakpoint:
    """Define a breakpoint.

    :param beg_pos_read: starting position in the read of this match
    :param size: size of the match
    :type beg_pos_read: int
    :type beg_pos_ref: int
    :type size: int
    """
    def __init__(self, beg_pos_read, size):
        self.beg_pos_read = beg_pos_read
        # Size of the match
        self.size = size

    # self representation for print
    def __repr__(self):
        return repr((self.beg_pos_read, self.size))

    # Equality between two Matches
    def __eq__(self, other):
        if isinstance(self, other.__class__):
            return self.__dict__ == other.__dict__
        return False

[docs]    def output(self, sep):
        """ Proper output of a line in the result file

        :param sep: Separator to use in CSV
        :type sep: list(char)
        """
        return str(self.beg_pos_read) + sep[0] + str(self.size)