
import logging

class Proteins():
    """
    Hold a list of protein sequences with their names, plus conversion helpers.

    Input may be FASTA text, whitespace separated sequences, one
    "name sequence" pair per line, or a list of ``{"name", "sequence"}`` dicts.
    Every sequence is upper-cased and validated against the configured
    alphabet; an invalid character raises ``ValueError``.

    >>> p=Proteins(">TestProtein\\nFNCLGMSNRDFLEGVSG")
    >>> p.sequences
    ['FNCLGMSNRDFLEGVSG']
    >>> p.names
    ['TestProtein']
    >>> p=Proteins(">TestProtein1\\nFNCLGMSNRDFLEGVSG\\n>TestProtein2\\nFNCLGMSNRDFLEGVSG")
    >>> p.transfer_to_fasta_list()
    ['>TestProtein1\\nFNCLGMSNRDFLEGVSG', '>TestProtein2\\nFNCLGMSNRDFLEGVSG']
    >>> p=Proteins("TestProtein FNCLGMSNRDFLEGVSG", 'named_space_separated')
    >>> p.sequences
    ['FNCLGMSNRDFLEGVSG']
    >>> p.names
    ['TestProtein']
    >>> p=Proteins([{"name":"LCMV Armstrong, Protein GP","sequence":"MGQIVTMFEALPHIIDEVINIVIIVLIVITGI"}], 'json')
    >>> p.sequences
    ['MGQIVTMFEALPHIIDEVINIVIIVLIVITGI']
    >>> p.names
    ['LCMV Armstrong, Protein GP']
    """

    # Number of residues per line produced by get_protein_seq().
    _BLOCK_LENGTH = 50

    def __init__(self, input_sequences, sequence_format='auto', accept_x=False, accept_dash=False, iupac=False, any_letter=False):
        """
        Parse ``input_sequences`` and populate ``self.sequences`` / ``self.names``.

        :param input_sequences: text (FASTA, space separated or named space
            separated) or a list of ``{"name": ..., "sequence": ...}`` dicts.
            ``None`` creates an empty instance.
        :param sequence_format: one of ``fasta``, ``space_separated``,
            ``named_space_separated``, ``json``; anything empty or starting
            with ``auto`` triggers format detection.
        :param accept_x: additionally allow the character ``X``.
        :param accept_dash: additionally allow the character ``-``.
        :param iupac: use the extended alphabet ``ABCDEFGHIKLMNPQRSTVWXYZ``.
        :param any_letter: allow every letter A-Z (takes precedence over ``iupac``).
        :raises ValueError: if the input is malformed or a sequence contains
            a character outside the selected alphabet.
        """
        self.sequences = []
        self.names = []
        if not sequence_format or sequence_format.startswith('auto'):
            self.sequence_format = 'auto'
        else:
            self.sequence_format = sequence_format
        self.accept_x = accept_x
        self.accept_dash = accept_dash
        self.iupac = iupac
        self.any_letter = any_letter
        if input_sequences is not None:
            logging.debug('input_sequences:%s', input_sequences)
            # JSON-style input arrives as a list, which has no strip().
            if not isinstance(input_sequences, list):
                input_sequences = input_sequences.strip()
            self.extractForm(input_sequences, self.sequence_format)

    def _legal_characters(self):
        """Return the string of characters accepted under the current options."""
        if self.any_letter:
            legal_char = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        elif self.iupac:
            legal_char = "ABCDEFGHIKLMNPQRSTVWXYZ"
        else:
            legal_char = "ACDEFGHIKLMNPQRSTVWY"

        # The two switches are independent: enabling X must not disable '-'.
        if self.accept_x:
            legal_char += "X"
        if self.accept_dash:
            legal_char += "-"
        return legal_char

    def add_protein(self, sequence, name=""):
        """
        Validate one sequence and append it (with its name) to the instance.

        The sequence is stripped and upper-cased before validation. When
        ``name`` is empty, ``"sequence <n>"`` is used, where ``<n>`` is the
        1-based index of the stored sequence.

        :raises ValueError: if the sequence contains a character outside the
            allowed alphabet; the message reports the 0-based position.
        """
        logging.debug('add protein for sequence"%s, name:%s', sequence, name)
        sequence = sequence.strip().upper()
        logging.debug(sequence)
        legal_char = self._legal_characters()
        logging.debug(legal_char)
        for position, amino_acid in enumerate(sequence):
            if amino_acid not in legal_char:
                raise ValueError("Sequence: '%s' contains an invalid character: '%c' at position %d." % (sequence, amino_acid, position))
        self.sequences.append(sequence)
        if not name:
            name = "sequence %d" % (len(self.sequences))
        self.names.append(str(name))
        logging.debug('add protien done')

    def extractFasta(self, fasta):
        """
        Extract every record from a FASTA formatted string.

        Text before the first ``>`` is ignored. Within a record the first
        line is the name and all remaining whitespace-separated chunks are
        concatenated into the sequence.

        :raises ValueError: if there is no ``>`` at all, or a header is not
            followed by a newline.
        """
        logging.debug('extractFasta ...')
        records = fasta.split(">")
        if len(records) < 2:
            raise ValueError("Invalid FASTA format: No '>' found.")
        for record in records[1:]:
            if not record:
                continue
            end_of_name = record.find("\n")
            if end_of_name == -1:
                raise ValueError("Invalid FASTA format: No Protein sequence found between two names.")
            name = record[:end_of_name]
            seq = "".join(record[end_of_name:].split())
            self.add_protein(seq, name)

    def convert_first_seq(self, seq):
        """
        Normalise FASTA-like text so that it always starts with a header.

        Blank lines are dropped, ``\\r\\n`` line breaks are turned into
        ``\\n``, and if the first line does not start with ``>`` the
        temporary header ``>sequence 1`` is prepended.
        """
        non_blank = '\n'.join([x for x in seq.split("\n") if x.strip() != ''])
        lines = non_blank.split('\r\n')
        if not lines[0].startswith('>'):
            lines.insert(0, ">sequence 1")
        return '\n'.join(lines)

    def transfer_to_fasta_list(self):
        """
        Return the stored proteins as a list of ``">name\\nsequence"`` strings.
        """
        return [">" + "\n".join([name, seq]) for (name, seq) in zip(self.names, self.sequences)]

    def extractForm(self, input_sequences, sequence_format):
        """
        Dispatch the input to the parser matching ``sequence_format``.

        Called by ``__init__``. ``sequence_format`` should be one of
        ``fasta``, ``space_separated``, ``named_space_separated``, ``json``
        or ``auto``. With ``auto`` the format is detected: a list is JSON,
        text containing ``>`` is FASTA, text where every line has exactly
        two fields is named-space-separated, anything else is space separated.
        An unrecognised format leaves the instance unchanged.
        """
        logging.debug('extractForm ...')
        if sequence_format == "auto":
            if isinstance(input_sequences, list):
                sequence_format = "json"
            elif ">" in input_sequences:
                # convert the first sequence into FASTA by appending a temporary header
                input_sequences = self.convert_first_seq(input_sequences)
                sequence_format = "fasta"
            else:
                rows = input_sequences.split('\n')
                if all(len(row.strip().split()) == 2 for row in rows):
                    sequence_format = 'named_space_separated'
                else:
                    sequence_format = "space_separated"

        if sequence_format == "fasta":
            self.extractFasta(input_sequences)
        elif sequence_format == "space_separated":
            for seq in input_sequences.split():
                self.add_protein(seq)
        elif sequence_format == "named_space_separated":
            self.extractNamedSpaceSeparated(input_sequences)
        elif sequence_format.lower() == "json":
            # Case-insensitive so both 'json' and 'JSON' select this parser.
            self.extractJSON(input_sequences)

    def all_seq_len_less_than(self, length):
        """
        Return True when every stored sequence is shorter than ``length``.

        A single sequence of at least ``length`` residues makes this False;
        an instance without sequences yields True.
        """
        return all(len(seq) < length for seq in self.sequences)

    def get_protein_seq(self, seq_nums=None):
        """
        Return a list of ``(peptide_index, name, sequence)`` tuples.

        ``peptide_index`` is taken from ``seq_nums`` when given, otherwise it
        is the 1-based position. Sequences longer than 50 residues are
        wrapped with a newline after every 50 characters.
        """
        blocklen = self._BLOCK_LENGTH
        result = []
        for i, (name, sequence) in enumerate(zip(self.names, self.sequences)):
            peptide_index = seq_nums[i] if seq_nums else (i + 1)
            if len(sequence) > blocklen:
                sequence = "\n".join(
                    sequence[start:start + blocklen]
                    for start in range(0, len(sequence), blocklen)
                )
            result.append((peptide_index, name, sequence))
        return result

    def extractJSON(self, seqs_in_json):
        '''
        Replace the stored proteins with those from a list of dicts.

        seqs_in_json = [{"name":"LCMV Armstrong, Protein GP","sequence":"MGQIVTMFEALPHIIDEVINIVIIVLIVITGIK"}]
        Each item must provide both the "sequence" and the "name" key.
        '''
        self.sequences = []
        self.names = []
        for seq in seqs_in_json:
            self.add_protein(seq['sequence'], seq['name'])

    def extractNamedSpaceSeparated(self, named_space_separated_sequences):
        """
        Parse one "name sequence" pair per line.

        The sequence is the last whitespace-delimited field of the row and
        everything before it is the name, so names may contain spaces:
        "LCMV Armstrong, Protein GP MGQIVTMFEALPHIIDEVINIVIIVLIVITGIK"
        Blank rows are skipped.

        :raises ValueError: if a non-blank row does not contain both a name
            and a sequence.
        """
        logging.debug('extractNamedSpaceSeparated ...')
        for row in named_space_separated_sequences.split('\n'):
            row = row.strip()
            if not row:
                continue
            # Split only once from the right: the name may itself hold spaces.
            fields = row.rsplit(None, 1)
            if len(fields) != 2:
                raise ValueError("Invalid named_space_separated format: expected '<name> <sequence>' but got '%s'." % row)
            name, seq = fields
            self.add_protein(seq, name.strip())



class ProteinsWithWarnings():
    """
    Hold a list of protein sequences with their names, collecting problems
    instead of raising.

    Invalid sequences and removed duplicates are reported in ``self.warnings``
    and malformed input in ``self.errors``; offending entries are skipped.
    After construction ``self.input_sequences`` holds the accepted proteins
    re-serialised as FASTA text.

    >>> p=ProteinsWithWarnings(">TestProtein\\nFNCLGMSNRDFLEGVSG")
    >>> p.sequences
    ['FNCLGMSNRDFLEGVSG']
    >>> p.names
    ['TestProtein']
    >>> p=ProteinsWithWarnings(">TestProtein1\\nFNCLGMSNRDFLEGVSG\\n>TestProtein2\\nFNCLGMSNRDFLEGVSA")
    >>> p.transfer_to_fasta_list()
    ['>TestProtein1\\nFNCLGMSNRDFLEGVSG', '>TestProtein2\\nFNCLGMSNRDFLEGVSA']
    >>> p=ProteinsWithWarnings("TestProtein FNCLGMSNRDFLEGVSG", 'named_space_separated')
    >>> p.sequences
    ['FNCLGMSNRDFLEGVSG']
    >>> p.names
    ['TestProtein']
    >>> p=ProteinsWithWarnings([{"name":"LCMV Armstrong, Protein GP","sequence":"MGQIVTMFEALPHIIDEVINIVIIVLIVITGI"}], 'json')
    >>> p.sequences
    ['MGQIVTMFEALPHIIDEVINIVIIVLIVITGI']
    >>> p.names
    ['LCMV Armstrong, Protein GP']
    """

    # Number of residues per line produced by get_protein_seq().
    _BLOCK_LENGTH = 50

    def __init__(self, input_sequences, sequence_format='auto', accept_x=False, accept_dash=False, iupac=False, any_letter=False):
        """
        Parse ``input_sequences`` and populate sequences, names, warnings and errors.

        :param input_sequences: text (FASTA, space separated or named space
            separated) or a list of ``{"name": ..., "sequence": ...}`` dicts.
            Empty / ``None`` input creates an empty instance.
        :param sequence_format: one of ``fasta``, ``space_separated``,
            ``named_space_separated``, ``json``; anything empty or starting
            with ``auto`` triggers format detection.
        :param accept_x: additionally allow the character ``X``.
        :param accept_dash: additionally allow the character ``-``.
        :param iupac: use the extended alphabet ``ABCDEFGHIKLMNPQRSTVWXYZ``.
        :param any_letter: allow every letter A-Z (takes precedence over ``iupac``).
        """
        self.sequences = []
        self.names = []
        self.warnings = []
        self.errors = []
        if not sequence_format or sequence_format.startswith('auto'):
            self.sequence_format = 'auto'
        else:
            self.sequence_format = sequence_format
        self.accept_x = accept_x
        self.accept_dash = accept_dash
        self.iupac = iupac
        self.any_letter = any_letter
        if input_sequences:
            logging.debug('input_sequences:%s', input_sequences)
            # JSON-style input arrives as a list, which has no strip().
            if not isinstance(input_sequences, list):
                input_sequences = input_sequences.strip()
            self.extractForm(input_sequences, self.sequence_format)

        # Canonical FASTA rendering of whatever survived validation.
        self.input_sequences = '\n'.join(['>%s\n%s' % t for t in zip(self.names, self.sequences)])

    def _legal_characters(self):
        """Return the string of characters accepted under the current options."""
        if self.any_letter:
            legal_char = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        elif self.iupac:
            legal_char = "ABCDEFGHIKLMNPQRSTVWXYZ"
        else:
            legal_char = "ACDEFGHIKLMNPQRSTVWY"

        # The two switches are independent: enabling X must not disable '-'.
        if self.accept_x:
            legal_char += "X"
        if self.accept_dash:
            legal_char += "-"
        return legal_char

    def add_protein(self, sequence, name=""):
        """
        Validate one sequence and append it (with its name) to the instance.

        The sequence is stripped and upper-cased before validation. A
        sequence containing a character outside the allowed alphabet is not
        stored; a message with the 0-based position of the first offending
        character is appended to ``self.warnings`` instead. When ``name`` is
        empty, ``"sequence <n>"`` is used, where ``<n>`` is the 1-based index
        of the stored sequence.
        """
        logging.debug('add protein for sequence"%s, name:%s', sequence, name)
        sequence = sequence.strip().upper()
        logging.debug(sequence)
        legal_char = self._legal_characters()
        logging.debug(legal_char)
        for position, amino_acid in enumerate(sequence):
            if amino_acid not in legal_char:
                self.warnings.append("Sequence: '%s' contains an invalid character: '%c' at position %d." % (sequence, amino_acid, position))
                return
        self.sequences.append(sequence)
        if not name:
            name = "sequence %d" % (len(self.sequences))
        self.names.append(str(name))
        logging.debug('add protien done')

    def extractFasta(self, fasta):
        """
        Extract every record from a FASTA formatted string.

        Text before the first ``>`` is ignored. Within a record the first
        line is the name and all remaining whitespace-separated chunks are
        concatenated into the sequence. A missing ``>`` or a header without
        a following newline is recorded in ``self.errors``; such a record is
        skipped rather than parsed.
        """
        logging.debug('extractFasta ...')
        records = fasta.split(">")
        if len(records) < 2:
            self.errors.append("Invalid FASTA format: No '>' found.")
            return
        for record in records[1:]:
            if not record:
                continue
            end_of_name = record.find("\n")
            if end_of_name == -1:
                self.errors.append("Invalid FASTA format: No Protein sequence found between two names.")
                # Without this, slicing with -1 would store the header's last
                # character as a one-residue "sequence".
                continue
            name = record[:end_of_name]
            seq = "".join(record[end_of_name:].split())
            self.add_protein(seq, name)

    def convert_first_seq(self, seq):
        """
        Normalise FASTA-like text so that it always starts with a header.

        Blank lines are dropped, ``\\r\\n`` line breaks are turned into
        ``\\n``, and if the first line does not start with ``>`` the
        temporary header ``>sequence 1`` is prepended.
        """
        non_blank = '\n'.join([x for x in seq.split("\n") if x.strip() != ''])
        lines = non_blank.split('\r\n')
        if not lines[0].startswith('>'):
            lines.insert(0, ">sequence 1")
        return '\n'.join(lines)

    def transfer_to_fasta_list(self):
        """
        Return the stored proteins as a list of ``">name\\nsequence"`` strings.
        """
        return [">" + "\n".join([name, seq]) for (name, seq) in zip(self.names, self.sequences)]

    def extractForm(self, input_sequences, sequence_format):
        """
        Dispatch the input to the parser matching ``sequence_format``.

        Called by ``__init__``. ``sequence_format`` should be one of
        ``fasta``, ``space_separated``, ``named_space_separated``, ``json``
        or ``auto``. With ``auto`` a list is treated as JSON, text containing
        ``>`` as FASTA and anything else as space separated. The resolved
        format is stored in ``self.sequence_format`` and duplicate sequences
        are removed afterwards.
        """
        logging.debug('extractForm ...')
        if sequence_format == "auto":
            if isinstance(input_sequences, list):
                sequence_format = "json"
            elif ">" in input_sequences:
                # convert the first sequence into FASTA by appending a temporary header
                input_sequences = self.convert_first_seq(input_sequences)
                sequence_format = "fasta"
            else:
                sequence_format = "space_separated"
        self.sequence_format = sequence_format

        if sequence_format == "fasta":
            self.extractFasta(input_sequences)
        elif sequence_format == "space_separated":
            for seq in input_sequences.split():
                self.add_protein(seq)
        elif sequence_format == "named_space_separated":
            self.extractNamedSpaceSeparated(input_sequences)
        elif sequence_format.lower() == "json":
            self.extractJSON(input_sequences)
        self.remove_duplicate()

    def remove_duplicate(self):
        """
        Drop repeated sequences, keeping the first occurrence and its name.

        Each removed duplicate is logged and noted in ``self.warnings``.
        """
        seen = set()
        new_names = []
        unique_sequences = []
        for name, seq in zip(self.names, self.sequences):
            if seq in seen:
                logging.info('remove duplicated sequence "%s"', seq)
                self.warnings.append('remove duplicated sequence "%s"' % seq)
            else:
                seen.add(seq)
                new_names.append(name)
                unique_sequences.append(seq)
        self.names = new_names
        self.sequences = unique_sequences

    def all_seq_len_less_than(self, length):
        """
        Return True when every stored sequence is shorter than ``length``.

        A single sequence of at least ``length`` residues makes this False;
        an instance without sequences yields True.
        """
        return all(len(seq) < length for seq in self.sequences)

    def get_protein_seq(self, seq_nums=None):
        """
        Return a list of ``(peptide_index, name, sequence)`` tuples.

        ``peptide_index`` is taken from ``seq_nums`` when given, otherwise it
        is the 1-based position. Sequences longer than 50 residues are
        wrapped with a newline after every 50 characters.
        """
        blocklen = self._BLOCK_LENGTH
        result = []
        for i, (name, sequence) in enumerate(zip(self.names, self.sequences)):
            peptide_index = seq_nums[i] if seq_nums else (i + 1)
            if len(sequence) > blocklen:
                sequence = "\n".join(
                    sequence[start:start + blocklen]
                    for start in range(0, len(sequence), blocklen)
                )
            result.append((peptide_index, name, sequence))
        return result

    def extractJSON(self, seqs_in_json):
        '''
        Replace the stored proteins with those from a list of dicts.

        seqs_in_json = [{"name":"LCMV Armstrong, Protein GP","sequence":"MGQIVTMFEALPHIIDEVINIVIIVLIVITGIK"}]
        Each item must provide both the "sequence" and the "name" key.
        '''
        self.sequences = []
        self.names = []
        for seq in seqs_in_json:
            self.add_protein(seq['sequence'], seq['name'])

    def extractNamedSpaceSeparated(self, named_space_separated_sequences):
        """
        Parse one "name sequence" pair per line.

        The sequence is the last whitespace-delimited field of the row and
        everything before it is the name, so names may contain spaces:
        "LCMV Armstrong, Protein GP MGQIVTMFEALPHIIDEVINIVIIVLIVITGIK"
        Blank rows are skipped. A non-blank row lacking either a name or a
        sequence is recorded in ``self.errors`` and skipped.
        """
        logging.debug('extractNamedSpaceSeparated ...')
        for row in named_space_separated_sequences.split('\n'):
            row = row.strip()
            if not row:
                continue
            # Split only once from the right: the name may itself hold spaces.
            fields = row.rsplit(None, 1)
            if len(fields) != 2:
                self.errors.append("Invalid named_space_separated format: expected '<name> <sequence>' but got '%s'." % row)
                continue
            name, seq = fields
            self.add_protein(seq, name.strip())


