from copy import deepcopy

# A map of dotted column keys to their display names, source, sort order,
# description, etc.
#
# Every entry carries the keys "name", "display_name", "type", "source",
# "sort_order", "row_sort_priority", "default_order", "description" and
# "hidden".  Entries of type "float" additionally carry "number_of_digits",
# and an entry may carry "display_length".

BASE_COLUMN_MAP = {
    # core
    "core.sequence_number": { "name": "sequence_number", "display_name":"seq #", "type": "int", "source": "core", "sort_order": 0, "row_sort_priority": 5, "default_order": "ascending", "description": "Index of the input sequence among all input sequences.", "hidden": False},
    "core.sequence_name": { "name": "sequence_name", "display_name":"sequence name", "type": "text", "source": "core", "sort_order": 0, "row_sort_priority": None, "default_order": None, "description": "Name of the input sequence as supplied by the user in FASTA, JSON, or named space-separated formats.  If no name is supplied, a serial number is assigned.", "hidden": False},
    "core.sequence": { "name": "sequence", "display_name":"sequence", "type": "text", "source": "core", "sort_order": 0, "row_sort_priority": None, "default_order": None, "description": "Amino acid sequence of the input sequence.", "hidden": False},
    "core.peptide": { "name": "peptide", "display_name":"peptide", "type": "text", "source": "core", "sort_order": 3, "row_sort_priority": None, "default_order": None, "description": "Peptide sequence", "hidden": False },
    "core.length": { "name": "length", "display_name":"peptide length", "type": "int", "source": "core", "sort_order": 4, "row_sort_priority": None, "default_order": None, "description": "Peptide sequence length", "hidden": False },
    "core.start": { "name": "start", "display_name":"start", "type": "int", "source": "core", "sort_order": 1, "row_sort_priority": None, "default_order": None, "description": "Peptide sequence start within the context of the input sequence", "hidden": False },
    "core.end": { "name": "end", "display_name":"end", "type": "int", "source": "core", "sort_order": 2, "row_sort_priority": None, "default_order": None, "description": "Peptide sequence end within the context of the input sequence", "hidden": False },
    "core.allele": { "name": "allele", "display_name":"allele", "type": "text", "source": "core", "sort_order": 5, "row_sort_priority": None, "default_order": None, "description": "MHC allele used in the prediction", "hidden": False},
    "core.peptide_index": { "name": "peptide_index", "display_name":"peptide index", "type": "int", "source": "core", "sort_order": 6, "row_sort_priority": None, "default_order": None, "description": "Serial number of the peptide among all peptides", "hidden": True},
    # binding - overall
    # These columns come from source 'binding' and are not tied to a specific
    # method: there is 1 instance of each regardless of how many methods
    # are selected.
    "binding.median_percentile": { "name": "median_percentile", "display_name": "median binding percentile", "type": "float", "source": "binding/elution", "sort_order": 2, "row_sort_priority":0, "default_order": "ascending", "number_of_digits": None, "description": "The median percentile rank of binding/elution predictions", "hidden": False },

    # binding - method-specific
    # This set of columns (or a subset thereof) is created for each binding
    # method that is selected.
    "binding.score": { "name": "score", "display_name": "score", "type": "float", "source": "binding/elution", "sort_order": 0, "row_sort_priority": 2, "default_order": "descending", "number_of_digits": 4, "description": "binding/elution prediction score which indicates binding affinity", "hidden": False },
    "binding.ic50": { "name": "ic50", "display_name": "IC50", "type": "float", "source": "binding/elution", "sort_order": 1, "row_sort_priority": 3, "default_order": "ascending", "number_of_digits": 2, "description": "Measured in (nM). Lower number indicates higher affinity.", "hidden": False },
    "binding.percentile": { "name": "percentile", "display_name": "percentile", "type": "float", "source": "binding/elution", "sort_order": 1, "row_sort_priority": None, "default_order": "ascending", "number_of_digits": None, "description": "The percentile rank generated by comparing the peptide's IC50/score against those of a set of random peptides from SWISSPROT database", "hidden": False },
    "binding.core": { "name": "core", "display_name":"core", "type": "text", "source": "binding/elution", "sort_order": 3, "row_sort_priority": None, "default_order": None, "description": "Always 9 amino acids long sequence. It's a construction used for sequence alignment and identification of binding anchors.", "hidden": True },
    "binding.icore": { "name": "icore", "display_name":"icore", "type": "text", "source": "binding/elution", "sort_order": 3, "row_sort_priority": None, "default_order": None, "description": "Substring of peptide that encompasses all residues between P1 and P-omega of the MHC.", "hidden": True },
    # NOTE: consensus_percentile is listed in binding_summary_fields below, so
    # it is a summary column and is not replicated per method.
    "binding.consensus_percentile": { "name": "consensus_percentile", "display_name": "consensus percentile", "type": "float", "source": "binding/elution", "sort_order": 2, "row_sort_priority":1, "default_order": "ascending", "number_of_digits": None, "description": "The median percentile rank of the binding predictions 'ann', 'smm', and 'comblib_sidney2008'", "hidden": False },
    # immunogenicity
    "immunogenicity.score": { "name": "score", "display_name":"immunogenicity score", "type": "float", "source": "immunogenicity", "sort_order": 0, "row_sort_priority": 2, "default_order": "descending", "number_of_digits": 5, "description": "Scores greater than 0 indicate a higher probability of being immunogenic than non-immunogenic, while scores less than 0 indicate the opposite.  The greater the distance from 0, the higher the certainty.", "hidden": False },
    # netchop
    'processing.netchop.prediction_score': { "name": "prediction_score", "display_name":"NetChop Predictions score", "type": "float", "source": "processing.netchop", "sort_order": 2, "row_sort_priority": None, "default_order": None, "number_of_digits": 6, "description": "NetChop prediction score", "hidden": False },
    # per-residue core columns (presumably grouped here because they
    # accompany the NetChop output - TODO confirm)
    "core.amino_acid": { "name": "amino_acid", "display_name":"amino acid", "type": "text", "source": "core", "sort_order": 3, "row_sort_priority": None, "default_order": None, "description": "amino acid residue", "hidden": False },
    "core.position": { "name": "position", "display_name":"position", "type": "int", "source": "core", "sort_order": 1, "row_sort_priority": None, "default_order": None, "description": "Position of the amino acid residue within the input sequence", "hidden": False },
    # netctl
    'processing.netctl.supertype': { "name": "supertype", "display_name":"NetCTL Supertype", "type": "text", "source": "processing.netctl", "sort_order": 3, "row_sort_priority": None, "default_order": None, "description": "Supertype of MHC allele", "hidden": True },
    'processing.netctl.c_terminal_cleavage_affinity': { "name": "c_terminal_cleavage_affinity", "display_name":"NetCTL c terminal cleavage affinity", "type": "float", "source": "processing.netctl", "sort_order": 0, "row_sort_priority": None, "default_order": None, "number_of_digits": 4, "description": " Predicted proteasomal cleavage score", "hidden": False },
    'processing.netctl.tap_transport_efficiency': { "name": "tap_transport_efficiency", "display_name":"NetCTL TAP transport efficiency", "type": "float", "source": "processing.netctl", "sort_order": 1, "row_sort_priority": None, "default_order": None, "number_of_digits": 4, "description": "Predicted TAP transport efficiency", "hidden": False },
    'processing.netctl.predictions_score': { "name": "predictions_score", "display_name":"NetCTL Prediction score", "type": "float", "source": "processing.netctl", "sort_order": 2, "row_sort_priority": None, "default_order": None, "number_of_digits": 4, "description": "Overall prediction score", "hidden": False },
    'processing.netctl.predicted_mhc_binding_affinity': { "name": "predicted_mhc_binding_affinity", "display_name":"NetCTL Predicted MHC binding affinity", "type": "float", "source": "processing.netctl", "sort_order": 4, "row_sort_priority": 2, "default_order": "descending", "number_of_digits": 4, "description": "The value is given as 1 - log50k(aff), where log50k is the logarithm with base 50.000, and aff is the affinity in nM units", "hidden": False },
    'processing.netctl.rescale_binding_affinity': { "name": "rescale_binding_affinity", "display_name":"NetCTL Rescaled binding affinity", "type": "float", "source": "processing.netctl", "sort_order": 3, "row_sort_priority": None, "default_order": None, "number_of_digits": 4, "description": "The predicted binding affinity is normalized by the 1st percentile score", "hidden": False },
    # netctlpan
    'processing.netctlpan.cleavage_prediction_score': { "name": "cleavage_prediction_score", "display_name":"NetCTLpan cleavage prediction score", "type": "float", "source": "processing.netctlpan", "sort_order": 0, "row_sort_priority": None, "default_order": None, "number_of_digits": 5, "description": "Predicted proteasomal cleavage score", "hidden": False },
    'processing.netctlpan.tap_prediction_score': { "name": "tap_prediction_score", "display_name":"NetCTLpan tap score", "type": "float", "source": "processing.netctlpan", "sort_order": 1, "row_sort_priority": None, "default_order": None, "number_of_digits": 5, "description": "The TAP score estimates an effective -log(IC50) values for the binding to TAP of a peptide or its N-terminal prolonged precursors.", "hidden": False },
    'processing.netctlpan.mhc_prediction': { "name": "mhc_prediction", "display_name":"NetCTLpan MHC score", "type": "float", "source": "processing.netctlpan", "sort_order": 2, "row_sort_priority": None, "default_order": None, "number_of_digits": 5, "description": "The MHC binding prediction is identical to the Class-I. And the output is -log(IC50) values.", "hidden": False },
    'processing.netctlpan.combined_prediction_score': { "name": "combined_prediction_score", "display_name":"NetCTLpan combined score", "type": "float", "source": "processing.netctlpan", "sort_order": 4, "row_sort_priority": 2, "default_order": "descending", "number_of_digits": 5, "description": "This score combines the proteasomal cleavage, TAP transport and MHC binding predictions.", "hidden": False },
    'processing.netctlpan.percentile_rank': { "name": "percentile_rank", "display_name":"NetCTLpan percentile rank", "type": "float", "source": "processing.netctlpan", "sort_order": 3, "row_sort_priority": None, "default_order": None, "number_of_digits": 2, "description": "Percentile Rank of prediction score to a set of 1000 random natural 9mer peptides", "hidden": False },
    # mhcnp
    "processing.mhcnp.score": { "name": "score", "display_name":"MHC-NP Prob score", "type": "float", "source": "processing.mhcnp", "sort_order": 0, "row_sort_priority": 2, "default_order": "descending", "number_of_digits": 5, "description": "higher the score, more likely to be naturally processed", "hidden": False },
    # basic_processing
    "processing.basic_processing.proteasome_score": { "name": "proteasome_score", "display_name":"proteasome score", "type": "float", "source": "processing.basic_processing", "sort_order": 0, "row_sort_priority": None, "default_order": None, "number_of_digits": 2, "description": "The scores can be interpreted as logarithms of the total amount of cleavage site usage liberating the peptide C-terminus.", "hidden": False },
    "processing.basic_processing.tap_score": { "name": "tap_score", "display_name":"tap score", "type": "float", "source": "processing.basic_processing", "sort_order": 1, "row_sort_priority": None, "default_order": None, "number_of_digits": 2, "description": "The TAP score estimates an effective -log(IC50) values for the binding to TAP of a peptide or its N-terminal prolonged precursors.", "hidden": False },
    "processing.basic_processing.mhc_score": { "name": "mhc_score", "display_name":"mhc score", "type": "float", "source": "processing.basic_processing", "sort_order": 2, "row_sort_priority": None, "default_order": None, "number_of_digits": 2, "description": "The -log(IC50) using the selected MHC binding method.", "hidden": False },
    "processing.basic_processing.processing_score": { "name": "processing_score", "display_name":"processing score", "type": "float", "source": "processing.basic_processing", "sort_order": 3, "row_sort_priority": None, "default_order": None, "number_of_digits": 2, "description": "This score combines the proteasomal cleavage and TAP transport predictions.", "hidden": False },
    "processing.basic_processing.total_score": { "name": "total_score", "display_name":"processing total score", "type": "float", "source": "processing.basic_processing", "sort_order": 4, "row_sort_priority": 2, "default_order": "descending", "number_of_digits": 2, "description": "Sum of the MHC and Processing scores", "hidden": False },

    # pepmatch
    "pepmatch.peptide": { "name": "peptide", "display_name":"Input Sequence", "type": "text", "source": "pepmatch", "sort_order": 3, "row_sort_priority": None, "default_order": None, "description": "Input/query peptide sequence", "hidden": False },
    "pepmatch.matched_sequence": { "name": "matched_sequence", "display_name":"Matched Sequence", "type": "text", "source": "pepmatch", "sort_order": 3, "row_sort_priority": None, "default_order": None, "description": "Peptide sequence from the selected proteome that matches the input sequence with the currently selected parameter set", "hidden": False },
    "pepmatch.taxon_id": { "name": "taxon_id", "display_name":"Taxon ID", "type": "int", "source": "pepmatch", "sort_order": 4, "row_sort_priority": None, "default_order": None, "description": "NCBI Taxonomy ID of the proteome database", "hidden": True },
    "pepmatch.species": { "name": "species", "display_name":"Species", "type": "text", "source": "pepmatch", "sort_order": 4, "row_sort_priority": None, "default_order": None, "description": "Species name of the proteome database", "hidden": True },
    "pepmatch.gene": { "name": "gene", "display_name":"Gene", "type": "text", "source": "pepmatch", "sort_order": 4, "row_sort_priority": None, "default_order": None, "description": "Gene symbol corresponding to the matched protein in the proteome", "hidden": False },
    "pepmatch.protein_id": { "name": "protein_id", "display_name":"Protein ID", "type": "text", "source": "pepmatch", "sort_order": 1, "row_sort_priority": None, "default_order": None, "description": "Unique identifier/accession of the protein containing the matched peptide", "hidden": False },
    "pepmatch.protein_name": { "name": "protein_name", "display_name":"Protein Name", "type": "text", "source": "pepmatch", "sort_order": 1, "row_sort_priority": None, "default_order": None, "description": "Name of the protein containing the matched peptide", "hidden": False, "display_length": 30 },
    "pepmatch.mismatches": { "name": "mismatches", "display_name":"Mismatches", "type": "int", "source": "pepmatch", "sort_order": 1, "row_sort_priority": 0, "default_order": 'ascending', "description": "The number of substitutions between the input peptide and the matched peptide", "hidden": False },
    "pepmatch.mutated_positions": { "name": "mutated_positions", "display_name":"Mutated Positions", "type": "text", "source": "pepmatch", "sort_order": 1, "row_sort_priority": None, "default_order": None, "description": "The positions of the mismatches between the input and matched peptides", "hidden": False },
    "pepmatch.index_start": { "name": "index_start", "display_name":"Index start", "type": "int", "source": "pepmatch", "sort_order": 1, "row_sort_priority": None, "default_order": None, "description": "The start position of the matched peptide within the full-length protein sequence", "hidden": True },
    "pepmatch.index_end": { "name": "index_end", "display_name":"Index end", "type": "int", "source": "pepmatch", "sort_order": 2, "row_sort_priority": None, "default_order": None, "description": "The end position of the matched peptide within the full-length protein sequence", "hidden": True },
    "pepmatch.protein_existence_level": { "name": "protein_existence_level", "display_name":"Protein Existence Level", "type": "int", "source": "pepmatch", "sort_order": 5, "row_sort_priority": None, "default_order": None, "description": "Evidence level that the protein exists", "hidden": True},

    # cluster
    "cluster.cluster_number": { "name": "cluster_number", "display_name":"Cluster.Sub-Cluster Number", "type": "text", "source": "cluster", "sort_order": 1, "row_sort_priority": 1, "default_order": "ascending", "description": "Serial number of the cluster (parental cluster) and subcluster, separated by a period.", "hidden": False},
    "cluster.peptide_number": { "name": "peptide_number", "display_name":"Peptide Number", "type": "text", "source": "cluster", "sort_order": 2, "row_sort_priority": None, "default_order": None, "description": "Serial number of the peptide within the cluster, starting with 1 at the N terminus of the cluster.", "hidden": False},
    "cluster.alignment": { "name": "alignment", "display_name":"Alignment", "type": "text", "source": "cluster", "sort_order": 3, "row_sort_priority": None, "default_order": None, "description": "Alignment of the peptide to the cluster", "hidden": False },
    "cluster.position": { "name": "position", "display_name":"Position", "type": "text", "source": "cluster", "sort_order": 4, "row_sort_priority": None, "default_order": None, "description": "Position of the peptide within the alignment/consensus of the cluster", "hidden": False },
    "cluster.sequence_number": { "name": "sequence_number", "display_name":"Input seq id", "type": "text", "source": "cluster", "sort_order": 5, "row_sort_priority": None, "default_order": "ascending", "description": "Index of the peptide among the input sequences", "hidden": False},
    "cluster.peptide": { "name": "peptide", "display_name":"Peptide", "type": "text", "source": "cluster", "sort_order": 6, "row_sort_priority": None, "default_order": None, "description": "Peptide sequence.", "hidden": False },
    "cluster.cluster_consensus": { "name": "cluster_consensus", "display_name":"Cluster Consensus", "type": "text", "source": "cluster", "sort_order": 7, "row_sort_priority": None, "default_order": None, "description": "Consensus sequence of the cluster, built from the alignment.", "hidden": False },
    "cluster.clique_number": { "name": "clique_number", "display_name":"Cliques Number", "type": "int", "source": "cluster", "sort_order": 1, "row_sort_priority": None, "default_order": None, "description": "Serial number of the clique", "hidden": False},

    # allele_distances_table
    "allele_distances.input_allele": { "name": "input_allele", "display_name":"Input Allele", "type": "text", "source": "allele_distances", "sort_order": 5, "row_sort_priority": None, "default_order": None, "description": "the predicted allele", "hidden": False},
    "allele_distances.closest_allele": { "name": "closest_allele", "display_name":"Closest Allele", "type": "text", "source": "allele_distances", "sort_order": 5, "row_sort_priority": None, "default_order": None, "description": " its nearest neighbor in the training set", "hidden": False},
    # NOTE(review): declared as "int", yet the description mentions a 0.1
    # threshold - the values may actually be floats; confirm before changing.
    "allele_distances.allele_distances": { "name": "allele_distances", "display_name":"Distance", "type": "int", "source": "allele_distances", "sort_order": 6, "row_sort_priority": None, "default_order": None, "description": " Alleles with lower distances to the training set will have more accurate predictions. A distance of 0 indicates a perfect match between alleles and values at or below 0.1 is considered acceptable for generating accurate predictions.", "hidden": False},

}

# Binding methods: the method-specific binding columns are created once
# for each entry of this list.
# TODO: this should be pulled from some central source
binding_methods = [
    'ann',
    'comblib_sidney2008',
    'consensus',
    'netmhcpan_ba',
    'netmhcpan_el',
    'smm',
    'smmpmbec',
    'mhcnp',
    'mhcflurry',
]

# Binding summary columns: these do not need to be copied into each method.
binding_summary_fields = [
    'binding.median_percentile',
    'binding.consensus_percentile',
]

# Suffixes used to derive the peptide binding comparison columns.
peptide_binding_comparison_suffixes = [
    'peptideA',
    'peptideB',
    'peptideA_minusB',
]

# Build the final column map that the rest of the module uses.
#
# Method-specific binding columns are expanded into one entry per binding
# method; every other column is carried over as-is (the same dict object,
# not a copy).  A separate dict is filled in because a dict cannot change
# size while it is being iterated over.
COLUMN_MAP = {}

for base_key, base_info in BASE_COLUMN_MAP.items():
    per_method = (
        base_key.startswith('binding.')
        and base_key not in binding_summary_fields
    )
    if not per_method:
        COLUMN_MAP[base_key] = base_info
        continue

    # 'binding.<rest>' becomes 'binding.<method>.<rest>'
    head, _, tail = base_key.partition('.')
    for method in binding_methods:
        # Work on a deep copy so the base entry is left untouched.
        method_info = deepcopy(base_info)
        method_info['display_name'] = f"{method} {base_info['display_name']}"
        method_info['source'] = f"{method_info['source']}.{method}"
        COLUMN_MAP[f"{head}.{method}.{tail}"] = method_info

# Derive the peptide binding comparison columns: every core, binding and
# immunogenicity column gets one extra entry per comparison suffix.
# A snapshot of the items is iterated because COLUMN_MAP grows in the loop.
for col_key, col_info in list(COLUMN_MAP.items()):
    if not col_key.startswith(('core.', 'binding.', 'immunogenicity.')):
        continue

    is_percentile = 'percentile' in col_key

    for suffix in peptide_binding_comparison_suffixes:
        tag = suffix.lower()
        derived = deepcopy(col_info)
        derived['name'] = f"{derived['name']}-{tag}"

        # Prefix the display name with the comparison label, then collapse
        # the redundant "peptideA peptide" / "peptideB peptide" wording.
        label = suffix.replace('peptideA_minusB', 'difference (peptideA - peptideB)')
        display = f"{label} {col_info['display_name']}"
        display = display.replace('peptideA peptide', 'peptideA')
        display = display.replace('peptideB peptide', 'peptideB')
        derived['display_name'] = display

        # Difference columns use 2 digits for percentiles and 4 otherwise.
        if suffix == 'peptideA_minusB':
            derived['number_of_digits'] = 2 if is_percentile else 4

        # Percentile and length columns are hidden and excluded from the
        # row sort priority.
        if is_percentile or 'length' in col_key:
            derived['hidden'] = True
            derived['row_sort_priority'] = None

        COLUMN_MAP[f"{col_key}-{tag}"] = derived


def get_column_info(column_name):
    """Return the metadata dictionary for a column.

    The lookup is case-insensitive: ``column_name`` is lower-cased before
    it is looked up in ``COLUMN_MAP``.

    Args:
        column_name: Column key as a string.

    Returns:
        The ``COLUMN_MAP`` entry for the column when it is known.  This is
        the stored dict itself, not a copy.  For an unknown column a new
        placeholder dict is returned, with the lower-cased key as ``name``,
        the original ``column_name`` as ``display_name``, type ``"text"``
        and source ``"unknown"``.

    Warns:
        UserWarning: If the column is not present in ``COLUMN_MAP``.
    """
    # Imported locally so this function does not depend on the module's
    # top-level import block.
    import warnings

    lower_name = column_name.lower()

    known = COLUMN_MAP.get(lower_name)
    if known is not None:
        return known

    # Report through the warnings machinery rather than print(), so the
    # diagnostic does not end up on stdout and callers can filter it.
    warnings.warn("Column name not found: " + lower_name, stacklevel=2)
    return {
        "name": lower_name,
        "display_name": column_name,
        "type": "text",
        "source": "unknown",
        "sort_order": None,
        "row_sort_priority": None,
        "default_order": None,
        "description": "N/A",
        "hidden": False,
    }
