import re
import os
import pandas as pd
from difflib import SequenceMatcher


'''==============================================================================
# DESCRIPTION :
# DTU has a separate file ('allelenames') that contains all valid tool labels
# for the tool 'netmhcpan-4.1'. After careful analysis, we discovered that the
# 'allelenames' file has 1,554 more alleles than Tools_MRO_mapping.xlsx.
#
# Steps that were taken:
# 1. Remove duplicates.
#    There are a total of 1,378 duplicate pairs in the 'allelenames' file.
# 2. Check that each of the remaining 11,276 alleles from 'allelenames' can be
#    mapped to Tools_MRO_mapping.xlsx.
#       - 176 alleles couldn't be mapped to Tools_MRO.
# 3. Try mapping those unmapped alleles to the 'molecules.tsv' file.
#       - 14 alleles were mapped. The other 162 alleles are discarded.
# 
# Ticket: 
# https://gitlab.lji.org/iedb/tools/tools-redesign/ar-redesign-prototype/-/issues/346#note_26421
=============================================================================='''
def clean_label(label):
    """Reduce an allele label to a punctuation-free, lower-case comparison key.

    Underscores and every character that is not a word character are removed,
    then the remainder is lower-cased, so differently punctuated spellings of
    the same allele collapse to one key (e.g. 'HLA-A*02:01' -> 'hlaa0201').
    """
    # One pass: strip non-word characters and underscores together.
    return re.sub(r'[\W_]', '', label).lower()

def change_column_names(input_df):
    """Rename the working column headers to the ones used in the output file.

    The rename is applied in place, and the very same DataFrame object is
    returned so the call can be used in an assignment.
    """
    final_headers = {
        'tool': 'Tool',
        'term': 'Tool Label',
        'MRO name': 'MRO Name',
        'MRO ID': 'MRO ID',
    }
    input_df.rename(columns=final_headers, inplace=True)
    return input_df


def check_for_duplicates(input_df):
    """Print a line for every fully duplicated 'netmhcpan-4.1' row.

    Two rows are duplicates when their tool, term, MRO name and stringified
    MRO ID are all equal. Rows that share an MRO ID but carry a different term
    are treated as distinct. Nothing is returned; the result is only reported
    on stdout.
    """
    netmhcpan_rows = input_df[input_df['tool'] == 'netmhcpan-4.1']
    header = list(netmhcpan_rows.columns)
    seen_keys = set()

    for record in netmhcpan_rows.itertuples(name=None, index=False):
        # The MRO ID is compared as text because some rows hold 'nan' there.
        mro_id = str(record[header.index('MRO ID')])
        tool_name = record[header.index('tool')]
        tool_label = record[header.index('term')]
        mro_name = record[header.index('MRO name')]

        row_key = (tool_name, tool_label, mro_name, mro_id)
        if row_key in seen_keys:
            print("Duplicate found... %s" %(tool_label))
            continue

        seen_keys.add(row_key)



def remove_deprecated_tools(input_df):
    """Drop, in place, every row whose 'tool' is one of the deprecated tools.

    Rows are removed by index label, and the same (now smaller) DataFrame
    object is returned.
    """
    deprecated_tools = (
        'ann-3.4',
        'netmhcpan',
        'arb',
        'comblib',
        'netmhccons',
        'netmhcstabpan',
        'recommended',
        'nn_align',
    )

    # Select all deprecated rows at once, then drop their index labels.
    is_deprecated = input_df['tool'].isin(deprecated_tools)
    input_df.drop(input_df[is_deprecated].index, inplace=True)

    return input_df


def verify_terms(input_df):
    """Report how many 'allelenames' tool labels exist as netmhcpan-4.1 terms.

    Reads 'data/netmhcpan-4.1/allelenames' (relative to the current working
    directory), takes the text before the first space of every non-blank line
    as the tool label, and checks it against the 'term' values of the
    'netmhcpan-4.1' rows in ``input_df``. Labels listed more than once in the
    file are counted once per occurrence.

    Args:
        input_df: DataFrame with at least the columns 'tool' and 'term'.

    Returns:
        None. The two counts are printed to stdout.
    """
    allelenames_tools_labels = []
    with open('data/netmhcpan-4.1/allelenames', 'r') as f:
        for row in f:
            # A blank (typically trailing) line carries no allele.
            if not row.strip():
                continue
            allelenames_tools_labels.append(row.rstrip('\n').split(' ')[0])

    # Extract only the netmhcpan-4.1 terms; a set keeps each membership test
    # O(1) instead of scanning the full term list for every label.
    mro_df = input_df[input_df['tool'] == 'netmhcpan-4.1']
    mro_tools_labels = set(mro_df['term'].to_list())

    verified_alleles = sum(
        1 for label in allelenames_tools_labels if label in mro_tools_labels
    )
    invalid_alleles = len(allelenames_tools_labels) - verified_alleles

    print("# of verified alleles: %s" %(verified_alleles))
    print("# of unknown alleles: %s" %(invalid_alleles))


def add_missing_mapped_alleles(df):
    """Return a new DataFrame with the Mamu-B017:04 mapping appended last.

    Adds the row
        MRO:0001369   Mamu-B017:04    Mamu-B*017:04 protein complex
    for the tool 'netmhcpan-4.1'. ``df`` itself is not modified, and the
    result gets a fresh 0..n-1 index.

    Args:
        df: DataFrame using the columns 'tool', 'term', 'MRO name', 'MRO ID'.

    Returns:
        A new DataFrame holding all rows of ``df`` followed by the added row.
    """
    mamu_b01704_row = {
        'tool': 'netmhcpan-4.1',
        'term': 'Mamu-B017:04',
        'MRO name': 'Mamu-B*017:04 protein complex',
        'MRO ID': 'MRO:0001369',
    }

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to add rows and behaves the same here.
    return pd.concat([df, pd.DataFrame([mamu_b01704_row])], ignore_index=True)


def replace_netmhcpan_4_1_labels(allele_records_df):
    """Rewrite netmhcpan-4.1 terms to the spelling used in the 'allelenames' file.

    Reads 'data/netmhcpan-4.1/allelenames' (relative to the current working
    directory), one "<tool label> <IEDB label>" pair per non-blank line. Every
    'netmhcpan-4.1' row whose cleaned term (see ``clean_label``) also occurs in
    that file gets:
      * 'term'     -> the tool label as written in 'allelenames'
      * 'MRO name' -> '<IEDB label> protein complex', where the IEDB label is
                      the first word of the MRO name of the first Tools MRO row
                      with the same cleaned term, unless that word is 'remove',
                      in which case the label from 'allelenames' is used.
    All other rows are copied through unchanged.

    Args:
        allele_records_df: DataFrame with the columns 'tool', 'term',
            'MRO name' and 'MRO ID'.

    Returns:
        A new DataFrame with exactly the columns 'tool', 'term', 'MRO name'
        and 'MRO ID', one row per input row, in the input order.

    Raises:
        IndexError: a non-blank 'allelenames' line has no second field.
        KeyError: a label duplicated in 'allelenames' has no Tools MRO entry
            to be compared against.
    """
    netmhcpan_tool = 'netmhcpan-4.1'

    # Read allelenames. Blank (typically trailing) lines carry no allele.
    allelenames_tools_labels = []
    allelenames_tool2iedb_mapper = {}
    with open('data/netmhcpan-4.1/allelenames', 'r') as f:
        for row in f:
            if not row.strip():
                continue
            fields = row.split(' ')
            tool_label = fields[0]
            iedb_label = fields[1].strip()
            allelenames_tools_labels.append(tool_label)
            allelenames_tool2iedb_mapper[tool_label] = iedb_label

    mro_df = allele_records_df[allele_records_df['tool'] == netmhcpan_tool]
    mro_tools_labels = mro_df['term'].to_list()
    # Only the first word of the MRO name is the IEDB label
    # ('X protein complex' -> 'X').
    mro_iedb_labels = [name.split(' ')[0] for name in mro_df['MRO name'].to_list()]

    # Index the Tools MRO rows by cleaned label in a single pass:
    #   tools_mro_mapper  : cleaned label -> every tool label with that key
    #   first_iedb_label  : cleaned label -> IEDB label of the FIRST such row
    tools_mro_mapper = {}
    first_iedb_label = {}
    for mro_tools_label, mro_iedb_label in zip(mro_tools_labels, mro_iedb_labels):
        cleaned_tools_label = clean_label(mro_tools_label)
        tools_mro_mapper.setdefault(cleaned_tools_label, []).append(mro_tools_label)
        first_iedb_label.setdefault(cleaned_tools_label, mro_iedb_label)

    # Preserve the IEDB label from the original tools_mro file rather than the
    # one from the allelenames file. The dictionary lookup replaces a scan of
    # every Tools MRO row per allelenames entry and picks the same first match.
    for tool_label in allelenames_tool2iedb_mapper:
        cleaned_tools_label = clean_label(tool_label)
        if cleaned_tools_label not in first_iedb_label:
            continue
        preserved_label = first_iedb_label[cleaned_tools_label]
        if preserved_label != 'remove':
            allelenames_tool2iedb_mapper[tool_label] = preserved_label

    # Mapper from cleaned label to the tool label(s) in allelenames
    # (1-to-many: the file lists some alleles under two spellings).
    allelenames_mapper = {}
    for allelenames_tools_label in allelenames_tools_labels:
        cleaned_tools_label = clean_label(allelenames_tools_label)
        allelenames_mapper.setdefault(cleaned_tools_label, []).append(
            allelenames_tools_label
        )

    # Handle duplicate tool labels in the allelenames file: of the first two
    # candidates, drop the one that looks less like the Tools MRO spelling.
    # On a tie the first candidate is kept.
    for k, v in allelenames_mapper.items():
        if len(v) > 1:
            candidate_allele1 = v[0]
            candidate_allele2 = v[1]
            target_allele = tools_mro_mapper[k][0]

            similarity1 = SequenceMatcher(None, candidate_allele1, target_allele).ratio()
            similarity2 = SequenceMatcher(None, candidate_allele2, target_allele).ratio()

            if similarity1 < similarity2:
                v.remove(candidate_allele1)
            else:
                v.remove(candidate_allele2)

    # Column positions are fixed for the whole frame, so look them up once.
    mro_header = list(allele_records_df.columns)
    tool_pos = mro_header.index('tool')
    term_pos = mro_header.index('term')
    mro_name_pos = mro_header.index('MRO name')
    mro_id_pos = mro_header.index('MRO ID')

    updated_tool_col = []
    updated_term_col = []
    updated_mro_name_col = []
    updated_mro_id_col = []

    for row in allele_records_df.itertuples(name=None, index=False):
        tool = row[tool_pos]
        term = row[term_pos]
        mro_name = row[mro_name_pos]
        mro_id = row[mro_id_pos]

        if tool == netmhcpan_tool:
            clean_term = clean_label(term)
            if clean_term in allelenames_mapper:
                term = allelenames_mapper[clean_term][0]
                mro_name = allelenames_tool2iedb_mapper[term] + ' protein complex'

        updated_tool_col.append(tool)
        updated_term_col.append(term)
        updated_mro_name_col.append(mro_name)
        updated_mro_id_col.append(mro_id)

    # Populate the output data as a dictionary, then turn it back into a DataFrame.
    output_data = {
        'tool': updated_tool_col,
        'term': updated_term_col,
        'MRO name': updated_mro_name_col,
        'MRO ID': updated_mro_id_col,
    }

    return pd.DataFrame.from_dict(output_data)


def add_allelesnames_to_tools_mapping():
    """Append 'allelenames' alleles that only 'mro_molecules.tsv' can map.

    Reads three files relative to the current working directory:
      * 'data/netmhcpan-4.1/allelenames' - the text before the first space of
        every non-blank line is taken as a tool label
      * 'data/Tools_MRO_mapping.xlsx'    - the existing tool -> MRO mapping
      * 'data/mro_molecules.tsv'         - uses the columns 'Label' and 'MRO ID'

    Labels are compared through ``clean_label``. An allelenames label with no
    netmhcpan-4.1 entry in the Tools MRO mapping is looked up among the
    molecule labels (with 'protein complex' removed). On a hit, a new
    'netmhcpan-4.1' row carrying the molecule's label and MRO ID is created.
    The number of labels missing from Tools MRO and the number still unmapped
    afterwards are printed to stdout.

    Returns:
        The full content of 'Tools_MRO_mapping.xlsx' followed by the newly
        created rows. Index labels of the two parts are kept as they are.

    Raises:
        KeyError: a label duplicated in 'allelenames' has no Tools MRO entry
            to be compared against, or an expected column is missing.
    """
    netmhcpan_tool = 'netmhcpan-4.1'

    # Read allelenames. Blank (typically trailing) lines carry no allele.
    allelenames_tools_labels = []
    with open('data/netmhcpan-4.1/allelenames', 'r') as f:
        for row in f:
            if not row.strip():
                continue
            allelenames_tools_labels.append(row.rstrip('\n').split(' ')[0])

    # Read from Tools MRO file and extract only netmhcpan4.1 alleles
    orig_mro_df = pd.read_excel('data/Tools_MRO_mapping.xlsx', engine='openpyxl')
    mro_df = orig_mro_df[orig_mro_df['tool'] == netmhcpan_tool]
    mro_tools_labels = mro_df['term'].to_list()

    # Mapper from cleaned label to the tool label(s) in Tools MRO.
    tools_mro_mapper = {}
    for mro_tools_label in mro_tools_labels:
        tools_mro_mapper.setdefault(clean_label(mro_tools_label), []).append(
            mro_tools_label
        )

    # Mapper from cleaned label to the tool label(s) in allelenames
    # (1-to-many: the file lists some alleles under two spellings).
    allelenames_mapper = {}
    for allelenames_tools_label in allelenames_tools_labels:
        allelenames_mapper.setdefault(
            clean_label(allelenames_tools_label), []
        ).append(allelenames_tools_label)

    # Handle duplicate tool labels in the allelenames file: of the first two
    # candidates, drop the one that looks less like the Tools MRO spelling.
    # On a tie the first candidate is kept.
    for k, v in allelenames_mapper.items():
        if len(v) > 1:
            candidate_allele1 = v[0]
            candidate_allele2 = v[1]
            target_allele = tools_mro_mapper[k][0]

            similarity1 = SequenceMatcher(None, candidate_allele1, target_allele).ratio()
            similarity2 = SequenceMatcher(None, candidate_allele2, target_allele).ratio()

            if similarity1 < similarity2:
                v.remove(candidate_allele1)
            else:
                v.remove(candidate_allele2)

    # Compare allelenames against Tools MRO: collect every allelenames label
    # whose cleaned form has no Tools MRO counterpart.
    unmapped_alleles_from_tools_mro = [
        v[0] for k, v in allelenames_mapper.items() if k not in tools_mro_mapper
    ]

    print("Number of alleles that aren't mapped to Tools MRO: %s" %(len(unmapped_alleles_from_tools_mro)))

    # From the leftover (unmapped) alleles, compare them to the molecules file
    # to see if any of them can be mapped to an MRO ID.
    mol_df = pd.read_csv('data/mro_molecules.tsv', skipinitialspace=True, sep='\t')
    mol_tool_labels = [
        _.replace('protein complex', '').strip() for _ in mol_df['Label'].to_list()
    ]

    # Mapper from cleaned label to the molecule label(s).
    molecules_mapper = {}
    for mol_tool_label in mol_tool_labels:
        molecules_mapper.setdefault(clean_label(mol_tool_label), []).append(
            mol_tool_label
        )

    # Compare unmapped alleles to the mapper for the mro_molecules.
    unmapped_alleles_final = []
    records = []
    for unmapped_allele in unmapped_alleles_from_tools_mro:
        clean_unmapped_allele = clean_label(unmapped_allele)

        # No molecule shares this cleaned label: the allele stays unmapped.
        if clean_unmapped_allele not in molecules_mapper:
            unmapped_alleles_final.append((clean_unmapped_allele, unmapped_allele))
            continue

        mapped_allele_from_mol = molecules_mapper[clean_unmapped_allele][0]
        mapped_mro_name = mapped_allele_from_mol + ' protein complex'
        matching_rows = mol_df.loc[mol_df['Label'] == mapped_mro_name].to_dict('records')

        # The rebuilt '<label> protein complex' string must match a molecule
        # label exactly; otherwise there is no MRO ID to take over.
        if not matching_rows:
            unmapped_alleles_final.append((clean_unmapped_allele, unmapped_allele))
            continue

        # Now that the allele is confirmed to exist in the molecules file, use
        # the corresponding spelling from the allelenames file as the term.
        mapped_allele = allelenames_mapper[clean_label(mapped_allele_from_mol)][0]

        records.append({
            'tool': netmhcpan_tool,
            'term': mapped_allele,
            'MRO name': mapped_mro_name,
            'MRO ID': matching_rows[0]['MRO ID'],
        })

    print('Number of alleles that still are not mapped to either Tools MRO or Molecules.tsv: %s' %(len(unmapped_alleles_final)))

    # Alleles that need to be appended to the original dataframe from the excel file.
    records_df = pd.DataFrame(records, index=None)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement and, like append's default, keeps the existing index labels.
    return pd.concat([orig_mro_df, records_df])



def main():
    """Build 'data/Tools_MRO_Mapping_VFYD.xlsx' from the raw mapping inputs.

    Runs the pipeline steps in order: add the alleles found only in the
    molecules file, switch netmhcpan-4.1 terms to the allelenames spelling,
    add the late-mapped Mamu-B017:04 row, print verification and duplicate
    reports, drop deprecated tools, rename the columns and write the result
    to the sheet 'all' of the output workbook.
    """
    # STEP 1: Adding 14 alleles to Tools_Mapping.
    mapping_df = add_allelesnames_to_tools_mapping()

    # STEP 2: Replace 'term' from Tools_Mapping with labels from the
    # allelenames file.
    mapping_df = replace_netmhcpan_4_1_labels(mapping_df)

    # STEP 2a: Add missing alleles.
    # As of July 12. 2023, Mamu-B017:04 has finally mapped to MRO ID.
    mapping_df = add_missing_mapped_alleles(mapping_df)

    # ANALYSIS: Find out how many alleles from the allelenames file are not
    # mapped in the Tools Mapping / molecules file.
    verify_terms(mapping_df)

    # STEP 3: Remove alleles that belong to older tools that are going to be
    # deprecated for the new site.
    finalized_allele_df = remove_deprecated_tools(mapping_df)
    print('Available alleles after removing deprecated tools: %s' %(len(finalized_allele_df)))

    # ANALYSIS: Check for duplicates.
    check_for_duplicates(finalized_allele_df)

    # STEP 4: Post-processing (rename columns).
    pproc_allele_df = change_column_names(finalized_allele_df)

    # STEP 5: Write output to excel file, replacing any previous output.
    output_path = 'data/Tools_MRO_Mapping_VFYD.xlsx'
    if os.path.exists(output_path):
        os.remove(output_path)

    pproc_allele_df.to_excel(output_path, engine='openpyxl', sheet_name='all', index=False)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()