import sys
import time
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# ===============================================================================================
# Module-level configuration.
# ===============================================================================================
# Directory named 'data' that sits next to this script; every input and output file lives here.
DATA_DIR = f"{Path(__file__).parent}/data"

# Input: tools-mapping table (read for its 'MRO ID', 'Tool Group' and 'Tool Label' columns).
TOOLS_MAPPING_FILE = 'tools-mapping.tsv'

# Input: molecule table (read for 'MRO ID', 'IEDB Label', 'Synonyms', 'In Taxon', 'Parent').
MOLECULE_FILE = 'mro_molecules.tsv'

# Output (and later input): merged allele table written by this script.
MHC_ALLELES_FILE = 'mhc_alleles.tsv'


def create_autocomplete_datasource():
    '''===============================================================================================
        Description :
          Build the datasource file used by the allele autocomplete.

          Reads 'mhc_alleles.tsv' from DATA_DIR and, for every row whose
          'Predictor Availability' is not 0, emits:
            - one row for the 'IEDB Label' itself ('Is Label' = 1) whose 'Complement'
              is the raw '|'-separated synonym string of that row, and
            - one row per synonym ('Is Label' = 0) whose 'Complement' is the
              'IEDB Label' the synonym belongs to.
          Afterwards the 'Tool Group' column is dropped from 'mhc_alleles.tsv' and
          that file is rewritten in place.

        Parameters :
          - None

        Return Value :
          - None. Writes 'allele_datasource.tsv' and rewrites 'mhc_alleles.tsv',
            both inside DATA_DIR.
    ==============================================================================================='''
    alleles_path = '{}/{}'.format(DATA_DIR, MHC_ALLELES_FILE)
    output_columns = ["Alleles", "Complement", "Species", "Tool Group", "Is Label"]

    mol_df = pd.read_csv(alleles_path, skipinitialspace=True, sep='\t', encoding='utf-8')
    # Missing synonyms become "" so they can be tested for truthiness and split safely.
    mol_df["Synonyms"] = mol_df["Synonyms"].fillna("")

    # Resolve the column positions once instead of calling list.index() for every row.
    mol_df_headers = list(mol_df.columns)
    availability_idx = mol_df_headers.index('Predictor Availability')
    synonyms_idx = mol_df_headers.index('Synonyms')
    label_idx = mol_df_headers.index('IEDB Label')
    taxon_idx = mol_df_headers.index('In Taxon')
    tool_group_idx = mol_df_headers.index('Tool Group')

    # Collect plain dicts and build the DataFrame once at the end: growing a DataFrame
    # with pd.concat inside the loop re-copies all accumulated rows on every iteration.
    rows = []
    for ref_row in tqdm(mol_df.itertuples(name=None, index=False)):
        # Only include those with Predictor Availability
        # NOTE: This also implies that the Tool Group is populated (e.g. mhci)
        if ref_row[availability_idx] == 0:
            continue

        iedb_label = ref_row[label_idx]
        species = ref_row[taxon_idx]
        tool_group = ref_row[tool_group_idx]
        ref_row_synonyms = ref_row[synonyms_idx]

        # Row for the label itself.
        rows.append({
            "Alleles": iedb_label,
            "Complement": ref_row_synonyms,
            "Species": species,
            "Tool Group": tool_group,
            "Is Label": 1,
        })

        # One additional row per synonym, pointing back at the label.
        if ref_row_synonyms:
            for synonym in ref_row_synonyms.split("|"):
                rows.append({
                    "Alleles": synonym,
                    "Complement": iedb_label,
                    "Species": species,
                    "Tool Group": tool_group,
                    "Is Label": 0,
                })

    # Passing `columns` keeps the header row even when no allele qualified.
    aa_df = pd.DataFrame(rows, columns=output_columns)
    aa_df.to_csv('{}/allele_datasource.tsv'.format(DATA_DIR), sep='\t', index=False)

    # Remove 'Tool Group' from mhc_alleles.tsv file.
    # The file is re-read so the rewrite starts from the data exactly as stored on disk
    # (i.e. without the fillna applied above).
    mol_df = pd.read_csv(alleles_path, skipinitialspace=True, sep='\t', encoding='utf-8')
    mol_df = mol_df.drop('Tool Group', axis=1)
    mol_df.to_csv(alleles_path, sep='\t', index=False)



def add_tool_label_as_synonym(df):
    '''===============================================================================================
        Description :
          For every row of the given molecule DataFrame, look up the rows of
          'tools-mapping.tsv' that share both its 'MRO ID' and its 'Tool Group', and
          append their 'Tool Label' values to the row's '|'-separated 'Synonyms'.
          A 'Tool Label' is not added when it is already one of the synonyms or when
          it equals the row's 'IEDB Label'.

          Rows repeating an already seen ('MRO ID', 'Tool Group') pair are dropped;
          only the first occurrence is written.

        Parameters :
          - df : pandas DataFrame containing at least the columns 'MRO ID',
                 'IEDB Label', 'Synonyms' and 'Tool Group'. It is not modified.

        Return Value :
          - None. Writes the result to 'mhc_alleles.tsv' inside DATA_DIR.
    ==============================================================================================='''
    tools_mapping_df = pd.read_csv('{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE), skipinitialspace=True, sep='\t')

    # Resolve the column positions once instead of calling list.index() for every row.
    mol_header = list(df.columns)
    mro_id_idx = mol_header.index('MRO ID')
    iedb_label_idx = mol_header.index('IEDB Label')
    synonyms_idx = mol_header.index('Synonyms')
    tool_group_idx = mol_header.index('Tool Group')

    # Index the tool labels by (MRO ID, Tool Group) once, rather than filtering the whole
    # tools-mapping frame for every molecule row. The inner dict is used as an ordered
    # set: labels are de-duplicated but keep their file order, so the generated synonym
    # order is reproducible from run to run (list(set(...)) is not, for strings).
    labels_by_key = {}
    mapping_columns = tools_mapping_df[['MRO ID', 'Tool Group', 'Tool Label']]
    for mro_id, tool_group, tool_label in mapping_columns.itertuples(name=None, index=False):
        # A missing key part can never match a molecule row, and a missing label cannot
        # be joined into the synonym string, so such rows are ignored.
        if pd.isna(mro_id) or pd.isna(tool_group) or pd.isna(tool_label):
            continue
        labels_by_key.setdefault((mro_id, tool_group), {})[tool_label] = None

    visited_keys = set()
    updated_result_list = []

    for mol_row in tqdm(df.itertuples(name=None, index=False)):
        # (mro_id, mol_tool_group) is the ID that needs to be tracked
        key = (mol_row[mro_id_idx], mol_row[tool_group_idx])
        if key in visited_keys:
            continue
        visited_keys.add(key)

        iedb_label = mol_row[iedb_label_idx]
        mro_synonyms = mol_row[synonyms_idx]

        # Missing (NaN/None) or empty synonyms mean "no synonyms yet".
        if pd.isna(mro_synonyms) or mro_synonyms == '':
            synonyms = []
        else:
            synonyms = mro_synonyms.split('|')

        for potential_synonym in labels_by_key.get(key, ()):
            if potential_synonym not in synonyms and potential_synonym != iedb_label:
                synonyms.append(potential_synonym)

        # append synonyms as the following format: syn1|syn2|syn3...
        updated_row = list(mol_row)
        updated_row[synonyms_idx] = '|'.join(synonyms)
        updated_result_list.append(updated_row)

    final_df = pd.DataFrame(updated_result_list, columns=mol_header)

    # NOTE: the 'Tool Group' column is intentionally kept in the written file.

    # Save to file
    final_df.to_csv('{}/{}'.format(DATA_DIR, MHC_ALLELES_FILE), sep='\t', index=False)



def create_mhc_alleles():
    '''===============================================================================================
        Description :
          Build the initial allele DataFrame by combining 'mro_molecules.tsv' with
          'tools-mapping.tsv' (both read from DATA_DIR).

          The columns 'MRO ID', 'IEDB Label', 'Synonyms', 'In Taxon' and 'Parent' are
          taken from the molecule file and two columns are added:
            - 'Tool Group'             : '' unless the MRO ID occurs in tools-mapping
            - 'Predictor Availability' : 1 if the MRO ID occurs in tools-mapping, else 0
          When an MRO ID is mapped to several distinct tool groups, the first one
          (in tools-mapping file order) is stored on the molecule's own row, and one
          extra copy of the row is appended at the end of the DataFrame for each of
          the remaining tool groups.

        Parameters :
          - None

        Return Value :
          - pandas DataFrame with the columns listed above.
    ==============================================================================================='''
    mol_df = pd.read_csv('{}/{}'.format(DATA_DIR, MOLECULE_FILE), skipinitialspace=True, sep='\t')
    tm_df = pd.read_csv('{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE), skipinitialspace=True, sep='\t')

    # Create a sub DataFrame of 'mro_molecules', and add two columns that will be
    # filled in from the 'tools-mapping' dataframe.
    selected_columns = ['MRO ID', 'IEDB Label', 'Synonyms', 'In Taxon', 'Parent']
    mol_sub_df = mol_df[selected_columns].copy()
    mol_sub_df['Tool Group'] = ''
    mol_sub_df['Predictor Availability'] = 0
    mol_sub_df_header = list(mol_sub_df.columns)
    mro_id_idx = mol_sub_df_header.index('MRO ID')
    tool_group_idx = mol_sub_df_header.index('Tool Group')
    availability_idx = mol_sub_df_header.index('Predictor Availability')

    # Map each MRO ID to its distinct tool groups, in order of first appearance.
    # The values are read by column name, so the result does not depend on where
    # 'MRO ID' and 'Tool Group' sit among the tools-mapping columns.
    tool_groups_by_mro_id = {}
    unique_pairs = tm_df[['MRO ID', 'Tool Group']].drop_duplicates()
    for mro_id, tool_group in unique_pairs.itertuples(name=None, index=False):
        if pd.isna(mro_id):
            # A mapping row without an MRO ID cannot be matched to any molecule.
            continue
        tool_groups_by_mro_id.setdefault(mro_id, []).append(tool_group)

    # The new column values are collected in lists and assigned once after the loop;
    # a dict lookup per row replaces scanning the whole frame for every molecule.
    tool_group_column = []
    availability_column = []
    additional_entries = []
    for mol_row in tqdm(mol_sub_df.itertuples(name=None, index=False)):
        tool_groups = tool_groups_by_mro_id.get(mol_row[mro_id_idx])

        if not tool_groups:
            # Unknown ID that doesn't exist in the tools-mapping.
            tool_group_column.append('')
            availability_column.append(0)
            continue

        # The first tool group is stored on the molecule's own row.
        first_group, *other_groups = tool_groups
        tool_group_column.append(first_group)
        availability_column.append(1)

        # Starting from the second entry, each tool group is added as a new row.
        # These rows have the same MRO ID, but a different Tool Group.
        for tool_group in other_groups:
            tmp_row = list(mol_row)
            tmp_row[tool_group_idx] = tool_group
            tmp_row[availability_idx] = 1
            additional_entries.append(tmp_row)

    mol_sub_df['Tool Group'] = tool_group_column
    mol_sub_df['Predictor Availability'] = availability_column

    # Add all entries that have the same MRO ID, but a different Tool Group.
    # Skipped when there are none, so no empty frame takes part in the concat.
    if additional_entries:
        extra_df = pd.DataFrame(additional_entries, columns=mol_sub_df_header)
        mol_sub_df = pd.concat([mol_sub_df, extra_df], ignore_index=True)

    return mol_sub_df

    
if __name__=='__main__':
    # Running this script creates two files inside DATA_DIR:
    #   1. 'mhc_alleles.tsv'
    #   2. 'allele_datasource.tsv'

    # perf_counter() is monotonic, so the reported duration cannot be distorted by
    # system clock adjustments the way a time.time() difference can.
    start = time.perf_counter()

    # Create initial DataFrame that will become the 'mhc_alleles'
    df = create_mhc_alleles()

    # Add 'Tool Label' from tools-mapping file as a synonym (writes 'mhc_alleles.tsv')
    add_tool_label_as_synonym(df)

    # Create autocomplete datasource file (reads back 'mhc_alleles.tsv')
    create_autocomplete_datasource()

    elapsed = time.perf_counter() - start
    print(f'Time taken: {elapsed}')