import pandas as pd
from pathlib import Path
from tqdm import tqdm


# ===============================================================================================
# Module-level constants.
# ===============================================================================================
# Directory holding the TSV/TXT inputs and outputs: the "data" folder located next to this
# script. Kept as a plain string because the functions below build paths with str.format().
_SCRIPT_DIR = Path(__file__).resolve().parent
DATA_DIR = str(_SCRIPT_DIR) + "/data"

# File names used by this script. TOOLS_MAPPING_FILE is read from DATA_DIR;
# MRO_MOLECULES_FILE is not referenced by the code visible in this file.
MRO_MOLECULES_FILE = 'mro_molecules.tsv'
TOOLS_MAPPING_FILE = 'tools-mapping.tsv'

# def generate_report():
#     '''
#     # NOTE:
#     # Potential edge cases that can appear:
#     #   1. Allele from mhc_alleles is no longer in MRO.
#     #   2. Allele from mhc_alleles is assigned to a new MRO ID (presumably -- the original note was cut off after "new M"; confirm).
#     '''

def create_autocomplete_datasource():
    '''===============================================================================================
        Description :
          Build the datasource file used for allele autocomplete from 'mhc_alleles.tsv'.
          Rows whose 'Predictor Availability' equals 0 are skipped. Every remaining allele
          produces:
            - one row keyed by its 'IEDB Label' ('Is Label' = 1, 'Complement' = the raw
              '|'-separated synonym string, possibly empty), and
            - one row per synonym ('Is Label' = 0, 'Complement' = the IEDB Label).

        Parameters :
          - None

        Return Value :
          - None. Writes 'allele_datasource.tsv' into DATA_DIR.
    ==============================================================================================='''
    mol_df = pd.read_csv('{}/mhc_alleles.tsv'.format(DATA_DIR), skipinitialspace=True, sep='\t', encoding='utf-8')
    # Missing synonyms are read as NaN; normalise to "" so the truthiness test below works.
    mol_df["Synonyms"] = mol_df["Synonyms"].fillna("")

    # Resolve the column positions once instead of searching the header list on every row.
    mol_df_headers = list(mol_df.columns)
    availability_idx = mol_df_headers.index('Predictor Availability')
    synonyms_idx = mol_df_headers.index('Synonyms')
    label_idx = mol_df_headers.index('IEDB Label')
    species_idx = mol_df_headers.index('In Taxon')
    tool_group_idx = mol_df_headers.index('Tool Group')

    output_columns = ["Alleles", "Complement", "Species", "Tool Group", "Is Label"]

    # Collect plain dicts and build the DataFrame once at the end; concatenating inside
    # the loop copies the whole frame for every added row.
    rows = []

    for ref_row in tqdm(mol_df.itertuples(name=None, index=False)):
        # Only include those with Predictor Availability
        # NOTE: This also implies that the Tool Group is populated (e.g. mhci)
        if ref_row[availability_idx] == 0:
            continue

        iedb_label = ref_row[label_idx]
        ref_row_synonyms = ref_row[synonyms_idx]
        species = ref_row[species_idx]
        tool_group = ref_row[tool_group_idx]

        # Row for the official label itself.
        rows.append({
            "Alleles": iedb_label,
            "Complement": ref_row_synonyms,
            "Species": species,
            "Tool Group": tool_group,
            "Is Label": 1,
        })

        # Add one additional row per synonym, pointing back at the official label.
        if ref_row_synonyms:
            for synonym in ref_row_synonyms.split("|"):
                rows.append({
                    "Alleles": synonym,
                    "Complement": iedb_label,
                    "Species": species,
                    "Tool Group": tool_group,
                    "Is Label": 0,
                })

    # Passing `columns` keeps the header row even when no allele qualified.
    aa_df = pd.DataFrame(rows, columns=output_columns)
    aa_df.to_csv('{}/allele_datasource.tsv'.format(DATA_DIR), sep='\t', index=False)


def populate_tool_group():
    '''===============================================================================================
        Description :
          Add (or overwrite) the 'Tool Group' column of 'mhc_alleles.tsv'. Each allele's
          'IEDB Label' is looked up in the tools-mapping file (TOOLS_MAPPING_FILE). When the
          label is found, the 'Tool Group' of its first matching mapping row is used;
          otherwise the tool group is written as '-'.

          'mhc_alleles.tsv' must already contain the 'Predictor Availability' column, because
          that column is copied to the output.

        Parameters :
          - None

        Return Value :
          - None. Rewrites 'mhc_alleles.tsv' in DATA_DIR with the columns
            MRO ID, Tool Group, IEDB Label, Synonyms, In Taxon, Predictor Availability.
    ==============================================================================================='''
    mol_df = pd.read_csv('{}/mhc_alleles.tsv'.format(DATA_DIR), skipinitialspace=True, sep='\t')
    tools_mapping_df = pd.read_csv('{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE), skipinitialspace=True, sep='\t')

    # Build the label -> tool group lookup once. Keeping the FIRST occurrence makes the
    # result deterministic when a label is listed under more than one tool group
    # (picking an element out of a set is not).
    label_to_group = {}
    for label, group in zip(tools_mapping_df['IEDB Label'], tools_mapping_df['Tool Group']):
        if pd.isna(label):
            # A missing label can never match an allele.
            continue
        label_to_group.setdefault(label, group)

    # Unmapped alleles (including rows with a missing label) get '-'.
    tool_group_col = [label_to_group.get(label, '-') for label in tqdm(mol_df['IEDB Label'])]

    # Copy only the columns that are carried over; any pre-existing 'Tool Group' is replaced.
    carried_columns = ['MRO ID', 'IEDB Label', 'Synonyms', 'In Taxon', 'Predictor Availability']
    final_molecule_df = mol_df[carried_columns].copy()
    final_molecule_df['Tool Group'] = tool_group_col

    # reordering columns
    column_ordering = ['MRO ID', 'Tool Group', 'IEDB Label', 'Synonyms', 'In Taxon', 'Predictor Availability']
    final_molecule_df = final_molecule_df[column_ordering]

    # Save to file
    final_molecule_df.to_csv('{}/{}'.format(DATA_DIR, 'mhc_alleles.tsv'), sep='\t', index=False)


def populate_predictor_availability():
    '''===============================================================================================
        Description :
          Add (or overwrite) the 'Predictor Availability' column of 'mhc_alleles.tsv'. An
          allele gets 1 when its 'IEDB Label' appears in the 'IEDB Label' column of the
          tools-mapping file (TOOLS_MAPPING_FILE), and 0 otherwise.

          Only the columns MRO ID, IEDB Label, Synonyms and In Taxon are carried over from
          the input; any other column is dropped from the rewritten file.

        Parameters :
          - None

        Return Value :
          - None. Rewrites 'mhc_alleles.tsv' in DATA_DIR.
    ==============================================================================================='''
    mol_df = pd.read_csv('{}/{}'.format(DATA_DIR, 'mhc_alleles.tsv'), skipinitialspace=True, sep='\t')
    tools_mapping_df = pd.read_csv('{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE), skipinitialspace=True, sep='\t')

    # Collect the mapped labels once so each allele costs a single set lookup rather than
    # a scan of the whole mapping table. Missing labels are dropped: they never match.
    available_labels = set(tools_mapping_df['IEDB Label'].dropna())

    predictor_col = [int(label in available_labels) for label in tqdm(mol_df['IEDB Label'])]

    final_molecule_df = mol_df[['MRO ID', 'IEDB Label', 'Synonyms', 'In Taxon']].copy()
    final_molecule_df['Predictor Availability'] = predictor_col

    # Save to file
    final_molecule_df.to_csv('{}/{}'.format(DATA_DIR, 'mhc_alleles.tsv'), sep='\t', index=False)

def add_netctl_alleles():
    '''===============================================================================================
        Description :
          Append 'netctl' rows to the tools-mapping file (TOOLS_MAPPING_FILE). The first line
          of 'netctl-alleles.txt' is skipped; every other non-blank line is expected to start
          with an allele name, followed by a length. For each allele that already appears as
          an 'IEDB Label' in the tools-mapping file, a new row is added that reuses the
          'Tool Label' and 'MRO ID' of the first matching row. Alleles that are not found
          are ignored.

          NOTE(review): every call appends again, so running this twice on the same
          tools-mapping file duplicates the netctl rows.

        Parameters :
          - None

        Return Value :
          - None. Rewrites the tools-mapping file in DATA_DIR.
    ==============================================================================================='''
    tools_mapping_df = pd.read_csv('{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE), skipinitialspace=True, sep='\t')
    netctl_file = DATA_DIR + '/' + 'netctl-alleles.txt'
    with open(netctl_file, 'r') as f:
        content = f.readlines()

    netctl_rows = []

    # content[0] is skipped, exactly as before.
    for line in content[1:]:
        # split() without arguments tolerates repeated spaces/tabs and yields an empty
        # list for blank lines (e.g. a trailing newline at the end of the file).
        fields = line.split()
        if not fields:
            continue
        netctl_allele = fields[0]

        # search if the allele exists in the tools_mapping_df
        print(f'searching for {netctl_allele} in the tools_mapping file')
        tmp_df = tools_mapping_df.loc[tools_mapping_df['IEDB Label'] == netctl_allele]
        if tmp_df.empty:
            continue

        # add rows for the allele that it's available for netctl
        first_match = tmp_df.iloc[0]
        netctl_rows.append({
            'Tool Group': 'mhci',
            'Tool': 'netctl',
            'Tool Version': '1.1',
            'Tool Label': first_match['Tool Label'],
            'IEDB Label': netctl_allele,
            'MRO ID': first_match['MRO ID'],
            # The length listed in the file is not used; 9 is hard-coded as before.
            'Lengths': 9,
        })

    # DataFrame.append() no longer exists in pandas >= 2.0; pd.concat() is the replacement.
    if netctl_rows:
        tools_mapping_df = pd.concat(
            [tools_mapping_df, pd.DataFrame(netctl_rows)],
            ignore_index=True,
            sort=False,
        )

    tools_mapping_df.to_csv('{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE), sep='\t', index=False)


def add_tool_label_as_synonym():
    '''===============================================================================================
        Description :
          For every row of 'mhc_alleles.tsv', look up its 'MRO ID' in the tools-mapping file
          (TOOLS_MAPPING_FILE) and add each matching 'Tool Label' to the row's 'Synonyms',
          unless the label is already a synonym or is identical to the row's 'IEDB Label'.
          Synonyms are stored as 'syn1|syn2|syn3...'. New synonyms are appended in the order
          in which they first appear in the tools-mapping file.

          Only the first row of each 'MRO ID' is kept; later rows with an already seen
          'MRO ID' are dropped from the output.

          NOTE(review): the input is 'mhc_alleles.tsv'; MRO_MOLECULES_FILE is not read here.
          Confirm that this is intended.

        Parameters :
          - None

        Return Value :
          - None. Rewrites 'mhc_alleles.tsv' in DATA_DIR.
    ==============================================================================================='''
    mol_df = pd.read_csv('{}/{}'.format(DATA_DIR, 'mhc_alleles.tsv'), skipinitialspace=True, sep='\t')
    tools_mapping_df = pd.read_csv('{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE), skipinitialspace=True, sep='\t')

    mol_header = list(mol_df.columns)
    mro_id_idx = mol_header.index('MRO ID')
    label_idx = mol_header.index('IEDB Label')
    synonyms_idx = mol_header.index('Synonyms')

    # Group the tool labels by MRO ID once, in file order. A list (not a set) keeps the
    # order of the appended synonyms stable between runs. Rows with a missing MRO ID can
    # never match, and a missing tool label cannot be joined into the synonym string.
    labels_by_mro_id = {}
    for mapped_id, tool_label in zip(tools_mapping_df['MRO ID'], tools_mapping_df['Tool Label']):
        if pd.isna(mapped_id) or pd.isna(tool_label):
            continue
        labels_by_mro_id.setdefault(mapped_id, []).append(tool_label)

    visited_mro_ids = set()
    updated_result_list = []

    for mol_row in tqdm(mol_df.itertuples(name=None, index=False)):
        mro_id = mol_row[mro_id_idx]

        # Keep only the first row seen for each MRO ID.
        if mro_id in visited_mro_ids:
            continue
        visited_mro_ids.add(mro_id)

        iedb_label = mol_row[label_idx]
        mro_synonyms = mol_row[synonyms_idx]
        # An empty 'Synonyms' cell is read as NaN.
        synonyms = [] if pd.isna(mro_synonyms) else mro_synonyms.split('|')

        for potential_synonym in labels_by_mro_id.get(mro_id, []):
            if (potential_synonym not in synonyms) and (potential_synonym != iedb_label):
                synonyms.append(potential_synonym)

        # append synonyms as the following format: syn1|syn2|syn3...
        updated_row = list(mol_row)
        updated_row[synonyms_idx] = '|'.join(synonyms)
        updated_result_list.append(updated_row)

    final_df = pd.DataFrame(updated_result_list, columns=mol_header)

    # Save to file
    final_df.to_csv('{}/{}'.format(DATA_DIR, 'mhc_alleles.tsv'), sep='\t', index=False)


if __name__ == '__main__':
    # Processing stages, executed strictly in the order listed. Each stage reads and
    # rewrites files in DATA_DIR, so later stages see the output of earlier ones.
    pipeline = (
        # Add tool labels from tools-mapping.tsv as synonyms in mhc_alleles.tsv.
        add_tool_label_as_synonym,

        # Add netctl alleles to tools-mapping.tsv.
        add_netctl_alleles,

        # TODO: Should do the reporting. (WIP)
        # This should check 'tools_mapping' against 'mhc_alleles' and
        # check any missing mroids, if so, export/report
        # generate_report,

        # Add additional columns to 'mhc_alleles.tsv'.
        populate_predictor_availability,
        populate_tool_group,

        # Create datasource for Autocomplete.
        create_autocomplete_datasource,
    )

    for stage in pipeline:
        stage()