import sys
import re
import time
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm

# ===============================================================================================
# Global Vars.
# ===============================================================================================
# File names are relative to DATA_DIR.
MHC_ALLELES_FILE = 'mhc_alleles.tsv'        # output of the MRO/tools-mapping merge
MOLECULE_FILE = 'mro_molecules.tsv'         # MRO molecule export (input)
TOOLS_MAPPING_FILE = 'tools-mapping.tsv'    # tool label <-> MRO ID mapping (input)

# Directory named "data" that sits next to this script.
DATA_DIR = f"{Path(__file__).parent}/data"


def clean_allele(allele: str) -> str:
    '''Return `allele` with every character other than ASCII letters and digits removed.'''
    non_alphanumeric = re.compile(r'[^A-Za-z0-9]')
    return non_alphanumeric.sub('', allele)


def remove_single_dp_alleles():
    '''===============================================================================================
        Description :
          Diagnostic for the issue mentioned in 785 (single DP/DQ alleles). Reads
          "allele_datasource.tsv" and prints every allele that starts with
          DPA/DPB/DQA/DQB (optionally prefixed with "HLA-") but does not match the
          paired-chain pattern below.
          Despite its name, this function removes nothing and writes no file.

        Parameters :
          - None

        Return Value :
          - None (offending alleles are printed to stdout)
    ==============================================================================================='''
    mol_df = pd.read_csv('{}/{}'.format(DATA_DIR, 'allele_datasource.tsv'), skipinitialspace=True, sep='\t', encoding='utf-8')

    # Both patterns are loop-invariant, so compile them once.
    # Only alleles starting with HLA-DPA|DPB|DQA|DQB are checked.
    single_chain_prefix = re.compile(r'^(?:HLA-)?(DPA|DPB|DQA|DQB)')
    mhcii_pair = re.compile(
        r'^(?:HLA-)?(D[PQ][AB]\d{0,3})\*(\d{1,3})[:|\d](\d{1,3})[A-Z]{0,2}/'
        r'(?:HLA-)?(D[PQ][AB]\d{0,3})\*(\d{1,3})[:|\d](\d{1,3})[A-Z]{0,2}'
    )
    # NOTE: Perhaps bring in tools-mapping info and do join to verify if the allele is in the tools-mapping.

    for allele in tqdm(mol_df['Alleles']):
        # Empty cells are read back as NaN (float); re.match would raise TypeError on them.
        if not isinstance(allele, str):
            continue

        if single_chain_prefix.match(allele) and not mhcii_pair.match(allele):
            print(f"Did not match: {allele}")


def double_check_single_mhcii_alleles():
    '''===============================================================================================
        Description :
          Takes every paired MHC class II allele in the MHC alleles file (labels that
          contain both DQA and DQB, or both DPA and DPB), splits each "HLA-" pair into
          its alpha and beta chain, and reports the chains that do not exist as an
          'IEDB Label' of their own among the class II rows.

        Parameters :
          - None

        Return Value :
          - (unmapped_alpha, unmapped_beta): two sorted lists of unique "HLA-" prefixed
            chain labels. Sorting keeps the result identical from run to run.
    ==============================================================================================='''
    mol_df = pd.read_csv('{}/{}'.format(DATA_DIR, MHC_ALLELES_FILE), skipinitialspace=True, sep='\t', encoding='utf-8')

    # Filter to only contain MHCII
    filtered_mhcii_df = mol_df[(mol_df['Parent'] == 'MHC class II protein complex')]
    iedb_labels = filtered_mhcii_df['IEDB Label'].to_list()

    # Set lookup keeps the membership tests below O(1) each.
    known_labels = set(iedb_labels)

    prefix = 'HLA-'
    unmapped_alpha = set()
    unmapped_beta = set()

    for label in tqdm(iedb_labels):
        # An empty label cell is read back as NaN; nothing to split in that case.
        if not isinstance(label, str):
            continue

        label_upper = label.upper()
        is_dq_pair = 'DQA' in label_upper and 'DQB' in label_upper
        is_dp_pair = 'DPA' in label_upper and 'DPB' in label_upper
        if not (is_dq_pair or is_dp_pair):
            continue

        # Only "HLA-" prefixed pairs are broken up; other pairs are ignored.
        if not label.startswith(prefix):
            continue

        parts = re.split(r'[/\-]', label.replace(prefix, ''))
        if len(parts) != 2:
            print(f"Skipping: {label} (unexpected format)")
            continue

        alpha = prefix + parts[0]
        beta = prefix + parts[1]

        if alpha not in known_labels:
            unmapped_alpha.add(alpha)
        if beta not in known_labels:
            unmapped_beta.add(beta)

    return sorted(unmapped_alpha), sorted(unmapped_beta)


def add_mhci_alleles_to_phbr():
    '''===============================================================================================
        Description :
          Every row of "allele_datasource.tsv" whose 'Tool Group' is 'mhci' is copied
          with 'Tool Group' set to 'phbr', and the copies are appended to the file.
          All other columns of the copied rows are left as they are.

        Parameters :
          - None

        Return Value :
          - None ("allele_datasource.tsv" is rewritten in place)
    ==============================================================================================='''
    datasource_path = '{}/{}'.format(DATA_DIR, 'allele_datasource.tsv')
    allele_datasource_df = pd.read_csv(datasource_path, skipinitialspace=True, sep='\t', encoding='utf-8')

    # Keep only the MHC class I rows; these are the ones to duplicate.
    is_mhci = allele_datasource_df['Tool Group'] == 'mhci'
    mhci_rows_df = allele_datasource_df[is_mhci]
    column_names = list(mhci_rows_df.columns)
    print(column_names)

    # Rebuild each row with 'phbr' in the 'Tool Group' slot.
    tool_group_pos = column_names.index('Tool Group')
    phbr_rows = [
        mhci_row[:tool_group_pos] + ('phbr',) + mhci_row[tool_group_pos + 1:]
        for mhci_row in tqdm(mhci_rows_df.itertuples(name=None, index=False))
    ]

    phbr_df = pd.DataFrame(phbr_rows, columns=column_names)
    merged_df = pd.concat([allele_datasource_df, phbr_df], ignore_index=True)
    merged_df.to_csv('{}/allele_datasource.tsv'.format(DATA_DIR), sep='\t', index=False)
        

def add_single_chains_for_phbr():
    '''===============================================================================================
        Description :
          Appends PHBR rows for single-chain DQ/DP alleles to "allele_datasource.tsv".
          Sources of the new rows:
            1. MHC class II rows of the MHC alleles file whose label names only one of
               DQA/DQB or only one of DPA/DPB (one row for the label, one per synonym).
            2. Alpha/beta chains returned by double_check_single_mhcii_alleles(),
               added as human, unobserved, non-label rows.

        Parameters :
          - None

        Return Value :
          - None ("allele_datasource.tsv" is rewritten in place)
    ==============================================================================================='''
    def phbr_row(allele, complement, species, is_label, unobserved):
        # Key order defines the column order of the resulting DataFrame.
        return {
            "Alleles": allele,
            "Complement": complement,
            "Species": species,
            "Tool Group": 'phbr',
            "Is Label": is_label,
            "Unobserved": unobserved
        }

    allele_datasource_df = pd.read_csv('{}/{}'.format(DATA_DIR, 'allele_datasource.tsv'), skipinitialspace=True, sep='\t', encoding='utf-8')
    mol_df = pd.read_csv('{}/{}'.format(DATA_DIR, MHC_ALLELES_FILE), skipinitialspace=True, sep='\t', encoding='utf-8')

    # Restrict to MHC class II entries.
    class_ii_df = mol_df[mol_df['Parent'] == 'MHC class II protein complex']
    class_ii_columns = list(class_ii_df.columns)
    col = class_ii_columns.index

    # Pass 1: pick the rows naming exactly one chain of a DQ or DP pair.
    # (DRB does not come as a pair, so only DQ-/DP- labels are inspected.)
    single_chain_records = []
    for record in tqdm(class_ii_df.itertuples(name=None, index=False)):
        label = record[col('IEDB Label')]
        label_upper = label.upper()

        # Edge case allele
        if label == 'HLA-DQA1*01:02/DRB1*15:01':
            continue

        lone_dq_chain = ('DQA' in label_upper) != ('DQB' in label_upper)
        lone_dp_chain = ('DPA' in label_upper) != ('DPB' in label_upper)
        if lone_dq_chain or lone_dp_chain:
            single_chain_records.append(record)

    # All single chain DQA/DQB/DPA/DPB alleles, regardless of species.
    single_dp_dq_df = pd.DataFrame(single_chain_records, columns=class_ii_columns)

    # Pass 2: reshape those rows into the datasource layout.
    phbr_rows = []
    for record in tqdm(single_dp_dq_df.itertuples(name=None, index=False)):
        label = record[col('IEDB Label')]
        synonyms_field = record[col('Synonyms')]
        unobserved = 1 if record[col('MRO ID')].startswith('NOMRO') else 0

        phbr_rows.append(
            phbr_row(label, synonyms_field, record[col('In Taxon')], 1, unobserved)
        )

        # Each synonym gets a row of its own, pointing back at the label.
        if pd.isna(synonyms_field):
            continue
        for synonym in synonyms_field.split("|"):
            phbr_rows.append(
                phbr_row(synonym.strip(), label, record[col('In Taxon')], 0, unobserved)
            )

    # Chains that appear inside a pair but have no row of their own (no MRO ID).
    alpha_chains, beta_chains = double_check_single_mhcii_alleles()
    for chain in list(alpha_chains or []) + list(beta_chains or []):
        phbr_rows.append(phbr_row(chain, '', 'human', 0, 1))

    additions_df = pd.DataFrame(phbr_rows)
    merged_df = pd.concat([allele_datasource_df, additions_df], ignore_index=True)
    merged_df.to_csv('{}/allele_datasource.tsv'.format(DATA_DIR), sep='\t', index=False)


def create_autocomplete_datasource():
    '''===============================================================================================
        Description :
          Builds the datasource file used for allele autocomplete from the MHC alleles
          file. Every row with a non-zero 'Predictor Availability' yields one row for
          its IEDB Label ('Is Label' = 1) and one row per synonym ('Is Label' = 0,
          'Complement' = the IEDB Label). Rows whose 'MRO ID' starts with 'NOMRO' are
          flagged as 'Unobserved'.
          Afterwards the 'Tool Group' column is dropped from the MHC alleles file.

        Parameters :
          - None

        Return Value :
          - None (writes "allele_datasource.tsv" and rewrites the MHC alleles file)
    ==============================================================================================='''
    mol_df = pd.read_csv('{}/{}'.format(DATA_DIR, MHC_ALLELES_FILE), skipinitialspace=True, sep='\t', encoding='utf-8')
    datasource_columns = ["Alleles", "Complement", "Species", "Tool Group", "Is Label", "Unobserved"]

    records = zip(
        mol_df['MRO ID'],
        mol_df['IEDB Label'],
        mol_df['Synonyms'].fillna(""),
        mol_df['In Taxon'],
        mol_df['Tool Group'],
        mol_df['Predictor Availability'],
    )

    # Rows are collected in a list and turned into a DataFrame once; calling
    # pd.concat per row copies the whole frame each time (quadratic cost).
    rows = []
    for mro_id, iedb_label, raw_synonyms, taxon, tool_group, availability in tqdm(records, total=len(mol_df)):
        # Only include those with Predictor Availability
        # NOTE: This also implies that the Tool Group is populated (e.g. mhci)
        if availability == 0:
            continue

        is_unobserved = 1 if mro_id.startswith('NOMRO') else 0

        # Strip each synonym and drop empty ones, as add_single_chains_for_phbr()
        # does, so stray whitespace never becomes (part of) an allele name.
        synonyms = [syn.strip() for syn in raw_synonyms.split("|")]
        synonyms = [syn for syn in synonyms if syn]

        rows.append({
            "Alleles": iedb_label,
            "Complement": "|".join(synonyms),
            "Species": taxon,
            "Tool Group": tool_group,
            "Is Label": 1,
            "Unobserved": is_unobserved
        })

        # Add additional rows for each synonym
        for synonym in synonyms:
            rows.append({
                "Alleles": synonym,
                "Complement": iedb_label,
                "Species": taxon,
                "Tool Group": tool_group,
                "Is Label": 0,
                "Unobserved": is_unobserved
            })

    # Passing the columns explicitly keeps the header even when there are no rows.
    aa_df = pd.DataFrame(rows, columns=datasource_columns)
    aa_df.to_csv('{}/allele_datasource.tsv'.format(DATA_DIR), sep='\t', index=False)

    # Remove 'Tool Group' from mhc_alleles.tsv file (mol_df itself was not modified above).
    mol_df.drop(columns='Tool Group').to_csv('{}/{}'.format(DATA_DIR, MHC_ALLELES_FILE), sep='\t', index=False)



def add_tool_label_as_synonym(df) :
    '''===============================================================================================
        Description :
          For every (MRO ID, Tool Group) row of `df`, looks up the matching 'Tool Label'
          values in 'tools-mapping' and adds them to the row's 'Synonyms'. Labels listed
          in netmhciipan-4.3 'convert_pseudo.dat' for the same allele (compared after
          clean_allele()) are added as well. A label equal to the row's 'IEDB Label' is
          never added as a synonym. Rows whose MRO ID starts with 'NOMRO' are kept
          unchanged; repeated (MRO ID, Tool Group) rows are dropped.
          The result goes through add_icerfire_synonyms(), is de-duplicated and saved.

        Parameters :
          - df : DataFrame with at least the columns 'MRO ID', 'IEDB Label', 'Synonyms'
                 and 'Tool Group'.

        Return Value :
          - None (writes the MHC alleles TSV file)
    ==============================================================================================='''
    mol_df = df
    tools_mapping_df = pd.read_csv('{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE), skipinitialspace=True, sep='\t')
    mol_header = list(mol_df.columns)
    mro_col = mol_header.index('MRO ID')
    label_col = mol_header.index('IEDB Label')
    synonyms_col = mol_header.index('Synonyms')
    tool_group_col = mol_header.index('Tool Group')

    # Index tool labels by (MRO ID, Tool Group) once instead of filtering the whole
    # tools-mapping frame for every molecule row.
    tool_labels_by_key = {}
    mapping_rows = zip(tools_mapping_df['MRO ID'], tools_mapping_df['Tool Group'], tools_mapping_df['Tool Label'])
    for tm_mro_id, tm_tool_group, tm_tool_label in mapping_rows:
        if pd.isna(tm_tool_label):
            continue
        tool_labels_by_key.setdefault((tm_mro_id, tm_tool_group), []).append(tm_tool_label)

    # Also use 'convert_pseudo.dat' to make sure we cover all netmhciipan alleles.
    # The file does not change while we iterate, so it is parsed a single time.
    pseudo_data_file_path = '{}/{}/{}'.format(DATA_DIR, 'netmhciipan-4.3', 'convert_pseudo.dat')
    netmhciipan_43_dict = {}
    with open(pseudo_data_file_path, 'r') as f:
        for line in f:
            if not line.strip():
                continue
            # split() (rather than split(' ')) also removes the trailing newline,
            # which would otherwise end up inside the synonym.
            iedb_label_2, tool_label = line.split()
            netmhciipan_43_dict[clean_allele(iedb_label_2)] = [iedb_label_2, tool_label]

    visited_keys = set()
    updated_result_list = []

    for mol_row in tqdm(mol_df.itertuples(name=None, index=False)) :
        mro_id = mol_row[mro_col]

        if mro_id.startswith('NOMRO'):
            updated_result_list.append(mol_row)
            continue

        # (mro_id, mol_tool_group) is the ID that needs to be tracked
        key = (mro_id, mol_row[tool_group_col])
        if key in visited_keys:
            continue
        visited_keys.add(key)

        iedb_label = mol_row[label_col]
        mro_synonyms = mol_row[synonyms_col]

        # Missing synonyms arrive as NaN (float); start from an empty list then.
        # dict.fromkeys() removes repeats while keeping the original order.
        if isinstance(mro_synonyms, str):
            synonyms = list(dict.fromkeys(s for s in mro_synonyms.split('|') if s))
        else:
            synonyms = []

        candidates = tool_labels_by_key.get(key, []) + netmhciipan_43_dict.get(clean_allele(iedb_label), [])
        for potential_synonym in candidates:
            if (potential_synonym not in synonyms) and (potential_synonym != iedb_label):
                synonyms.append(potential_synonym)

        # append synonyms as the following format: syn1|syn2|syn3...
        mol_row = list(mol_row)
        mol_row[synonyms_col] = '|'.join(synonyms)
        updated_result_list.append(mol_row)

    final_df = pd.DataFrame(updated_result_list, columns=mol_header)

    final_df = add_icerfire_synonyms(final_df)

    # Drop any duplicate rows
    final_df = final_df.drop_duplicates(keep='last')

    # Save to file
    final_df.to_csv('{}/{}'.format(DATA_DIR, MHC_ALLELES_FILE), sep='\t', index=False)

def add_icerfire_synonyms(df):
    '''
    Add the icerfire tool labels from the tools-mapping file as synonyms.

    Only tools-mapping rows with 'Tool Group' == 'pvc' and 'Tool' == 'icerfire' are
    used. For each of them, both the tool label as written and its starred form
    ('HLA-A1234' -> 'HLA-A*1234') are candidates. A candidate is added when it is
    neither the IEDB label nor already a synonym of the first row of `df` with the
    same MRO ID; the resulting synonym string is then written to every row of `df`
    that has that MRO ID. MRO IDs absent from `df` are reported on stdout.

    df : DataFrame with at least the columns 'MRO ID', 'IEDB Label' and 'Synonyms',
         e.g.
                MRO ID   IEDB Label               Synonyms     In Taxon                       Parent Tool Group  Predictor Availability
    27661  MRO:0001162  HLA-C*12:02              HLA-C1202        human  MHC class I protein complex        pvc                       1
    27662  MRO:0001163  HLA-C*12:03  HLA-Cw*1203|HLA-C1203        human  MHC class I protein complex        pvc                       1

    Returns the same DataFrame object, modified in place.
    '''
    tools_mapping_df = pd.read_csv('{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE), skipinitialspace=True, sep='\t')
    tm_header = list(tools_mapping_df.columns)
    is_icerfire = (tools_mapping_df['Tool Group'] == 'pvc') & (tools_mapping_df['Tool'] == 'icerfire')
    icerfire_df = tools_mapping_df.loc[is_icerfire].reset_index(drop=True)

    mro_pos = tm_header.index('MRO ID')
    label_pos = tm_header.index('Tool Label')

    for tm_row in tqdm(icerfire_df.itertuples(name=None, index=False)) :
        mro_id = tm_row[mro_pos]
        tool_label = tm_row[label_pos]

        # 'HLA-A1234' -> 'HLA-A*1234'; labels of any other shape come back unchanged.
        starred_label = re.sub(r'^(HLA-[A-Z])(\d{4})$', r'\1*\2', tool_label)

        same_mro = df['MRO ID'] == mro_id
        if not same_mro.any():
            print(f'MRO ID {mro_id} not found in mol_df')
            continue

        # The first matching row provides the label and the starting synonyms.
        first_match = df.loc[same_mro]
        iedb_label = first_match['IEDB Label'].values[0]
        current_synonyms = first_match['Synonyms'].values[0]
        synonyms_list = current_synonyms.split('|') if str(current_synonyms) != 'nan' else []

        changed = False
        for candidate in (tool_label, starred_label):
            if candidate in synonyms_list or candidate == iedb_label:
                continue
            synonyms_list.append(candidate)
            changed = True

        # Write back only when something new was added.
        if changed:
            df.loc[same_mro, 'Synonyms'] = '|'.join(synonyms_list)

    return df


def add_missing_mro_from_mhcii_alleles(df):
    '''
    Append the alleles listed in 'netmhciipan_missing_mro_alleles.xlsx' to `df`.

    Each allele (first column of the spreadsheet) becomes an MHC class II row with
    the placeholder MRO ID 'NOMRO:0000000', Tool Group 'mhcii' and Predictor
    Availability 1. Its synonym is the tool label found for it in netmhciipan-4.3
    'convert_pseudo.dat'; labels containing both DQA and DQB are looked up with '/'
    replaced by '-', and that reformatted label is added as a synonym too.
    A label missing from 'convert_pseudo.dat' raises KeyError.

    Returns a new DataFrame: `df` followed by the added rows.
    '''
    missing_alleles_df = pd.read_excel('{}/{}'.format(DATA_DIR, 'netmhciipan_missing_mro_alleles.xlsx'))
    pseudo_data_file_path = '{}/{}/{}'.format(DATA_DIR, 'netmhciipan-4.3', 'convert_pseudo.dat')

    # "<IEDB label> <tool label>" per line; the tool label keeps its line ending here.
    tool_label_by_iedb_label = {}
    with open(pseudo_data_file_path, 'r') as pseudo_file:
        for line in pseudo_file:
            pseudo_label, pseudo_tool_label = line.split(' ')
            tool_label_by_iedb_label[pseudo_label] = pseudo_tool_label

    hla_prefix = 'HLA-'
    taxon_by_prefix = (('HLA', 'human'), ('BoLA', 'cattle'))

    new_rows = []
    for record in tqdm(missing_alleles_df.itertuples(name=None, index=False)):
        allele = record[0]

        taxon = ''
        for name_prefix, taxon_name in taxon_by_prefix:
            if allele.startswith(name_prefix):
                taxon = taxon_name

        # Only alleles with DQA/DQB are reformatted before the lookup.
        lookup_label = allele
        synonyms = []
        if 'DQA' in allele and 'DQB' in allele:
            lookup_label = hla_prefix + allele[len(hla_prefix):].replace('/', '-')
            synonyms.append(lookup_label)
        synonyms.append(tool_label_by_iedb_label[lookup_label])

        new_rows.append({
            'MRO ID': 'NOMRO:0000000',
            'IEDB Label': allele,
            # The tool label is last, so this removes the line ending read from the file.
            'Synonyms': "|".join(synonyms).rstrip('\n'),
            'In Taxon': taxon,
            'Parent': 'MHC class II protein complex',
            'Tool Group': 'mhcii',
            'Predictor Availability': 1
        })

    return pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

def create_mhc_alleles():
    '''
    Build the initial 'mhc_alleles' DataFrame from the molecule and tools-mapping files.

    The result has the molecule columns 'MRO ID', 'IEDB Label', 'Synonyms',
    'In Taxon' and 'Parent', plus:
      - 'Tool Group': a tool group mapped to the row's MRO ID ('' when the MRO ID is
        not in tools-mapping);
      - 'Predictor Availability': 1 when the MRO ID is in tools-mapping, else 0.

    An MRO ID mapped to several tool groups keeps the first group on its original
    row; each further group gets a copy of the row appended after all original
    rows. Returns the DataFrame; nothing is written to disk.
    '''
    mol_df = pd.read_csv('{}/{}'.format(DATA_DIR, MOLECULE_FILE), skipinitialspace=True, sep='\t')
    tm_df = pd.read_csv('{}/{}'.format(DATA_DIR, TOOLS_MAPPING_FILE), skipinitialspace=True, sep='\t')

    # Sub DataFrame of 'mro_molecules' with the two columns filled from 'tools-mapping'.
    selected_columns = ['MRO ID', 'IEDB Label', 'Synonyms', 'In Taxon', 'Parent']
    mol_sub_df = mol_df[selected_columns].copy()
    mol_sub_df['Tool Group'] = ''
    mol_sub_df['Predictor Availability'] = 0
    mol_sub_df_header = list(mol_sub_df.columns)
    mro_col = mol_sub_df_header.index('MRO ID')
    tool_group_col = mol_sub_df_header.index('Tool Group')
    availability_col = mol_sub_df_header.index('Predictor Availability')

    # Unique tool groups per MRO ID, in file order. The columns are read by name, so
    # the result does not depend on where 'MRO ID' sits in the tools-mapping header.
    tool_groups_by_mro = {}
    unique_pairs = tm_df[['MRO ID', 'Tool Group']].drop_duplicates()
    for tm_mro_id, tm_tool_group in unique_pairs.itertuples(name=None, index=False):
        tool_groups_by_mro.setdefault(tm_mro_id, []).append(tm_tool_group)

    primary_tool_groups = []
    primary_availability = []
    additional_entries = []
    for mol_row in tqdm(mol_sub_df.itertuples(name=None, index=False)):
        tool_groups = tool_groups_by_mro.get(mol_row[mro_col])

        if not tool_groups:
            # Unknown ID that doesn't exist in the tools-mapping.
            primary_tool_groups.append('')
            primary_availability.append(0)
            continue

        # The first tool group stays on the original row ...
        primary_tool_groups.append(tool_groups[0])
        primary_availability.append(1)

        # ... and only the remaining ones become new rows (same MRO ID, different
        # Tool Group). Appending the first group again would duplicate the row.
        for tool_group in tool_groups[1:]:
            extra_row = list(mol_row)
            extra_row[tool_group_col] = tool_group
            extra_row[availability_col] = 1
            additional_entries.append(extra_row)

    mol_sub_df['Tool Group'] = primary_tool_groups
    mol_sub_df['Predictor Availability'] = primary_availability

    # Add all entries that have the same MRO ID but a different Tool Group.
    if additional_entries:
        extra_df = pd.DataFrame(additional_entries, columns=mol_sub_df_header)
        mol_sub_df = pd.concat([mol_sub_df, extra_df], ignore_index=True)

    return mol_sub_df

    
if __name__ == '__main__':
    # Running this script produces two files under DATA_DIR:
    #   1. 'mhc_alleles.tsv'
    #   2. 'allele_datasource.tsv' (the autocomplete datasource)
    started_at = time.time()

    # Initial DataFrame that will become 'mhc_alleles', extended with the
    # MHC class II alleles that have no MRO mapping.
    alleles_df = add_missing_mro_from_mhcii_alleles(create_mhc_alleles())

    # Add 'Tool Label' from the tools-mapping file as a synonym (writes 'mhc_alleles.tsv').
    add_tool_label_as_synonym(alleles_df)

    # Create the autocomplete datasource file.
    create_autocomplete_datasource()

    # Duplicate the MHCI alleles for PHBR.
    add_mhci_alleles_to_phbr()

    # Add single alpha/beta chains for PHBR.
    add_single_chains_for_phbr()

    # Last filtering regarding issue 785: single DP alleles.
    # NOTE: This might be unnecessary as DP and DQ alleles have been removed
    # from the tools-mapping. Running the function below (which uses regex)
    # can't find any single DP/DQ alleles.
    # remove_single_dp_alleles()

    finished_at = time.time()
    print(f'Time taken: {finished_at-started_at}')