import sys
import csv
import os
import re
import json
import shutil
import subprocess
import tempfile
import textwrap
import validators
import pandas as pd
import warnings
from enum import Enum
from pathlib import Path
from typing import Dict, List, Union, Optional, Any, Tuple
from io import StringIO
from itertools import product
from pprint import pprint


class TCellClass(str, Enum):
    """MHC class selector.

    Members also derive from ``str``, so each one compares equal to its
    lowercase value (``"i"`` or ``"ii"``).
    """

    I = "i"    # MHC class I
    II = "ii"  # MHC class II


def create_default_filestructure(output_dir: Path) -> None:
    """Create the standard sub-directories underneath *output_dir*.

    ``predict-inputs``, ``predict-outputs`` and ``aggregate`` are always
    created; a directory whose own name is ``phbr`` additionally gets a
    ``results`` folder. Directories that already exist are left untouched.
    """
    subdirectories = ['predict-inputs', 'predict-outputs', 'aggregate']

    # Only the 'phbr' tree carries a dedicated 'results' directory.
    if output_dir.name == 'phbr':
        subdirectories.append('results')

    for subdirectory in subdirectories:
        (output_dir / subdirectory).mkdir(parents=True, exist_ok=True)


def reorganize_file_structure(data: Dict[str, Any]) -> None:
    """Reset the output directory and lay out one sub-tree per workflow.

    Reads ``data['metadata']['output_dir']``. If that directory exists it is
    removed (removal errors are ignored). An ``mhci`` / ``mhcii`` tree is
    created only when the matching ``class_i`` / ``class_ii`` section of
    *data* is non-empty; the ``phbr`` tree is always created.
    """
    print(data)
    output_dir = data['metadata']['output_dir']

    # (folder name, configuration section) for each optional MHC class.
    class_sections = (
        ('mhci', data.get('class_i', {})),
        ('mhcii', data.get('class_ii', {})),
    )

    # Start from a clean slate when the output directory already exists.
    if output_dir.exists():
        shutil.rmtree(output_dir, ignore_errors=True)

    for folder_name, section in class_sections:
        if not section:
            continue
        class_dir = output_dir / folder_name
        class_dir.mkdir(parents=True, exist_ok=False)
        create_default_filestructure(class_dir)

    # The PHBR tree is needed regardless of which MHC classes are configured.
    phbr_dir = output_dir / 'phbr'
    phbr_dir.mkdir(parents=True, exist_ok=False)
    create_default_filestructure(phbr_dir)

    print('File structure created successfully!')


def restructure_folder_structure_post_prediction(output_dir: Path) -> None:
    """Rearrange the files found in *output_dir* into the standard layout.

    Steps, in order:

    1. Create ``predict-inputs/data`` and ``predict-inputs/params``.
    2. Move every entry of *output_dir* whose name starts with ``tmp`` into
       ``predict-inputs/data``.
    3. Move every ``<digits>.json`` file into ``predict-inputs/params`` and
       rewrite its ``peptide_file_path`` so it points at the same file name
       inside ``predict-inputs/data``.
    4. Create ``predict-outputs`` and ``aggregate``.
    5. Move the sibling ``results`` directory (``output_dir.parent/results``)
       into *output_dir*.

    Args:
        output_dir: Directory to restructure.

    Raises:
        FileExistsError: If any directory created here already exists
            (all ``mkdir`` calls use ``exist_ok=False``).
        KeyError: If a parameter file has no ``peptide_file_path`` entry.
    """
    print('Restructuring folder structure post prediction...')
    print('output_dir: ', output_dir)

    data_path = output_dir / 'predict-inputs' / 'data'
    params_path = output_dir / 'predict-inputs' / 'params'
    data_path.mkdir(parents=True, exist_ok=False)
    params_path.mkdir(parents=True, exist_ok=False)

    # Snapshot each listing before moving anything so the directory is not
    # modified while it is still being iterated. Paths are passed to
    # shutil.move as strings because Path sources are only handled reliably
    # from Python 3.9 onwards.
    for entry in list(output_dir.glob('tmp*')):
        shutil.move(str(entry), str(data_path))

    # Parameter files are named by job number: 1.json, 2.json, ... 10.json.
    # The glob already guarantees the '.json' suffix.
    for entry in list(output_dir.glob('*.json')):
        if entry.stem.isdigit():
            shutil.move(str(entry), str(params_path))

    # Point every parameter file at the relocated peptide file.
    for param_file in params_path.glob('*.json'):
        with open(param_file, 'r') as f:
            params = json.load(f)

        peptide_file_name = Path(params['peptide_file_path']).name
        params['peptide_file_path'] = str(data_path / peptide_file_name)

        with open(param_file, 'w') as f:
            json.dump(params, f, indent=2)

    predict_output_dir = output_dir / 'predict-outputs'
    predict_output_dir.mkdir(parents=True, exist_ok=False)

    aggregate_dir = output_dir / 'aggregate'
    aggregate_dir.mkdir(parents=True, exist_ok=False)

    # Bring the sibling 'results' directory inside output_dir.
    results_dir = output_dir.parent / 'results'
    shutil.move(str(results_dir), str(output_dir))

    print('Folder structure restructured successfully!')



def save_json(result: Dict[str, Any], output_path: Union[str, Path]) -> None:
    """Serialise *result* as 2-space-indented JSON at *output_path*.

    Missing parent directories of *output_path* are created first.
    """
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    with destination.open('w') as handle:
        json.dump(result, handle, indent=2)


def needs_mhc_binding(input_json_data: Dict[str, Any]) -> bool:
    """Tell whether an MHC binding prediction has to run for this input.

    Decision rules, in priority order:

    * ``mhc_peptide_json`` present: an aggregated result was supplied, so no
      binding run is needed.
    * ``mhc_sequence_tsv`` present without ``mhc_peptide_tsv``: binding is
      needed.
    * ``input_neoepitopes`` present: binding is needed.
    * anything else: not needed.
    """
    keys = input_json_data

    # An aggregated result short-circuits everything else.
    if 'mhc_peptide_json' in keys:
        return False

    sequences_without_peptides = (
        'mhc_sequence_tsv' in keys and 'mhc_peptide_tsv' not in keys
    )
    return sequences_without_peptides or 'input_neoepitopes' in keys


def split_mhc_inputs(data: Dict[str, Any]) -> None:
    """Write one class-specific input file per MHC class.

    Two shallow copies of *data* are made: one without ``class_ii`` and
    ``metadata`` (saved as ``mhci-input.json``) and one without ``class_i``
    and ``metadata`` (saved as ``mhcii-input.json``). Both files go to
    ``<output_dir>/phbr/predict-inputs``. *data* itself is not modified.

    Raises:
        KeyError: If ``class_i``, ``class_ii`` or ``metadata`` is missing.
    """
    phbr_inputs_dir = data['metadata']['output_dir'] / 'phbr' / 'predict-inputs'

    # (target file name, key of the *other* class to strip out)
    plan = (
        ('mhci-input.json', 'class_ii'),
        ('mhcii-input.json', 'class_i'),
    )

    # Build both payloads before writing anything, so a missing key fails
    # before any file is produced.
    payloads = []
    for file_name, other_class_key in plan:
        subset = dict(data)  # shallow copy; the caller's dict stays intact
        subset.pop(other_class_key)
        subset.pop('metadata')
        payloads.append((file_name, subset))

    for file_name, subset in payloads:
        save_json(subset, phbr_inputs_dir / file_name)


def create_fasta_string(sequence_table_df: pd.DataFrame) -> str:
    """
    Create a FASTA string from the sequence table.

    Column labels are normalised first: every occurrence of the substring
    'peptide' in a column name is replaced with 'sequence'. For example,
    ['peptide name', 'peptide', 'mutpos'] becomes
    ['sequence name', 'sequence', 'mutpos'].

    NOTE: the renamed columns are assigned back onto ``sequence_table_df``,
    so the caller's DataFrame is modified in place.

    Args:
        sequence_table_df: Table that, after renaming, has a 'sequence'
            column and optionally a 'sequence name' column.

    Returns:
        One ``>name\\nsequence\\n`` record per row. When there is no
        'sequence name' column, records are named ``sequence <index + 1>``
        using the row's index label.
    """
    print("Function 'create_fasta_string' called.")
    print(sequence_table_df)

    header = sequence_table_df.columns.tolist()
    sequence_table_df.columns = [col.replace('peptide', 'sequence') for col in header]

    # Whether a name column exists is the same for every row; check it once.
    has_names = 'sequence name' in sequence_table_df.columns

    records = []
    for index, row in sequence_table_df.iterrows():
        label = row['sequence name'] if has_names else f"sequence {index+1}"
        records.append(f">{label}\n{row['sequence']}\n")

    # Join once instead of growing a string inside the loop.
    fasta_string = "".join(records)

    print("Fasta string: ", fasta_string)
    return fasta_string


def _replace_flag_token(tokens: List[str], flag: str, value: Any) -> None:
    """Replace the first token containing *flag* with ``flag=value``; no-op if absent."""
    for position, token in enumerate(tokens):
        if flag in token:
            tokens[position] = f'{flag}={value}'
            return


def update_job_description_paths(job_description: List[Dict[str, Any]], output_dir: Path) -> List[Dict[str, Any]]:
    """Rewrite the paths inside each job so they point into *output_dir*.

    For every job:

    * the value following ``-j`` becomes ``predict-inputs/params/<job_id>.json``
    * the value following ``-o`` becomes ``predict-outputs/<job_id>``
    * ``expected_outputs`` (when present) becomes
      ``[predict-outputs/<job_id>.json]``

    The last job is then treated as the aggregate job: its
    ``--aggregate-input-dir``, ``--aggregate-result-dir`` and
    ``--job-desc-file`` tokens are pointed at ``results``, ``aggregate`` and
    ``job_descriptions.json`` under *output_dir*, and its
    ``expected_outputs`` becomes ``[aggregate/aggregated_result.json]``.

    The updated list is written to ``<output_dir>/job_descriptions.json``
    after all rewriting is done, so the file matches the returned list.

    Args:
        job_description: Jobs to update. The dicts are modified in place.
            Commands are split on single spaces, so arguments containing
            spaces are not supported.
        output_dir: Root directory the rewritten paths are based on.

    Returns:
        The same ``job_description`` list. An empty list is written out and
        returned unchanged.
    """
    param_dir = output_dir / 'predict-inputs' / 'params'
    predict_output_dir = output_dir / 'predict-outputs'
    result_dir = output_dir / 'results'
    aggregate_dir = output_dir / 'aggregate'
    local_jd_file = output_dir / 'job_descriptions.json'

    for job in job_description:
        job_id = job['job_id']
        job_cmd = job['shell_cmd'].split(' ')

        # The token right after '-j' is the job's parameter file.
        if '-j' in job_cmd:
            job_cmd[job_cmd.index('-j') + 1] = str(param_dir / f'{job_id}.json')

        # The token right after '-o' is the job's output prefix.
        if '-o' in job_cmd:
            job_cmd[job_cmd.index('-o') + 1] = str(predict_output_dir / f'{job_id}')

        if 'expected_outputs' in job:
            job['expected_outputs'] = [str(predict_output_dir / f'{job_id}.json')]

        job['shell_cmd'] = ' '.join(job_cmd)

    # The last entry is handled as the aggregate job; skip when there are none.
    if job_description:
        aggregate_job = job_description[-1]
        aggregate_cmd = aggregate_job['shell_cmd'].split(' ')

        _replace_flag_token(aggregate_cmd, '--aggregate-input-dir', result_dir)
        _replace_flag_token(aggregate_cmd, '--aggregate-result-dir', aggregate_dir)
        _replace_flag_token(aggregate_cmd, '--job-desc-file', local_jd_file)

        aggregate_job['shell_cmd'] = ' '.join(aggregate_cmd)
        aggregate_job['expected_outputs'] = [str(aggregate_dir / 'aggregated_result.json')]

    # Write the file last so it carries the final aggregate command and
    # expected output rather than the intermediate values.
    with open(local_jd_file, 'w') as f:
        json.dump(job_description, f, indent=2)

    return job_description



def run_mhc_binding(input_data: Dict[str, Any], class_type: TCellClass) -> Path:
    """Run the MHC class I or class II binding tool in split mode.

    The function reads the sequence table, validates alleles and sequences
    through the ``validators`` module, writes the binding payload to a
    temporary JSON file and invokes ``tcell_mhci.py`` / ``tcell_mhcii.py``
    with ``--split`` through bash.

    Args:
        input_data: Parsed input plus a ``metadata`` dict. Keys read here:
            top-level ``mhc_sequence_tsv``; ``metadata['root_path']``,
            ``metadata['subcommand']`` and (for ``preprocess``)
            ``metadata['output_dir']``; and, from the ``class_i`` or
            ``class_ii`` section matching *class_type*, ``alleles``,
            ``prediction_method['method']`` and an optional
            ``mhc_sequence_tsv`` override.
        class_type: Which MHC class to run.

    Returns:
        For the ``preprocess`` subcommand, ``output_dir / 'job_descriptions.json'``.
        Otherwise the split directory (inside a fresh temporary directory),
        after every job listed in the ``job_descriptions.json`` next to it
        has been executed.

    Raises:
        ValueError: If top-level ``mhc_sequence_tsv`` is missing, or if the
            validated sequence table is empty.
        subprocess.CalledProcessError: If any invoked command exits non-zero
            (all calls use ``check=True``).
    """
    cmd = alleles = method = sequence_file = None
    sequence_table_df = None
    MHCI_DEFAULT_PEPTIDE_LENGTH_RANGE = [8, 11]
    MHCII_DEFAULT_PEPTIDE_LENGTH_RANGE = [15, 15]

    # NOTE(review): the top-level key is required even when the class section
    # supplies its own 'mhc_sequence_tsv' override below.
    if 'mhc_sequence_tsv' not in input_data:
        raise ValueError("No sequence table found in the input JSON file.")

    root_path = input_data['metadata']['root_path']
    sequence_file = input_data['mhc_sequence_tsv']
    

    # NOTE(review): this message is printed for class II runs as well.
    print("MHCI Binding is running...")
    
    if class_type.value == 'i':
        print('Running MHCI....')
        # NOTE: If the input is a combined MHCI and MHCII input, then we need to get the MHCI sequence table from the input_data
        if 'mhc_sequence_tsv' in input_data['class_i']:
            sequence_file = input_data['class_i']['mhc_sequence_tsv']
        
        sequence_table_df = pd.read_csv(sequence_file, sep='\t')
        # create_fasta_string also renames 'peptide' columns to 'sequence' on
        # sequence_table_df itself, which the validation further down sees.
        fasta_string = create_fasta_string(sequence_table_df)
        
        # 'alleles' is a dict whose values are comma-separated allele strings
        alleles = input_data['class_i']['alleles']
        method = input_data['class_i']['prediction_method']['method']


        alleles_list = list(alleles.values())
        
        # Split each allele string by comma and flatten the list
        flattened_alleles = []
        for allele_group in alleles_list:
            flattened_alleles.extend(allele_group.split(','))
        
        print('flattened_alleles: ', flattened_alleles)

        valid_alleles, invalid_alleles = validators.validate_alleles(flattened_alleles, class_type=class_type, method=method)
        # Only the valid alleles are forwarded, as one comma-separated string
        alleles = ','.join(valid_alleles)

        print('valid_alleles: ', valid_alleles)
        print('invalid_alleles: ', invalid_alleles)

        # Invalid alleles are reported as a warning rather than an error
        if invalid_alleles:
            warning_msg = "The following alleles are invalid and will be excluded from predictions:\n"
            warning_msg += "\n".join(f"  - {allele}" for allele in invalid_alleles)
            warnings.warn(warning_msg, UserWarning)

        peptide_length_range = MHCI_DEFAULT_PEPTIDE_LENGTH_RANGE


    if class_type.value == 'ii':
        print('Running MHCII....')
        # NOTE: If the input is a combined MHCI and MHCII input, then we need to get the MHCII sequence table from the input_data
        if 'mhc_sequence_tsv' in input_data['class_ii']:
            sequence_file = input_data['class_ii']['mhc_sequence_tsv']
        
        sequence_table_df = pd.read_csv(sequence_file, sep='\t')
        # create_fasta_string also renames 'peptide' columns to 'sequence' on
        # sequence_table_df itself, which the validation further down sees.
        fasta_string = create_fasta_string(sequence_table_df)
        
        # DRB alleles are used as-is; the alpha/beta loci are paired below
        alleles = input_data['class_ii']['alleles']
        alleles_list = alleles.get('DRB', '').split(',') if alleles.get('DRB') else []
        dpa_alleles = alleles.get('DPA', '').split(',') if alleles.get('DPA') else []
        dpb_alleles = alleles.get('DPB', '').split(',') if alleles.get('DPB') else []
        dqa_alleles = alleles.get('DQA', '').split(',') if alleles.get('DQA') else []
        dqb_alleles = alleles.get('DQB', '').split(',') if alleles.get('DQB') else []

        # Create every DPA/DPB combination, written as "<DPA>/<DPB>"
        for dpa_allele, dpb_allele in product(dpa_alleles, dpb_alleles):
            if dpb_allele.startswith('HLA'):
                # Remove the HLA prefix from the second half of the pair
                dpb_allele = dpb_allele.replace('HLA-', '')
            
            paired_allele = f"{dpa_allele}/{dpb_allele}"

            # Skip pairs that are already listed
            if paired_allele not in alleles_list:
                alleles_list.append(paired_allele)

        # Create every DQA/DQB combination, written as "<DQA>/<DQB>"
        for dqa_allele, dqb_allele in product(dqa_alleles, dqb_alleles):
            if dqb_allele.startswith('HLA'):
                # Remove the HLA prefix from the second half of the pair
                dqb_allele = dqb_allele.replace('HLA-', '')
            
            # NOTE: If the allele is not in the list, then add it
            paired_allele = f"{dqa_allele}/{dqb_allele}"
            if paired_allele not in alleles_list:
                alleles_list.append(paired_allele)

        # Making sure there are no empty strings in the list
        alleles_list = list(filter(None, alleles_list))

        method = input_data['class_ii']['prediction_method']['method']

        # Validate alleles
        valid_alleles, invalid_alleles = validators.validate_alleles(alleles_list, class_type=class_type, method=method)
        alleles = ','.join(valid_alleles)

        # Invalid alleles are reported as a warning rather than an error
        if invalid_alleles:
            warning_msg = "The following alleles are invalid and will be excluded from predictions:\n"
            warning_msg += "\n".join(f"  - {allele}" for allele in invalid_alleles)
            warnings.warn(warning_msg, UserWarning)

        peptide_length_range = MHCII_DEFAULT_PEPTIDE_LENGTH_RANGE


    # Validate the sequence table against the class's peptide length range.
    # NOTE(review): the result is only used for the emptiness check below;
    # fasta_string was built from the table before validation. Confirm that
    # sending unvalidated rows to the tool is intended.
    valid_sequence_df = validators.validate_sequence_table(sequence_table_df, peptide_length_range)
    print(f"Sequence table after validation for MHC class {class_type.value}:")
    
    # if valid_sequence_df is empty, then raise an error
    if valid_sequence_df.empty:
        raise ValueError(f"No valid sequences that meets the length range requirements found in the sequence table for MHC class {class_type.value.upper()}.")


    # Create mhc_binding_payload
    mhc_binding_payload = {
        'input_sequence_text': fasta_string,
        'alleles': alleles,
        'predictors': [
            {
                'type': 'binding',
                'method': method
            }
        ],
        'peptide_length_range': peptide_length_range
    }


    # Temporary file that contains the mhc_binding_payload. It is created
    # with delete=False and is not removed anywhere in this function.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
        json.dump(mhc_binding_payload, temp_file, indent=4)


    if class_type.value == 'i':
        # NOTE: The split directory depends on the subcommand.
        # - 'preprocess' takes the output directory as a parameter, so the
        #   split goes into <output_dir>/mhci.
        # - Any other subcommand (e.g. 'predict') has no output directory,
        #   so a temporary directory is created for the result.
        # NOTE(review): jd_file is assigned in both branches but never read.
        if input_data['metadata']['subcommand'] == 'preprocess':
            split_dir = input_data['metadata']['output_dir'] / 'mhci'
            jd_file = split_dir.parent / 'job_descriptions.json'
        else:
            # Create temporary directory output where mhci/mhcii result will be stored
            tmpdir = tempfile.mkdtemp(prefix=f"mhc{class_type.value}_output_", suffix="_for_phbr")
            
            # Create 'mhci' folder inside the tmpdir
            split_dir = f'{tmpdir}/mhci'
            os.makedirs(split_dir, exist_ok=False)
            jd_file = Path(tmpdir) / 'job_descriptions.json'
        
        # Two-line bash script: source the class I environment, then split
        cmd = textwrap.dedent(f"""\
            source {root_path}/setup_mhci_env.sh
            python3 $TCELL_CLASS_I_PATH/src/tcell_mhci.py -j {temp_file.name} --split --split-dir={split_dir} --keep-empty-row
        """)

    if class_type.value == 'ii':
        # NOTE: The split directory depends on the subcommand.
        # - 'preprocess' takes the output directory as a parameter, so the
        #   split goes into <output_dir>/mhcii.
        # - Any other subcommand (e.g. 'predict') has no output directory,
        #   so a temporary directory is created for the result.
        # NOTE(review): jd_file is assigned in both branches but never read.
        if input_data['metadata']['subcommand'] == 'preprocess':
            split_dir = input_data['metadata']['output_dir'] / 'mhcii'
            jd_file = split_dir.parent / 'job_descriptions.json'
        else:
            # Create temporary directory output where mhci/mhcii result will be stored
            tmpdir = tempfile.mkdtemp(prefix=f"mhc{class_type.value}_output_", suffix="_for_phbr")
            
            # Create 'mhcii' folder inside the tmpdir
            split_dir = f'{tmpdir}/mhcii'
            os.makedirs(split_dir, exist_ok=False)
            jd_file = Path(tmpdir) / 'job_descriptions.json'

        # Two-line bash script: source the class II environment, then split
        cmd = textwrap.dedent(f"""\
            source {root_path}/setup_mhcii_env.sh
            python3 $TCELL_CLASS_II_PATH/src/tcell_mhcii.py -j {temp_file.name} --split --split-dir={split_dir} --keep-empty-row
        """)


    print("==================^^^^==================")
    print(mhc_binding_payload)
    print("==================^^^^==================")


    print('cmd: ', cmd)
    
    # Run through bash because the script relies on 'source'
    subprocess.run(cmd, shell=True, executable="/bin/bash", check=True)
    
    # For 'preprocess' only the split is performed; hand back the job
    # description file located directly under the output directory.
    if input_data['metadata']['subcommand'] == 'preprocess':
        return input_data['metadata']['output_dir'] / 'job_descriptions.json'

    

    # The first line of 'cmd' is the 'source ...' environment setup
    setup_cmd = cmd.strip().splitlines()[0]

    # The job description file sits next to the split directory
    job_description_path = Path(split_dir).parent / "job_descriptions.json"

    with open(job_description_path, 'r') as f:
        jd_content = json.load(f)

    # Run every listed job in order, sourcing the environment for each one
    for i, job in enumerate(jd_content):
        sh_cmd = ' '.join([setup_cmd, '&&', 'python3', job['shell_cmd']])
        print(f'JOB {i} >> {sh_cmd}')
        subprocess.run(sh_cmd, shell=True, executable="/bin/bash", check=True)

    return Path(split_dir)

def _neoepitopes_to_sequence_tsv(neoepitopes_text: str) -> str:
    """Convert inline neoepitope TSV text into a temporary sequence table file and return its path."""
    df = pd.read_csv(StringIO(neoepitopes_text), sep='\t')
    # Any column whose name mentions 'peptide' becomes the 'sequence' column.
    df = df.rename(columns=lambda x: 'sequence' if 'peptide' in x.lower() else x)
    df.insert(0, 'seq #', range(1, len(df) + 1))

    # delete=False: the file has to outlive this function.
    with tempfile.NamedTemporaryFile(mode='w+', suffix='.tsv', delete=False) as tmp_file:
        df.to_csv(tmp_file.name, index=False, sep='\t')
        print(f"Temporary file created at: {tmp_file.name}")

    return tmp_file.name


def _phbr_predict_job(
    root_path: Any,
    input_file: Any,
    output_dir: Path,
    result_dir_phbr: Path,
    job_id: int,
    depends_on: List[int],
) -> Dict[str, Any]:
    """Build the job entry for one ``run_phbr.py predict`` invocation."""
    resolved_results = result_dir_phbr.resolve()
    return {
        'shell_cmd': f"{root_path}/src/run_phbr.py predict -j {input_file} -d {output_dir} -o {resolved_results}/{job_id} -f json",
        'job_id': job_id,
        'job_type': 'predict',
        'depends_on_job_ids': depends_on,
        'expected_outputs': [
            f'{resolved_results}/{job_id}.json'
        ]
    }


def _write_phbr_final_input(
    class_key: str,
    aggregated_result: Any,
    sequence_tsv: Any,
    input_file: Any,
    destination: Path,
) -> None:
    """Write the PHBR input JSON that combines an aggregated binding result with one class section."""
    # Use a context manager so the input file handle is closed again.
    with open(input_file) as handle:
        class_section = json.load(handle)[class_key]

    phbr_input = {
        "mhc_peptide_json": aggregated_result,
        "mhc_sequence_tsv": sequence_tsv,
        class_key: class_section,
    }

    # Round-trip through JSON so Path objects end up as plain strings.
    serializable_data = json.loads(json.dumps(phbr_input, default=str))

    print("++++++++++++++++++++++++++++++++++++")
    print(json.dumps(serializable_data, indent=2))
    print("++++++++++++++++++++++++++++++++++++")

    with open(destination, 'w') as f:
        json.dump(serializable_data, f, indent=2)
        print(f"Created file with phbr_input at: {destination}")


def run(parser: Any, **kwargs: Any) -> None:
    """Build the output tree and the master job description for a PHBR run.

    Job ordering:

    - MHCI jobs come before MHCII jobs; the two are independent of each other.
    - Each class's PHBR predict job follows that class's binding jobs.
    - PHBR postprocessing is always the last job.

    Args:
        parser: Object exposing ``PROJECT_ROOT_PATH``.
        **kwargs: Reads ``input_json`` (an open file object; its ``name`` is
            used when the input is not split), ``subcommand`` and
            ``output_dir`` (used with ``/``, so presumably a ``Path`` -
            verify against the caller).

    Side effects:
        Clears and recreates ``output_dir``, may run the MHC binding tools,
        and writes ``<output_dir>/job_descriptions.json``.

    NOTE(review): when the input has neither ``class_i`` nor ``class_ii``
    no job is created and the final ``job_description[-1]`` lookup raises
    ``IndexError`` after the output directory has already been reset.
    """
    print("Running PREPROCESS2 FILE")
    input_file = kwargs.get('input_json')
    data = json.load(input_file)
    data['metadata'] = {
        'subcommand': kwargs.get('subcommand'),
        'root_path': parser.PROJECT_ROOT_PATH,
        'output_dir': kwargs.get('output_dir'),
    }

    # 1. Reorganize file structure
    reorganize_file_structure(data)

    # 2. Create job descriptions
    output_dir = data['metadata']['output_dir']
    root_path = data['metadata']['root_path']
    phbr_inputs_dir = output_dir / 'phbr' / 'predict-inputs'
    result_dir_phbr = output_dir / 'phbr' / 'predict-outputs'

    job_description: List[Dict[str, Any]] = []
    curr_job_id = 0
    split_input_files = {
        'mhci': None,
        'mhcii': None,
    }

    # Combined case: the combined file is not a valid input for a single
    # class, so it is broken into one file per class.
    if data.get('class_i', {}) and data.get('class_ii', {}):
        split_mhc_inputs(data)
        print('Split MHCI and MHCII inputs successfully!')

        split_input_files['mhci'] = phbr_inputs_dir / 'mhci-input.json'
        split_input_files['mhcii'] = phbr_inputs_dir / 'mhcii-input.json'

    # ---------------------------------------------------------------- MHCI
    if data.get('class_i', {}):
        print("MHC-I binding is needed")
        result_dir_mhci = output_dir / 'mhci' / 'predict-outputs'

        # Use the split file when the input was combined, else the original.
        class_input = split_input_files['mhci'] or input_file.name

        if not needs_mhc_binding(data):
            # CASE 1: the user already has the binding data, so PHBR runs
            # directly. This is always exactly one job.
            print('binding not needed...')
            job_description.append(
                _phbr_predict_job(root_path, class_input, output_dir, result_dir_phbr, curr_job_id, [])
            )

        else:
            print('mhci binding needed...')

            # NOTE: Turn "input_neoepitopes" into "mhc_sequence_tsv"
            if 'input_neoepitopes' in data:
                data['mhc_sequence_tsv'] = _neoepitopes_to_sequence_tsv(data['input_neoepitopes'])

            # Run MHCI binding
            mhci_jd_file = run_mhc_binding(data, TCellClass.I)
            print('mhci_jd_file: ', mhci_jd_file)

            # NOTE: The T cell tool lays out its own folder structure, so it
            # is changed back to the structure expected here.
            restructure_folder_structure_post_prediction(result_dir_mhci.parent)

            with open(mhci_jd_file, 'r') as f:
                job_description = json.load(f)

            # Update paths for job_description
            job_description = update_job_description_paths(job_description, result_dir_mhci.parent)

            mhci_agg_result_output = job_description[-1]['expected_outputs']
            mhci_agg_job_id = job_description[-1]['job_id']
            mhci_job_id = mhci_agg_job_id + 1

            print("--------------------------------")
            print('mhci_agg_result_output: ', mhci_agg_result_output)
            print(type(mhci_agg_result_output[0]))
            print('mhci_agg_job_id: ', mhci_agg_job_id)
            print('mhci_job_id: ', mhci_job_id)
            print('mhci_job_description: ', mhci_jd_file)
            print("--------------------------------")

            # The PHBR input lives in the user specified output directory.
            phbr_mhci_input_file = phbr_inputs_dir / 'mhci-final-input.json'
            _write_phbr_final_input(
                'class_i',
                mhci_agg_result_output[0],
                data['mhc_sequence_tsv'],
                class_input,
                phbr_mhci_input_file,
            )

            # The PHBR job depends on the aggregate job right before it.
            job_description.append(
                _phbr_predict_job(
                    root_path, phbr_mhci_input_file, output_dir, result_dir_phbr,
                    mhci_job_id, [mhci_job_id - 1],
                )
            )

    # --------------------------------------------------------------- MHCII
    if data.get('class_ii', {}):
        print("MHC-II binding is needed")
        result_dir_mhcii = output_dir / 'mhcii' / 'predict-outputs'

        # NOTE: MHCII comes after MHCI, so numbering continues after any
        # jobs that were already added.
        curr_job_id = job_description[-1]['job_id'] + 1 if job_description else 0

        # Use the split file when the input was combined, else the original.
        class_input = split_input_files['mhcii'] or input_file.name

        if not needs_mhc_binding(data):
            print('binding not needed...')
            job_description.append(
                _phbr_predict_job(root_path, class_input, output_dir, result_dir_phbr, curr_job_id, [])
            )

        else:
            print('mhcii binding needed...')

            # NOTE: Turn "input_neoepitopes" into "mhc_sequence_tsv"
            if 'input_neoepitopes' in data:
                data['mhc_sequence_tsv'] = _neoepitopes_to_sequence_tsv(data['input_neoepitopes'])

            # Run MHCII binding
            mhcii_jd_file = run_mhc_binding(data, TCellClass.II)
            print('mhcii_jd_file: ', mhcii_jd_file)

            # NOTE: The T cell tool lays out its own folder structure, so it
            # is changed back to the structure expected here.
            restructure_folder_structure_post_prediction(result_dir_mhcii.parent)

            with open(mhcii_jd_file, 'r') as f:
                mhcii_job_description = json.load(f)

            # Update paths for job_description
            mhcii_job_description = update_job_description_paths(mhcii_job_description, result_dir_mhcii.parent)

            # Add all MHCII jobs to the master job description, renumbered
            # to continue from where MHCI left off.
            for job in mhcii_job_description:
                job['job_id'] = curr_job_id
                if job['job_type'] == 'aggregate':
                    # The aggregate job depends on every MHCII job before it.
                    job_start = curr_job_id - len(mhcii_job_description) + 1
                    job['depends_on_job_ids'] = list(range(job_start, curr_job_id))

                job_description.append(job)
                curr_job_id += 1

            print(mhcii_job_description)
            mhcii_agg_result_output = mhcii_job_description[-1]['expected_outputs']
            mhcii_agg_job_id = mhcii_job_description[-1]['job_id']
            curr_job_id = mhcii_agg_job_id + 1
            print(mhcii_agg_result_output)

            print("--------------------------------")
            print('mhcii_agg_result_output: ', mhcii_agg_result_output)
            print('mhcii_agg_job_id: ', mhcii_agg_job_id)
            print('curr_job_id: ', curr_job_id)
            print("--------------------------------")

            phbr_mhcii_input_file = phbr_inputs_dir / 'mhcii-final-input.json'
            _write_phbr_final_input(
                'class_ii',
                mhcii_agg_result_output[0],
                data['mhc_sequence_tsv'],
                class_input,
                phbr_mhcii_input_file,
            )

            # The PHBR job depends on the aggregate job right before it.
            job_description.append(
                _phbr_predict_job(
                    root_path, phbr_mhcii_input_file, output_dir, result_dir_phbr,
                    curr_job_id, [curr_job_id - 1],
                )
            )

    # NOTE: PHBR postprocessing is the last step
    # - It will handle any PHBR result aggregation
    # - PHBR result will be reformatted
    curr_job_id = job_description[-1]['job_id'] + 1

    for job in job_description:
        pprint(job)

    # Resolve once so '-o' and 'expected_outputs' name the same location
    # regardless of the working directory the job is later started from.
    phbr_root = result_dir_phbr.parent.resolve()

    postprocessing_job: Dict[str, Any] = {
        'shell_cmd': f'{root_path}/src/run_phbr.py postprocess --job-desc-file {output_dir}/job_descriptions.json --postprocessed-results-dir {result_dir_phbr} -o {phbr_root}/results/{curr_job_id} -f json',
        'job_id': curr_job_id,
        'job_type': 'postprocess',
        # depends on the last job in the job_description
        'depends_on_job_ids': [job_description[-1]['job_id']],
        'expected_outputs': [
            f'{phbr_root}/results/{curr_job_id}.json'
        ]
    }
    job_description.append(postprocessing_job)

    for job in job_description:
        pprint(job)

    # Save the master job description to a file
    save_json(job_description, output_dir / 'job_descriptions.json')

    print('Job descriptions created successfully at: ', output_dir / 'job_descriptions.json')