'''
Created on May 3, 2016

@author: jivan
TODO: If this file is still here after 2016-09, delete it.

This was created to fix the format of allele names in the original percentile data pickle
file to reflect standard nomenclature at http://www.ebi.ac.uk/ipd/mhc/.
It should be removed once the new percentile data is being used in place of the old.
'''
import pickle
import re
# 'valid_allele_names' is part of the 'iedbtools-utilities' package.
from valid_allele_names import allele_name_list

def fix_allele_name(allele):
    '''
    Return 'allele' rewritten by this script's per-species renaming rules.

    The species is picked by substring test, in this order: 'Mamu', 'BoLA',
    'SLA', 'HLA', 'Patr'.  Names containing none of these are returned
    unchanged.  The result is NOT validated here; the caller checks it
    against the list of valid allele names.

    :param allele: allele name as stored in the original pickle keys.
    :return: the rewritten allele name.
    '''
    if 'Mamu' in allele:
        starred = allele.replace('_', '*')
        # Some names have left out the colon, add it back in: after the '*',
        # one arbitrary character plus one or two digits form the first
        # field, the next two digits the second.  Anything after those two
        # digits is dropped (the trailing '.*' is not part of the replacement).
        return re.sub(r'(.*?\*)(.\d{1,2})(\d{2}).*', r'\1\2:\3', starred)
    if 'BoLA' in allele:
        # NOTE(review): the original also applied
        #   re.sub(r'(\*)(\d\d)(\d\d)', r'\1\2\3', ...)
        # here, which replaces each match with itself and so never changed
        # anything.  If the intent was to insert a colon (as in the other
        # branches), confirm against the valid-name list before adding it.
        return allele.replace('_', '*')
    if 'SLA' in allele:
        # Put a '*' after 'SLA-<digit>', consuming an underscore if present.
        starred = re.sub(r'(SLA-\d)_?(.*)', r'\1*\2', allele)
        # Restore colon if absent
        return re.sub(r'(SLA-\d\*)(\d{2})(\d{2})', r'\1\2:\3', starred)
    if 'HLA' in allele:
        # Insert a '*' after the single word character following 'HLA-'.
        return re.sub(r'(HLA-\w)', r'\1*', allele)
    if 'Patr' in allele:
        # Insert both the '*' and the colon: 'Patr-Xdddd' -> 'Patr-X*dd:dd'.
        return re.sub(r'(Patr-\w)(\d\d)(\d\d)', r'\1*\2:\3', allele)
    return allele


def main():
    '''
    Re-key the original percentile data with corrected allele names.

    Reads 'distribution_netmhccons_bin.cpickle', rewrites the allele in each
    (method, allele, binding_length) key with fix_allele_name(), keeps only
    the entries whose rewritten allele is in 'allele_name_list', prints a
    summary including every rejected name, and pickles the kept entries to
    'netmhccons_percentile_distribution_2018-06-14.p'.
    '''
    # Pickle data is binary: the file must be opened with 'rb' (required on
    # Python 3, and avoids newline translation on Windows under Python 2).
    # NOTE(review): pickle.load executes whatever the file describes; only
    # run this against a trusted data file.  If the pickle was written by
    # Python 2 and is loaded on Python 3, an 'encoding' argument may also be
    # needed -- confirm with the actual data file.
    with open('distribution_netmhccons_bin.cpickle', 'rb') as pf:
        d_original = pickle.load(pf)

    # Build the lookup set once instead of scanning the list for every key.
    valid_names = frozenset(allele_name_list)

    d_fixed = {}
    invalid_names = []
    for (method, allele, binding_length), value in d_original.items():
        fixed_allele = fix_allele_name(allele)
        if fixed_allele in valid_names:
            d_fixed[(method, fixed_allele, binding_length)] = value
        else:
            # One entry per rejected key, so a name may be listed repeatedly.
            invalid_names.append(fixed_allele)

    print('{} Total allele names'.format(len(d_original)))
    print('{} Invalid allele names'.format(len(invalid_names)))
    print('\n'.join(invalid_names))
    # 'wb' for the same reason as 'rb' above: pickle.dump writes bytes.
    with open('netmhccons_percentile_distribution_2018-06-14.p', 'wb') as pf:
        pickle.dump(d_fixed, pf)


if __name__ == '__main__':
    main()
