# This is the script used to convert the tepitope percentile files into a single
#   pickle file.
# TODO: Remove this after tepitope percentile data is removed from
#    'djangotools/tool_data/MHCII/tepitope'
import ast
import cPickle
import glob
import json
import os
import re



# Path of the pickle file that main() writes the combined percentile data to
# (relative to the current working directory).
OUTPUT_FILE = './tepitope_percentile_distribution_2018-11-21.p'

def read_file_as_json(input_file):
    """Parse the JSON document stored at ``input_file`` and return it.

    :param input_file: path of the JSON file to read.
    :return: the decoded content, exactly as produced by ``json.load``.
    """
    with open(input_file, 'r') as json_handle:
        parsed = json.load(json_handle)
    return parsed

def read_percentile_from_json(length):
    """Load the JSON dump stored for the given ``length``.

    The file read is ``./mhcii_tepitepe_output/length_<length>.json``
    (relative to the current working directory; the directory name is
    spelled exactly as it exists on disk).

    :param length: integer used to build the file name.
    :return: the decoded JSON content of that file.
    """
    return read_file_as_json(
        './mhcii_tepitepe_output/length_%d.json' % length)

def get_scores_sample(scores):
    """Return a thinned sample of ``scores`` in descending order.

    The scores are ranked from highest to lowest (0-based rank) and then
    sampled with decreasing density:

      * ranks 0-99: every score is kept;
      * ranks 100-999: every 10th score is kept (ranks 100, 110, ..., 990);
      * ranks >= 1000: every 100th score is kept (ranks 1000, 1100, ...).

    For an input of 10000 scores this yields 100 + 90 + 90 = 280 values.

    The input is left untouched: a sorted copy is sampled, so the caller's
    sequence keeps its original order.

    :param scores: iterable of mutually comparable scores.
    :return: a new list holding the sampled scores, highest first.
    """
    # sorted() builds a new list instead of reordering the argument in place.
    ordered = sorted(scores, reverse=True)
    # The three slices select exactly the ranks described in the docstring.
    return ordered[:100] + ordered[100:1000:10] + ordered[1000::100]

def get_dataset_list_280(dataset_10000):
    """
    Group the scores of ``dataset_10000`` by (allele, length) and reduce each
    group to a sample with get_scores_sample().

    [input format] a dict like:
        {
        "('LSLSVEDLRVPPTKRELLKRTNEEINLTQ', u'DRB1*11:28', 29)": [
            [
                "LRVPPTKRE",
                0.3000000000000003
            ]
        ],
        }
    Each key is the repr of a tuple; its second and third elements are used
    for grouping. From each value only the second item of the first entry
    (the score) is used.
    -----------------------------
    [output format] a dict like:
        {
        ('tepitope', 'DRB1*11:28', 29): [0.3000000000000003, ...],
        }
    Keys are tuples ('tepitope', <2nd key element as str>, <3rd key element>);
    values are the sampled scores returned by get_scores_sample().
    """
    # Collect every score under its ('tepitope', allele, length) key.
    scores_dict = {}

    for raw_key, predictions in dataset_10000.items():
        score = predictions[0][1]
        # The key is a Python tuple literal (possibly with u'' prefixes), so
        # parse it as one. literal_eval only accepts literals, and unlike
        # character replacement it cannot corrupt the strings inside the key.
        parsed_key = ast.literal_eval(str(raw_key))
        key = ('tepitope', str(parsed_key[1]), parsed_key[2])
        scores_dict.setdefault(key, []).append(score)

    # Reduce each group of scores to its sample.
    results = {}
    for key, scores in scores_dict.items():
        # Reports how many scores were collected for this key. A single
        # parenthesised argument prints the same under Python 2 and 3.
        print(len(scores))
        results[key] = get_scores_sample(scores)

    return results

def write_result_in_pickle_file(percentiles, output_pickle_file):
    """Serialize ``percentiles`` into ``output_pickle_file`` using cPickle.

    Any existing file at that path is overwritten.

    :param percentiles: the object to pickle.
    :param output_pickle_file: path of the file to create.
    """
    # Pickle data is a byte stream, so the file is opened in binary mode;
    # text mode can alter the bytes through newline translation.
    with open(output_pickle_file, 'wb') as pickle_file:
        cPickle.dump(percentiles, pickle_file)

def main():
    """Merge the sampled scores for lengths 11 through 30 into one pickle.

    For each length the matching JSON file is loaded, reduced with
    get_dataset_list_280(), and merged into a single dict, which is then
    written to OUTPUT_FILE.
    """
    combined = {}
    # range(11, 31) covers lengths 11..30 inclusive.
    for peptide_length in range(11, 31):
        raw_scores = read_percentile_from_json(peptide_length)
        combined.update(get_dataset_list_280(raw_scores))
    write_result_in_pickle_file(combined, OUTPUT_FILE)


# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
