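"""Generate task-decomposition principle prompts.

Reads paper metadata from a JSON dataset, groups papers by problem type and
subtask count, samples representative subtask lists, and asks an LLM (via
DECOMPOSE_PRINCIPLE_PROMPT) to distill a decomposition principle for each
(problem_type, tasknum) group, writing the results to a JSON file.
"""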
import random
from collections import Counter, defaultdict  # Counter is used by the disabled Step 5 block

from llm.llm import LLM
from prompt.template import DECOMPOSE_PRINCIPLE_PROMPT
from utils.utils import read_json_file, write_json_file
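# Expected input record shape, inferred from the field accesses below
# (an assumption, not a documented schema):
# {"paper": "<year>_<type>/<name>",
#  "info": {"tasks": [{"task_description": "..."}, ...], ...}}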


def read_problem_papers(problem_name):
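    """Return the 'info' record of every paper whose identifier starts with problem_name."""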
    paper_dict = read_json_file('../data/paper_info_dataset.json')['data']
    papers = []
    for paper in paper_dict:
        if paper['paper'].startswith(problem_name):
            papers.append(paper['info'])
    return papers


def generate_decompose_prompt(data):
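    """Group papers by (problem_type, tasknum), sample up to 6 per group, and
    query the LLM for a decomposition principle for each group.

    Returns a dict mapping problem_type -> {tasknum: LLM answer}.
    """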
    # llm = LLM('deepseek-reasoner')
    llm = LLM('chatgpt-4o-latest')
    # Step 1: Keep papers from 2014 onward (lexicographic comparison on the '<year>_<type>' prefix)
    filtered_papers = [paper for paper in data if paper['paper'].split('/')[0] >= '2014']

    # Step 2: Group papers by full problem identifier (e.g. '2015_A');
    # grouping by the letter type alone happens in Step 3
    problem_papers = defaultdict(list)

    # Collect each filtered paper's info under its problem identifier
    for paper in filtered_papers:
        problem = paper['paper'].split('/')[0]
        problem_papers[problem].append(paper['info'])
    
    # Keep at most 3 papers per problem (randomly sample when there are more):
    for problem, papers in problem_papers.items():
        if len(papers) > 3:
            problem_papers[problem] = random.sample(papers, 3)
        else:
            problem_papers[problem] = papers

    # Step 3: Group papers by problem type (second part of the problem identifier)
    problem_type_papers = defaultdict(list)
    for problem, papers in problem_papers.items():
        problem_type = problem.split('_')[1]
        problem_type_papers[problem_type] += papers
    
    # Step 4: Group by tasknum (problem_type, len(tasks))
    tasknum_papers = defaultdict(list)
    for problem_type, papers in problem_type_papers.items():
        for paper in papers:
            tasknum_papers[(problem_type, len(paper['tasks']))].append(paper)
    
    # Step 5 (frequency-based filtering, commented out below) is disabled;
    # keep every (problem_type, tasknum) group as-is
    filtered_tasknum_papers = tasknum_papers

    # # Step 5: Keep only the top 3 most frequent tasknums for each problem_type
    # filtered_tasknum_papers = defaultdict(list)
    # for problem_type, papers in problem_type_papers.items():
    #     # Count the frequencies of tasknum within this problem_type
    #     tasknum_counts = Counter(len(paper['tasks']) for paper in papers)
    #     # Get the three most frequent tasknums
    #     most_common_tasknums = [tasknum for tasknum, _ in tasknum_counts.most_common(3)]
    #     print(problem_type, most_common_tasknums)
    #     # Keep only the papers with the top 3 frequent tasknums
    #     for paper in papers:
    #         if len(paper['tasks']) in most_common_tasknums:
    #             filtered_tasknum_papers[(problem_type, len(paper['tasks']))].append(paper)

    result = defaultdict(dict)
    for (problem_type, tasknum), papers in filtered_tasknum_papers.items():
        if tasknum not in [3, 4, 5] or problem_type not in ['A', 'B', 'C', 'D', 'E', 'F']:
            continue
        # if tasknum not in [4] or problem_type not in ['C']:
        #     continue
        print(f"Problem Type: {problem_type}, Task Number: {tasknum}, size: {len(papers)}")
        selected_papers = random.sample(papers, min(len(papers), 6))
        examples = '---'.join(task_decompose(paper) for paper in selected_papers)
        prompt = DECOMPOSE_PRINCIPLE_PROMPT.format(examples=examples, tasknum=tasknum)
        answer = llm.generate(prompt)
        result[problem_type][tasknum] = answer  # tasknum is already an int

    return result


def task_decompose(paper):
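    """Render a paper's subtasks as numbered '- Subtask i: <description>' lines."""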
    return '\n'.join(f"- Subtask {i}: {task['task_description']}" for i, task in enumerate(paper['tasks'], start=1))


if __name__ == "__main__":
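    # Note: random.sample is unseeded, so the sampled examples (and generated
    # prompts) differ across runs; call random.seed(<int>) here for reproducibility.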
    data = read_json_file('../data/actor_data/input/paper_info_dataset.json')
    result = generate_decompose_prompt(data['data'])
    write_json_file('../data/actor_data/input/decompose_prompt.json', result)