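"""Generate task-decomposition principle prompts from past contest papers.

The script keeps papers from 2014 onward, groups them by problem type (A-F)
and by number of subtasks, samples a few example decompositions per group,
formats them into DECOMPOSE_PRINCIPLE_PROMPT, and asks an LLM to produce
decomposition principles. Results are written to decompose_prompt.json,
keyed by problem type and task count.
"""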
import random
from llm.llm import LLM
from collections import Counter, defaultdict
from prompt.template import DECOMPOSE_PRINCIPLE_PROMPT
from utils.utils import read_json_file, write_json_file


def read_problem_papers(problem_name):
    paper_dict = read_json_file('../data/paper_info_dataset.json')['data']
    papers = []
    for paper in paper_dict:
        if paper['paper'].startswith(problem_name):
            papers.append(paper['info'])
    return papers
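
# Usage sketch (the problem_name format, e.g. '2015_A', is an assumption):
#   read_problem_papers('2015_A') returns the 'info' dict of every paper whose
#   'paper' field starts with '2015_A'.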

def generate_decompose_prompt(data):
    # llm = LLM('deepseek-reasoner')
    llm = LLM('chatgpt-4o-latest')
    # Step 1: keep papers from 2014 onward (string comparison on the problem identifier's year prefix)
    filtered_papers = [paper for paper in data if paper['paper'].split('/')[0] >= '2014']
    # Step 2: group papers by problem identifier (e.g. year_type), keeping each paper's 'info'
    problem_papers = defaultdict(list)
    for paper in filtered_papers:
        problem = paper['paper'].split('/')[0]
        problem_papers[problem].append(paper['info'])
    # Keep at most 3 papers per problem
    for problem, papers in problem_papers.items():
        if len(papers) > 3:
            problem_papers[problem] = random.sample(papers, 3)
        else:
            problem_papers[problem] = papers
    # Step 3: group papers by problem type (second part of the problem identifier)
    problem_type_papers = defaultdict(list)
    for problem, papers in problem_papers.items():
        problem_type = problem.split('_')[1]
        problem_type_papers[problem_type] += papers
    # Step 4: group by (problem_type, number of tasks)
    tasknum_papers = defaultdict(list)
    for problem_type, papers in problem_type_papers.items():
        for paper in papers:
            tasknum_papers[(problem_type, len(paper['tasks']))].append(paper)
    filtered_tasknum_papers = tasknum_papers
    # # Step 5 (disabled): keep only the 3 most frequent tasknums for each problem_type
    # filtered_tasknum_papers = defaultdict(list)
    # for problem_type, papers in problem_type_papers.items():
    #     # Count the frequencies of tasknum within this problem_type
    #     tasknum_counts = Counter(len(paper['tasks']) for paper in papers)
    #     # Get the three most frequent tasknums
    #     most_common_tasknums = [tasknum for tasknum, _ in tasknum_counts.most_common(3)]
    #     print(problem_type, most_common_tasknums)
    #     # Keep only the papers with these tasknums
    #     for paper in papers:
    #         if len(paper['tasks']) in most_common_tasknums:
    #             filtered_tasknum_papers[(problem_type, len(paper['tasks']))].append(paper)
    result = defaultdict(dict)
    for (problem_type, tasknum), papers in filtered_tasknum_papers.items():
        # Only build prompts for 3-5 subtasks and problem types A-F
        if tasknum not in [3, 4, 5] or problem_type not in ['A', 'B', 'C', 'D', 'E', 'F']:
            continue
        # if tasknum not in [4] or problem_type not in ['C']:
        #     continue
        print(f"Problem Type: {problem_type}, Task Number: {tasknum}, size: {len(papers)}")
        # Sample up to 6 papers and join their subtask lists as few-shot examples
        selected_papers = random.sample(papers, min(len(papers), 6))
        examples = '---'.join(task_decompose(paper) for paper in selected_papers)
        prompt = DECOMPOSE_PRINCIPLE_PROMPT.format(examples=examples, tasknum=tasknum)
        answer = llm.generate(prompt)
        result[problem_type][int(tasknum)] = answer
    return result


def task_decompose(paper):
    return '\n'.join(f"- Subtask {i}: {task['task_description']}" for i, task in enumerate(paper['tasks'], start=1))


if __name__ == "__main__":
    data = read_json_file('../data/actor_data/input/paper_info_dataset.json')
    result = generate_decompose_prompt(data['data'])
    write_json_file('../data/actor_data/input/decompose_prompt.json', result)
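    # Note: result is {problem_type: {tasknum: answer}}; if write_json_file uses the
    # standard json module (an assumption about the helper), the integer tasknum keys
    # are serialized as strings, e.g. {"A": {"3": "..."}}.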