import random
from collections import Counter, defaultdict  # Counter is used by the optional Step 5 below

from llm.llm import LLM
from prompt.template import DECOMPOSE_PRINCIPLE_PROMPT
from utils.utils import read_json_file, write_json_file


def read_problem_papers(problem_name):
    """Return the 'info' dicts of all papers whose identifier starts with problem_name."""
    paper_dict = read_json_file('../data/paper_info_dataset.json')['data']
    papers = []
    for paper in paper_dict:
        if paper['paper'].startswith(problem_name):
            papers.append(paper['info'])
    return papers


def generate_decompose_prompt(data):
    # llm = LLM('deepseek-reasoner')
    llm = LLM('chatgpt-4o-latest')

    # Step 1: Keep papers from 2014 and later (the identifier starts with the year,
    # so a lexicographic comparison on the prefix works).
    filtered_papers = [paper for paper in data if paper['paper'].split('/')[0] >= '2014']

    # Step 2: Group papers by full problem identifier (e.g. "2015_C").
    problem_papers = defaultdict(list)
    for paper in filtered_papers:
        problem = paper['paper'].split('/')[0]
        problem_papers[problem].append(paper['info'])

    # Keep at most 3 papers per problem.
    for problem, papers in problem_papers.items():
        if len(papers) > 3:
            problem_papers[problem] = random.sample(papers, 3)

    # Step 3: Group papers by problem type (the part after the underscore, e.g. "C").
    problem_type_papers = defaultdict(list)
    for problem, papers in problem_papers.items():
        problem_type = problem.split('_')[1]
        problem_type_papers[problem_type] += papers

    # Step 4: Group by (problem_type, number of subtasks).
    tasknum_papers = defaultdict(list)
    for problem_type, papers in problem_type_papers.items():
        for paper in papers:
            tasknum_papers[(problem_type, len(paper['tasks']))].append(paper)
    filtered_tasknum_papers = tasknum_papers

    # # Step 5 (optional): keep only the top 3 most frequent tasknums per problem_type.
    # filtered_tasknum_papers = defaultdict(list)
    # for problem_type, papers in problem_type_papers.items():
    #     # Count the frequency of each tasknum within this problem_type.
    #     tasknum_counts = Counter(len(paper['tasks']) for paper in papers)
    #     # Get the three most frequent tasknums.
    #     most_common_tasknums = [tasknum for tasknum, _ in tasknum_counts.most_common(3)]
    #     print(problem_type, most_common_tasknums)
    #     # Keep only the papers with those tasknums.
    #     for paper in papers:
    #         if len(paper['tasks']) in most_common_tasknums:
    #             filtered_tasknum_papers[(problem_type, len(paper['tasks']))].append(paper)

    # Step 6: For each (problem_type, tasknum) bucket of interest, sample up to 6 papers,
    # format their subtask lists as examples, and ask the LLM for a decomposition principle.
    result = defaultdict(dict)
    for (problem_type, tasknum), papers in filtered_tasknum_papers.items():
        if tasknum not in [3, 4, 5] or problem_type not in ['A', 'B', 'C', 'D', 'E', 'F']:
            continue
        # if tasknum not in [4] or problem_type not in ['C']:
        #     continue
        print(f"Problem Type: {problem_type}, Task Number: {tasknum}, size: {len(papers)}")
        selected_papers = random.sample(papers, min(len(papers), 6))
        examples = '---'.join(task_decompose(paper) for paper in selected_papers)
        prompt = DECOMPOSE_PRINCIPLE_PROMPT.format(examples=examples, tasknum=tasknum)
        answer = llm.generate(prompt)
        result[problem_type][int(tasknum)] = answer
    return result


def task_decompose(paper):
    """Render a paper's subtasks as a bullet list, one line per subtask."""
    return '\n'.join(
        f"- Subtask {i}: {task['task_description']}"
        for i, task in enumerate(paper['tasks'], start=1)
    )


if __name__ == "__main__":
    data = read_json_file('../data/actor_data/input/paper_info_dataset.json')
    result = generate_decompose_prompt(data['data'])
    write_json_file('../data/actor_data/input/decompose_prompt.json', result)
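
# ---------------------------------------------------------------------------
# Assumed input schema (a sketch inferred from the field accesses above; the
# exact layout of paper_info_dataset.json is not confirmed by this file):
#
# {
#   "data": [
#     {
#       "paper": "2015_C/...",            # "<year>_<problem_type>/<paper_id>"
#       "info": {
#         "tasks": [
#           {"task_description": "..."},  # one entry per subtask
#           ...
#         ]
#       }
#     },
#     ...
#   ]
# }
#
# The output decompose_prompt.json maps problem_type -> tasknum -> the LLM's
# generated decomposition principle, e.g. result["C"][4] = "<answer text>".
# ---------------------------------------------------------------------------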