ejqs
Fix requirement extraction logic and update sample payload for Accounting Specialist role
65e65ac
| from typing import Dict, List, Any | |
| from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline, LongformerTokenizer | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| import spacy | |
| from spacy.matcher import PhraseMatcher | |
| from transformers import LongformerModel | |
| from skillNer.general_params import SKILL_DB | |
| from skillNer.skill_extractor_class import SkillExtractor | |
# Number of segment labels. It is None at import time; EndpointHandler.__init__
# publishes len(Job_label_map) here through a `global` statement, and
# LongformerSentenceClassifier refers to it for its `num_labels` default.
Job_num_labels = None
class EndpointHandler():
    """Endpoint handler that extracts requirement lines and skills from a job post.

    A fine-tuned Longformer sentence classifier assigns one segment label to
    the non-blank lines of the post. Lines labelled ``RQC`` (content of the
    required-qualifications section) are returned as requirements and are then
    passed to skillNer to collect the skills they mention.
    """

    # File name of the fine-tuned classifier weights, looked up inside ``path``.
    _WEIGHTS_FILENAME = "JobSegmentClassifier3rdEpoch_v2.pth"

    # (code, description) pairs. The position of a pair in this tuple is the
    # class index used by the classifier, so the order must not change.
    _SEGMENT_LABELS = (
        ("JT", "Job Title"),
        ("JS", "Job Summary"),
        ("COT", "Title of Company Overview Section"),
        ("COC", "Content of Company Overview Section"),
        ("RT", "Title of Responsibilites Section"),
        ("RC", "Content of Responsibilites Section"),
        ("RQT", "Title of Required Qualifications Section"),
        ("RQC", "Content of Required Qualifications Section"),
        ("PQT", "Title of Preferred Qualifications Section"),
        ("PQC", "Content of Preferred Qualifications Section"),
        ("ET", "Employment Type"),
        ("SBC", "Content of Salary and Benefits Section"),
        ("SBT", "Title of Salary and Benefits Section"),
    )

    def __init__(self, path=""):
        """Load the tokenizer, the classifier weights and the skill extractor.

        Args:
            path (str): Directory that contains the weights file. The default
                empty string resolves to the current working directory.
        """
        global Job_num_labels
        # Imported locally so the class does not need a new module-level import.
        import os

        # Both lookup tables are derived from one definition so that the
        # code -> index map and the index -> description list cannot drift.
        self.Job_label_map = {
            code: index for index, (code, _) in enumerate(self._SEGMENT_LABELS)
        }
        self.Job_labels = [
            {"value": code, "label": label} for code, label in self._SEGMENT_LABELS
        ]
        self.Job_num_labels = len(self.Job_label_map)
        # Also published at module level, as before.
        Job_num_labels = self.Job_num_labels

        # Load tokenizer
        self.Job_tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")

        # Load model architecture
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.Job_model = LongformerSentenceClassifier(num_labels=self.Job_num_labels)
        self.Job_model.to(self.device)

        # Load trained weights. os.path.join keeps the path relative when
        # ``path`` is empty; plain concatenation pointed at the filesystem root.
        # NOTE: torch.load unpickles the file, so only load trusted checkpoints.
        weights_path = os.path.join(path, self._WEIGHTS_FILENAME)
        self.Job_model.load_state_dict(torch.load(weights_path, map_location=self.device))

        # Set model to evaluation mode
        self.Job_model.eval()

        nlp = spacy.load("en_core_web_lg")
        self.skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)

    def predict_job_sections(self, model, text, tokenizer, device):
        """Classify every CLS-marked sentence contained in ``text``.

        Args:
            model: Classifier called with ``input_ids``, ``attention_mask``,
                ``global_attention_mask`` and ``cls_positions``.
            text (str): Sentences joined into one string, each one preceded by
                the tokenizer's CLS token.
            tokenizer: Tokenizer used to encode ``text``.
            device: Device the input tensors are moved to.

        Returns:
            numpy.ndarray: One predicted label index per CLS token found in the
            encoded input.
        """
        model.eval()

        # The input is truncated/padded to 4096 tokens, so CLS markers that
        # fall beyond that limit are dropped and get no prediction.
        encoding = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=4096,
        )
        input_ids = encoding["input_ids"].to(device)
        attention_mask = encoding["attention_mask"].to(device)

        # NOTE(review): the tokenizer is called with its default special-token
        # handling, so it presumably adds its own leading CLS token in addition
        # to the per-sentence markers. Confirm that the resulting prediction
        # order matches how the model was trained.
        cls_columns = (input_ids == tokenizer.cls_token_id).nonzero(as_tuple=True)[1]
        cls_positions = cls_columns.unsqueeze(0)  # Shape: (1, num_sentences)

        # Every CLS token gets global attention.
        global_attention_mask = torch.zeros_like(input_ids)
        global_attention_mask[:, cls_positions] = 1

        with torch.no_grad():
            logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                global_attention_mask=global_attention_mask,
                cls_positions=cls_positions,
            )  # Shape: (1, num_sentences, num_labels)

        # argmax is taken on the logits directly: softmax is monotonic, so it
        # would not change which label wins.
        predictions = torch.argmax(logits.squeeze(0), dim=-1)
        return predictions.cpu().numpy()

    @staticmethod
    def _non_blank_lines(text):
        """Return the lines of ``text`` that contain non-whitespace characters."""
        return [line for line in text.splitlines() if line.strip()]

    def _classify_lines(self, lines):
        """Prefix every line with the CLS token and run the segment classifier."""
        cls_token = self.Job_tokenizer.cls_token
        concatenated_text = " ".join(f"{cls_token} {line}" for line in lines)
        return self.predict_job_sections(
            self.Job_model, concatenated_text, self.Job_tokenizer, self.device
        )

    def extract_job_sections(self, text):
        """Predict a segment label for the non-blank lines of a job post.

        Args:
            text (str): Raw job-post text.

        Returns:
            tuple: ``(predictions, lines)`` where ``predictions`` is the array
            returned by ``predict_job_sections`` and ``lines`` is the list of
            non-blank lines that were classified.
        """
        lines = self._non_blank_lines(text)
        return self._classify_lines(lines), lines

    def extract_job_requirements(self, text):
        """Return the lines classified as required-qualification content.

        Args:
            text (str): Raw job-post text.

        Returns:
            list[str]: Lines whose prediction is ``RQC``, in document order.
            Empty when ``text`` has no non-blank line.
        """
        lines = self._non_blank_lines(text)
        if not lines:
            # Nothing to classify: skip the 4096-token forward pass entirely.
            return []

        predictions = self._classify_lines(lines)
        required_index = self.Job_label_map["RQC"]
        # zip stops at the shorter sequence, so a prediction without a
        # matching line (or a line without a prediction) is ignored.
        return [
            line
            for prediction, line in zip(predictions, lines)
            if int(prediction) == required_index
        ]

    def _collect_skills(self, results):
        """Build the de-duplicated skill list from skillNer annotation results.

        Full matches come first, followed by n-gram matches scored at least 1.
        A skill name is reported once, for the first match that yields it.
        """
        candidates = list(results.get("full_matches", []))
        candidates.extend(
            match for match in results.get("ngram_scored", []) if match["score"] >= 1
        )

        skills = []
        seen_names = set()
        for match in candidates:
            skill_id = match["skill_id"]
            # Standardizing the skill names
            skill_name = SKILL_DB.get(skill_id, {}).get("skill_name", "Unknown Skill")
            if skill_name in seen_names:
                continue
            seen_names.add(skill_name)
            skills.append({"name": skill_name, "skill_id": skill_id})
        return skills

    def label_job_post(self, text):
        """Extract the requirement lines of a job post and the skills they mention.

        Args:
            text (str): Raw job-post text.

        Returns:
            dict: ``{"requirements": [...], "skills": [...]}``. Each skill is a
            dict with the keys ``name`` and ``skill_id``. ``skills`` is empty
            when no requirement line was found.
        """
        requirements = list(self.extract_job_requirements(text))
        response = {"requirements": requirements, "skills": []}
        if not requirements:
            # Only run the skill extractor if we have requirements.
            return response

        annotations = self.skill_extractor.annotate(" ".join(requirements))
        response["skills"] = self._collect_skills(annotations.get("results", {}))
        return response

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Handle one endpoint request.

        Args:
            data: Request payload; ``data["inputs"]`` holds the job-post text.

        Returns:
            The dict produced by ``label_job_post``.

        Raises:
            KeyError: If ``data`` has no ``"inputs"`` entry.
        """
        return self.label_job_post(data["inputs"])
class LongformerSentenceClassifier(nn.Module):
    """Longformer encoder with a linear head applied to selected token positions."""

    def __init__(self, model_name="allenai/longformer-base-4096", num_labels=None):
        """
        Custom Longformer model for sentence classification.

        Args:
            model_name (str): Hugging Face Longformer model.
            num_labels (int): Number of possible sentence labels. When omitted,
                the module-level ``Job_num_labels`` is read at call time.

        Raises:
            TypeError: If no label count is given and ``Job_num_labels`` is
                still None.
        """
        super().__init__()
        # A default written as ``num_labels=Job_num_labels`` is bound when the
        # class is defined, i.e. while the global is still None. Resolve it
        # here so that a value set later is picked up.
        if num_labels is None:
            num_labels = Job_num_labels
        if num_labels is None:
            # Fail before the pretrained backbone is loaded.
            raise TypeError(
                "num_labels must be given, or Job_num_labels must be set, "
                "before building LongformerSentenceClassifier"
            )
        self.longformer = LongformerModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.longformer.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, global_attention_mask, cls_positions):
        """
        Forward pass for sentence classification.

        Args:
            input_ids (Tensor): Tokenized input IDs, shape (batch_size, max_length)
            attention_mask (Tensor): Attention mask, shape (batch_size, max_length)
            global_attention_mask (Tensor): Global attention mask, shape (batch_size, max_length)
            cls_positions (Tensor): Indices of the `[CLS]` tokens; reshaped to
                (batch_size, num_sentences) before use.

        Returns:
            Tensor: Logits, shape (batch_size, num_sentences, num_labels).
        """
        outputs = self.longformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            global_attention_mask=global_attention_mask,
        )
        hidden_states = outputs.last_hidden_state

        batch_size = input_ids.shape[0]
        hidden_size = hidden_states.size(-1)
        cls_positions = cls_positions.view(batch_size, -1)

        # Expand the positions across the hidden dimension so that gather
        # picks the whole hidden vector of every selected token.
        gather_index = cls_positions.unsqueeze(-1).expand(-1, -1, hidden_size)
        cls_embeddings = hidden_states.gather(1, gather_index)
        return self.classifier(cls_embeddings)
if __name__ == "__main__":
    # Local smoke test: run the handler on a sample Accounting Specialist
    # posting, using the weights found in the current directory.
    sample_job_post = """
We are seeking an experienced Accounting Specialist to join our team.
The Accounting Specialist will be responsible for various financial tasks, including reconciling accounts, assist with accounts payable,
preparing financial reports, and assisting the Controller.
The ideal candidate will have a strong background in accounting principles and practices, as well as proficiency in Quickbooks accounting
software, Excel and financial concepts.
Responsibilities:
- Perform general ledger reconciliation to ensure accuracy of financial data
- Prepare and analyze financial reports, bank reconciliations and analysis.
- Collaborate with internal teams to ensure compliance with accounting policies and procedures
- Support financial audits by providing necessary documentation and information
- Accounts Payable - Multiple Companies
- GL Reconciliations
- Prepare Weekly, Monthly and Quarterly Commission Reports
- Daily Bank Deposits
Skills:
- Proficiency in accounting software QuickBooks Online
- Strong knowledge of corporate finance principles and practices
- Experience with general ledger reconciliation
- Ability to understand concise financial reports
- Strong analytical skills for financial analysis
- Knowledge of financial auditing processes
- Understanding of cash flow analysis
- Solid grasp of financial concepts such as revenue recognition, depreciation, and accruals
- Ability to manage multiple priorities and time effectively.
Pay:
$50,000 - $60,000 per year
Benefits:
401(k) matching
Dental insurance
Health insurance
Paid time off
Vision insurance
Experience:
Microsoft Excel: 3 years (Required)
QuickBooks Online: 3 years (Required)
"""

    # Build the handler, send the request and show what comes back.
    handler = EndpointHandler(path=".")
    extraction = handler({"inputs": sample_job_post})
    print(extraction)