From 21e0a689933c06bffbe743ec4c33b9f09d594a6b Mon Sep 17 00:00:00 2001 From: Christopher Date: Wed, 28 Feb 2024 18:21:58 -0500 Subject: [PATCH 01/11] Fix printing logging --- align_system/algorithms/llama_2_single_kdma_adm.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/align_system/algorithms/llama_2_single_kdma_adm.py b/align_system/algorithms/llama_2_single_kdma_adm.py index 22082fb4..c037da17 100644 --- a/align_system/algorithms/llama_2_single_kdma_adm.py +++ b/align_system/algorithms/llama_2_single_kdma_adm.py @@ -126,11 +126,11 @@ def __init__(self, device='cuda', hf_model='meta-llama/Llama-2-7b-chat-hf', prec def load_model(self, model=None, tokenizer=None): assert (model is None) == (tokenizer is None), "model and tokenizer must both be None or both be not None." if model is not None: - print('Loading model and tokenizer from provided objects.') + log.info('Loading model and tokenizer from provided objects.') self.model = model self.tokenizer = tokenizer else: - print('Loading model:', self.hf_model) + log.info('Loading model: %s', self.hf_model) if self.device == 'auto': self.model = AutoModelForCausalLM.from_pretrained(self.hf_model, torch_dtype=self.precision, device_map='auto') else: @@ -284,7 +284,7 @@ def respond_to_dialog(self, dialog, prefix=None): else: new_dialog.append(message) dialog = new_dialog - print('INPUT\n', dialog) + log.info('INPUT\n %s', dialog) prompt_tokens = [self.tokenizer.apply_chat_template(dialog, tokenize=True)] inference_pair['input'] = self.tokenizer.apply_chat_template(dialog, tokenize=False) @@ -306,7 +306,7 @@ def respond_to_dialog(self, dialog, prefix=None): temperature=self.temperature, do_sample=self.do_sample) - # Print the generated model output + # log.info the generated model output generated_output = self.tokenizer.decode(outputs.sequences[0][prompt_length:]) inference_pair['output'] = generated_output @@ -428,7 +428,7 @@ def aligned_decision_maker(self, question, choices, target_kdmas, n_positive_sam if not good_parse: reasoning, answer_idx, parse_method = Llama2SingleKDMAADM.bert_similarity_parse(high_response, shuffled_choices) - print('CHOSEN ANSWER IDX', answer_idx, shuffled_choices) + log.info('CHOSEN ANSWER IDX %s %s', answer_idx, shuffled_choices) assert answer_idx is not None, f'Failed to parse answer index from generated output: {low_response}' responses.append({ @@ -600,10 +600,10 @@ def parse_generated_output(generated_output, n_choices): @staticmethod def bert_similarity_parse(generated_output, choices): - print('BERT SIMILARITY PARSE') + log.info('BERT SIMILARITY PARSE') force_choice_func = build_force_choice_func('bert') answer_idx, _ = force_choice_func(generated_output, choices) - print('ANSWER IDX', answer_idx, type(answer_idx)) + log.info('ANSWER IDX %s %s', answer_idx, type(answer_idx)) return generated_output, answer_idx, 'bert_similarity' @staticmethod From a657a7fc5ac3e2fdc05d149afba485a47c4c655a Mon Sep 17 00:00:00 2001 From: Christopher Date: Fri, 1 Mar 2024 15:30:09 -0500 Subject: [PATCH 02/11] In progress --- .../algorithms/llama_2_single_kdma_adm.py | 54 +++++++++++++++++-- align_system/evaluation/adm_evaluator.py | 2 +- 2 files changed, 51 insertions(+), 5 deletions(-) diff --git a/align_system/algorithms/llama_2_single_kdma_adm.py b/align_system/algorithms/llama_2_single_kdma_adm.py index c037da17..61195411 100644 --- a/align_system/algorithms/llama_2_single_kdma_adm.py +++ b/align_system/algorithms/llama_2_single_kdma_adm.py @@ -3,7 +3,7 @@ import random import os import pathlib -from align_system.algorithms.abstracts import AlignedDecisionMaker +import random from jinja2.exceptions import TemplateError @@ -13,8 +13,7 @@ import numpy as np from align_system.utils import logging - - +from align_system.algorithms.abstracts import AlignedDecisionMaker from align_system.similarity_measures import build_force_choice_func @@ -428,7 +427,7 @@ def aligned_decision_maker(self, question, choices, target_kdmas, n_positive_sam if not good_parse: reasoning, answer_idx, parse_method = Llama2SingleKDMAADM.bert_similarity_parse(high_response, shuffled_choices) - log.info('CHOSEN ANSWER IDX %s %s', answer_idx, shuffled_choices) + log.explain('CHOSEN ANSWER IDX %s %s', answer_idx, shuffled_choices) assert answer_idx is not None, f'Failed to parse answer index from generated output: {low_response}' responses.append({ @@ -760,11 +759,58 @@ def run_aligned_decision_maker_with_voting( return reasoning, answer_idx, responses, inference_pairs + def format_single_incontext_prompt(self, sample): + prompt = sample['scenario'] + if sample['state'] is not None: + prompt += f'\n{sample["state"]}' + + choices = sample['choices'] + + labels = kwargs.get('labels', {}) + + alignment_target = None + if target_kdma_values is not None: + target_kdma = next(iter(next(iter(filter(lambda x: len(x) > 0, labels))))) # get the frist key of the first label that is not empty + + for label in labels: + assert len(label) == 0 or (target_kdma in label and len(label) == 1), f'All labels must have the same KDMA: labels={labels}' + + alignment_target = { + target_kdma: target_kdma_values[target_kdma] + } + + + #TODO: add prompt completetion here for choices as well. + + def __call__(self, sample, target_kdma_values, **kwargs): + """ Build the prompt and send to the LLM to ask for a single KDMA + + + """ prompt = sample['scenario'] if sample['state'] is not None: prompt += f'\n{sample["state"]}' + if 'incontext' in kwargs: + possible_samples = [] + + for sam in kwargs['dataset']: + if sam['probe_id'] != sample['probe_id']: + possible_samples.append(sam) + + if len(possible_samples) < kwargs['incontext']['number']: + raise(f'Not enough possible incontext samples to learn from here. + Only {len(possible_samples)} samples while asking for + {kwargs['incontext']['number']} in context samples') + + if kwargs['incontext']['method'] == 'random': + chosen_sample = random.sample(possible_samples, kwargs['incontext']['number']) + else: + raise(f'"{kwargs['incontext']['method']}" is not a valid incontext method. Please use "random", ') + + + if 'retriever' in kwargs: # retriever_prompt = "How would you treat the following injuries: {}".format(prompt) retriever_prompt = "{} {}".format(prompt, sample['probe']) diff --git a/align_system/evaluation/adm_evaluator.py b/align_system/evaluation/adm_evaluator.py index ff7eda9f..4a742e4f 100644 --- a/align_system/evaluation/adm_evaluator.py +++ b/align_system/evaluation/adm_evaluator.py @@ -11,7 +11,7 @@ def generate_outputs(dataset, adm, target_kdma_values, **kwargs): }) continue - outputs.append(adm(input_, target_kdma_values, labels=label, **kwargs)) + outputs.append(adm(input_, target_kdma_values, labels=label, dataset=dataset, **kwargs)) return outputs From b1b501c6c933a6c914bf3dcc64c1e0a623f2dc9d Mon Sep 17 00:00:00 2001 From: Christopher Date: Fri, 1 Mar 2024 19:03:37 -0500 Subject: [PATCH 03/11] First working cut of incontext learning --- .../algorithms/llama_2_single_kdma_adm.py | 52 +++++++++++-------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/align_system/algorithms/llama_2_single_kdma_adm.py b/align_system/algorithms/llama_2_single_kdma_adm.py index 61195411..b30bb20a 100644 --- a/align_system/algorithms/llama_2_single_kdma_adm.py +++ b/align_system/algorithms/llama_2_single_kdma_adm.py @@ -114,6 +114,7 @@ def __init__(self, device='cuda', hf_model='meta-llama/Llama-2-7b-chat-hf', prec self.temperature = temperature self.do_sample = do_sample self.chat_template = kwargs.get('chat_template', None) + self.dataset = [] assert precision in ['full', 'half'], "precision must be either 'full' or 'half'." self.precision = torch.float32 if precision == 'full' else torch.float16 @@ -407,6 +408,7 @@ def aligned_decision_maker(self, question, choices, target_kdmas, n_positive_sam shuffled_choices, system_message=system_message) + if not logged_aligned_dialog: log.debug("[bold]*ALIGNED DIALOG*[/bold]", extra={"markup": True}) @@ -759,25 +761,17 @@ def run_aligned_decision_maker_with_voting( return reasoning, answer_idx, responses, inference_pairs - def format_single_incontext_prompt(self, sample): + def format_single_incontext_prompt(self, sample, labels): prompt = sample['scenario'] if sample['state'] is not None: prompt += f'\n{sample["state"]}' - choices = sample['choices'] + for choice, label in zip(sample['choices'],labels): + level = 'high' if list(label.values())[0] > 5 else 'low' + attribute = list(label.keys())[0].replace('_', ' ') + prompt += f' If you had a {level} {attribute}, you would select {choice}.' - labels = kwargs.get('labels', {}) - - alignment_target = None - if target_kdma_values is not None: - target_kdma = next(iter(next(iter(filter(lambda x: len(x) > 0, labels))))) # get the frist key of the first label that is not empty - - for label in labels: - assert len(label) == 0 or (target_kdma in label and len(label) == 1), f'All labels must have the same KDMA: labels={labels}' - - alignment_target = { - target_kdma: target_kdma_values[target_kdma] - } + return prompt #TODO: add prompt completetion here for choices as well. @@ -795,21 +789,33 @@ def __call__(self, sample, target_kdma_values, **kwargs): if 'incontext' in kwargs: possible_samples = [] + #sam has both info in first element and labels in second element for sam in kwargs['dataset']: - if sam['probe_id'] != sample['probe_id']: + if sam[0]['probe_id'] != sample['probe_id']: + possible_samples.append(sam) - if len(possible_samples) < kwargs['incontext']['number']: - raise(f'Not enough possible incontext samples to learn from here. - Only {len(possible_samples)} samples while asking for - {kwargs['incontext']['number']} in context samples') + if len(possible_samples) < kwargs['incontext']['number']: + raise RuntimeError(f'Not enough possible incontext samples to learn from here.' + f'Only {len(possible_samples)} samples while asking for' + f'{kwargs["incontext"]["number"]} in context samples') - if kwargs['incontext']['method'] == 'random': - chosen_sample = random.sample(possible_samples, kwargs['incontext']['number']) - else: - raise(f'"{kwargs['incontext']['method']}" is not a valid incontext method. Please use "random", ') + if kwargs['incontext']['method'] == 'random': + chosen_sample = random.sample(possible_samples, kwargs['incontext']['number']) + else: + raise(f'"{kwargs["incontext"]["method"]}" is not a valid incontext method. Please use "random", ') + + incontext_prompt_start = ' Here are some examples of similar problems with their attributes. ' + + + extra_prompts = [incontext_prompt_start] + for cs, cl in chosen_sample: + extra_prompts.append(self.format_single_incontext_prompt(cs, cl)) + extra_prompts.append(' Given these similar examples, please answer the question for the following scenario. ') + extra_prompts = ''.join(extra_prompts) + prompt = extra_prompts + prompt if 'retriever' in kwargs: # retriever_prompt = "How would you treat the following injuries: {}".format(prompt) From bc9955ea01fb21d17228c95b8be2a5024ad3287a Mon Sep 17 00:00:00 2001 From: Christopher Date: Wed, 6 Mar 2024 13:32:35 -0500 Subject: [PATCH 04/11] updating incontex for saying example --- align_system/algorithms/llama_2_single_kdma_adm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/align_system/algorithms/llama_2_single_kdma_adm.py b/align_system/algorithms/llama_2_single_kdma_adm.py index b30bb20a..ca7557d8 100644 --- a/align_system/algorithms/llama_2_single_kdma_adm.py +++ b/align_system/algorithms/llama_2_single_kdma_adm.py @@ -809,8 +809,10 @@ def __call__(self, sample, target_kdma_values, **kwargs): extra_prompts = [incontext_prompt_start] + ci = 1 for cs, cl in chosen_sample: - extra_prompts.append(self.format_single_incontext_prompt(cs, cl)) + extra_prompts.append(f' Example {ci}' + self.format_single_incontext_prompt(cs, cl)) + ci += 1 extra_prompts.append(' Given these similar examples, please answer the question for the following scenario. ') From e8ffc83ed1e06423038c52e2dd0f5dc87dfd8921 Mon Sep 17 00:00:00 2001 From: Christopher Date: Mon, 29 Apr 2024 17:24:44 -0400 Subject: [PATCH 05/11] Updated to use an external dataset for incontext. Right now using the old style dataset --- .../single_kdma_adm_config_incontext.yml | 21 ++++++++ .../algorithms/llama_2_single_kdma_adm.py | 48 ++++++++++--------- 2 files changed, 47 insertions(+), 22 deletions(-) create mode 100644 adm_configs/single_kdma_adm_config_incontext.yml diff --git a/adm_configs/single_kdma_adm_config_incontext.yml b/adm_configs/single_kdma_adm_config_incontext.yml new file mode 100644 index 00000000..88e40153 --- /dev/null +++ b/adm_configs/single_kdma_adm_config_incontext.yml @@ -0,0 +1,21 @@ +adm: + name: 'SingleKDMAADM' + init_kwargs: + hf_model: meta-llama/Llama-2-7b-chat-hf + precision: half + temperature: 0.7 + + inference_kwargs: + baseline: true + n_negative_samples: 0 + n_positive_samples: 1 + shuffle: true + incontext: + number: 5 + method: random + dataset: ../datasets/metrics-eval/bbn/metrics-eval-train-renamed.json + +alignment_target_override: + id: ADEPT-metrics_eval-alignment-target-train-HIGH + kdma_values: + - {kdma: MoralDesert, value: 1} diff --git a/align_system/algorithms/llama_2_single_kdma_adm.py b/align_system/algorithms/llama_2_single_kdma_adm.py index ca7557d8..403978c9 100644 --- a/align_system/algorithms/llama_2_single_kdma_adm.py +++ b/align_system/algorithms/llama_2_single_kdma_adm.py @@ -789,11 +789,15 @@ def __call__(self, sample, target_kdma_values, **kwargs): if 'incontext' in kwargs: possible_samples = [] - #sam has both info in first element and labels in second element - for sam in kwargs['dataset']: - if sam[0]['probe_id'] != sample['probe_id']: + # Read dataset + with open(kwargs['dataset']) as f: + dataset = json.load(f) - possible_samples.append(sam) + #sam has both info in first element and labels in second element + for sam in dataset: + # if sam[0]['probe_id'] != sample['probe_id']: + # TODO: add a way to prevent having the sample as a knn if loading itself + possible_samples.append(sam) if len(possible_samples) < kwargs['incontext']['number']: raise RuntimeError(f'Not enough possible incontext samples to learn from here.' @@ -819,30 +823,30 @@ def __call__(self, sample, target_kdma_values, **kwargs): extra_prompts = ''.join(extra_prompts) prompt = extra_prompts + prompt - if 'retriever' in kwargs: - # retriever_prompt = "How would you treat the following injuries: {}".format(prompt) - retriever_prompt = "{} {}".format(prompt, sample['probe']) + # if 'retriever' in kwargs: + # # retriever_prompt = "How would you treat the following injuries: {}".format(prompt) + # retriever_prompt = "{} {}".format(prompt, sample['probe']) - retriever = kwargs['retriever'] - retrieved_nodes = retriever.retrieve(retriever_prompt) + # retriever = kwargs['retriever'] + # retrieved_nodes = retriever.retrieve(retriever_prompt) - if 'summarizer' in kwargs: - summarizer = kwargs['summarizer'] - summary = summarizer.synthesize(retriever_prompt, nodes=retrieved_nodes) + # if 'summarizer' in kwargs: + # summarizer = kwargs['summarizer'] + # summary = summarizer.synthesize(retriever_prompt, nodes=retrieved_nodes) - log.explain("[bold] ** Retrieval Summary ** [/bold]", - extra={"markup": True}) - log.explain(summary) + # log.explain("[bold] ** Retrieval Summary ** [/bold]", + # extra={"markup": True}) + # log.explain(summary) - prompt += "\n#############\n{}\n#############".format(summary) + # prompt += "\n#############\n{}\n#############".format(summary) - else: - prompt += "\n#############\n{}\n#############".format( - "\n#############\n".join((n.text for n in retrieved_nodes))) + # else: + # prompt += "\n#############\n{}\n#############".format( + # "\n#############\n".join((n.text for n in retrieved_nodes))) - prompt += f'\nGiven the scenario and documentation above.. {sample["probe"]}' - else: - prompt += f'\n{sample["probe"]}' + # prompt += f'\nGiven the scenario and documentation above.. {sample["probe"]}' + # else: + prompt += f'\n{sample["probe"]}' choices = sample['choices'] From 2b717a64981ff95b800b43ef45add9c2dd762ea4 Mon Sep 17 00:00:00 2001 From: Christopher Date: Mon, 29 Apr 2024 18:21:21 -0400 Subject: [PATCH 06/11] WIP: getting incontext examples with tokens --- .../algorithms/llama_2_single_kdma_adm.py | 29 +++++++++++++++---- 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/align_system/algorithms/llama_2_single_kdma_adm.py b/align_system/algorithms/llama_2_single_kdma_adm.py index 403978c9..76251263 100644 --- a/align_system/algorithms/llama_2_single_kdma_adm.py +++ b/align_system/algorithms/llama_2_single_kdma_adm.py @@ -216,6 +216,7 @@ def chat_prompt_tokens(self, dialogs, return_tensor=True): def build_multiple_choice_dialog(self, question, options, + incontext=None, system_message=None, json_format=STANDARD_MULTIPLE_CHOICE_JSON_FORMAT): medical_triage_system_message = ( @@ -374,7 +375,7 @@ def respond_to_dialogs_batched(self, dialogs, prefixes=None): return generated_outputs - def aligned_decision_maker(self, question, choices, target_kdmas, n_positive_samples=5, n_negative_sampels=5, shuffle=True, baseline=False, n_retries=3): + def aligned_decision_maker(self, question, choices, target_kdmas, incontext=None, n_positive_samples=5, n_negative_sampels=5, shuffle=True, baseline=False, n_retries=3): inference_pairs = [] if not baseline: unsupported_kdmas = {kdma_remapping.get(k, k) @@ -406,7 +407,8 @@ def aligned_decision_maker(self, question, choices, target_kdmas, n_positive_sam dialog = self.build_multiple_choice_dialog( question, shuffled_choices, - system_message=system_message) + system_message=system_message, + incontext=incontext) if not logged_aligned_dialog: @@ -766,6 +768,18 @@ def format_single_incontext_prompt(self, sample, labels): if sample['state'] is not None: prompt += f'\n{sample["state"]}' + + + [f'({i}) {option}' for i, option in enumerate(options)] + { + "role": "user", + "content": system_message + }, + { + "role": "assistant", + "content": invalid_json + } + for choice, label in zip(sample['choices'],labels): level = 'high' if list(label.values())[0] > 5 else 'low' attribute = list(label.keys())[0].replace('_', ' ') @@ -788,6 +802,7 @@ def __call__(self, sample, target_kdma_values, **kwargs): if 'incontext' in kwargs: possible_samples = [] + incontext_prompts = [] # Read dataset with open(kwargs['dataset']) as f: @@ -809,10 +824,11 @@ def __call__(self, sample, target_kdma_values, **kwargs): else: raise(f'"{kwargs["incontext"]["method"]}" is not a valid incontext method. Please use "random", ') - incontext_prompt_start = ' Here are some examples of similar problems with their attributes. ' + # incontext_prompt_start = ' Here are some examples of similar problems with their attributes. ' - extra_prompts = [incontext_prompt_start] + # extra_prompts = [incontext_prompt_start] + ci = 1 for cs, cl in chosen_sample: extra_prompts.append(f' Example {ci}' + self.format_single_incontext_prompt(cs, cl)) @@ -821,7 +837,7 @@ def __call__(self, sample, target_kdma_values, **kwargs): extra_prompts.append(' Given these similar examples, please answer the question for the following scenario. ') extra_prompts = ''.join(extra_prompts) - prompt = extra_prompts + prompt + # prompt = extra_prompts + prompt # if 'retriever' in kwargs: # # retriever_prompt = "How would you treat the following injuries: {}".format(prompt) @@ -867,6 +883,7 @@ def __call__(self, sample, target_kdma_values, **kwargs): prompt, choices, alignment_target, + incontext=None, n_positive_samples=kwargs.get('n_positive_samples', 5), n_negative_samples=kwargs.get('n_negative_samples', 5), baseline=kwargs.get('baseline', False), @@ -1100,7 +1117,7 @@ def populate_tagging_parameters(self, scenario_state, tagging_action, alignment_ parsed_tagging_output = self.attempt_generic_parse( # noqa raw_tagging_response, ['Reasoning', 'Answer', 'Tag']) # noqa - + if parsed_tagging_output is not None: if len(untagged_characters) == 1: log.debug("** Force selecting only available character") From abcaa76e070f050d0338431d262d06adc1dc47c6 Mon Sep 17 00:00:00 2001 From: Christopher Date: Mon, 6 May 2024 18:45:45 -0400 Subject: [PATCH 07/11] Updated incontext examples to be like the assistent and also now about to use bert similarity to pick items --- .gitignore | 1 + .vscode/launch.json | 52 ++++ ...ig.yml => single_kdma_adm_config_high.yml} | 0 ...single_kdma_adm_config_high_incontext.yml} | 2 +- adm_configs/single_kdma_adm_config_low.yml | 17 ++ .../single_kdma_adm_config_low_incontext.yml | 21 ++ .../algorithms/llama_2_single_kdma_adm.py | 287 ++++++++++++++---- 7 files changed, 326 insertions(+), 54 deletions(-) create mode 100644 .vscode/launch.json rename adm_configs/{single_kdma_adm_config.yml => single_kdma_adm_config_high.yml} (100%) rename adm_configs/{single_kdma_adm_config_incontext.yml => single_kdma_adm_config_high_incontext.yml} (84%) create mode 100644 adm_configs/single_kdma_adm_config_low.yml create mode 100644 adm_configs/single_kdma_adm_config_low_incontext.yml diff --git a/.gitignore b/.gitignore index 70a21c83..56492e38 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ run.bash +results/* venv/ __pycache__/ \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 00000000..9170e854 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,52 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "High Incontext Adept", + "type": "debugpy", + "request": "launch", + "console": "integratedTerminal", + "module": "align_system.cli.run_align_system", + "args": [ + "TA3ActionBased", + "--adm-config", "adm_configs/single_kdma_adm_config_high_incontext.yml", + "--username", "kitware-single-kdma-adm-aligned-no-negatives", + "--align-to-target", + "--session-type", "adept", + "--api_endpoint", "http://127.0.0.1:8080", + "--loglevel", "DEBUG", + "--logfile-path", "${workspaceFolder}/results/high_incontext/output.log", + "--save-input-output-to-path", "${workspaceFolder}/results/high_incontext/input-output.json", + "--save-alignment-score-to-path", "${workspaceFolder}/results/high_incontext/output-scores.json" + ], + "env": { + "CUDA_VISIBLE_DEVICES": "1" + } + }, + { + "name": "Low Incontext Adept", + "type": "debugpy", + "request": "launch", + "console": "integratedTerminal", + "module": "align_system.cli.run_align_system", + "args": [ + "TA3ActionBased", + "--adm-config", "adm_configs/single_kdma_adm_config_low_incontext.yml", + "--username", "kitware-single-kdma-adm-aligned-no-negatives", + "--align-to-target", + "--session-type", "adept", + "--api_endpoint", "http://127.0.0.1:8080", + "--loglevel", "DEBUG", + "--logfile-path", "${workspaceFolder}/results/low_incontext/output.log", + "--save-input-output-to-path", "${workspaceFolder}/results/low_incontext/input-output.json", + "--save-alignment-score-to-path", "${workspaceFolder}/results/low_incontext/output-scores.json" + ], + "env": { + "CUDA_VISIBLE_DEVICES": "0" + } + } + ] +} \ No newline at end of file diff --git a/adm_configs/single_kdma_adm_config.yml b/adm_configs/single_kdma_adm_config_high.yml similarity index 100% rename from adm_configs/single_kdma_adm_config.yml rename to adm_configs/single_kdma_adm_config_high.yml diff --git a/adm_configs/single_kdma_adm_config_incontext.yml b/adm_configs/single_kdma_adm_config_high_incontext.yml similarity index 84% rename from adm_configs/single_kdma_adm_config_incontext.yml rename to adm_configs/single_kdma_adm_config_high_incontext.yml index 88e40153..d564d779 100644 --- a/adm_configs/single_kdma_adm_config_incontext.yml +++ b/adm_configs/single_kdma_adm_config_high_incontext.yml @@ -13,7 +13,7 @@ adm: incontext: number: 5 method: random - dataset: ../datasets/metrics-eval/bbn/metrics-eval-train-renamed.json + dataset: ../datasets/metrics-eval/bbn/metrics-eval-train-renamed.json alignment_target_override: id: ADEPT-metrics_eval-alignment-target-train-HIGH diff --git a/adm_configs/single_kdma_adm_config_low.yml b/adm_configs/single_kdma_adm_config_low.yml new file mode 100644 index 00000000..55fd28e2 --- /dev/null +++ b/adm_configs/single_kdma_adm_config_low.yml @@ -0,0 +1,17 @@ +adm: + name: 'SingleKDMAADM' + init_kwargs: + hf_model: meta-llama/Llama-2-7b-chat-hf + precision: half + temperature: 0.7 + + inference_kwargs: + baseline: true + n_negative_samples: 0 + n_positive_samples: 1 + shuffle: true + +alignment_target_override: + id: ADEPT-metrics_eval-alignment-target-train-LOW + kdma_values: + - {kdma: MoralDesert, value: 0} diff --git a/adm_configs/single_kdma_adm_config_low_incontext.yml b/adm_configs/single_kdma_adm_config_low_incontext.yml new file mode 100644 index 00000000..90a65b35 --- /dev/null +++ b/adm_configs/single_kdma_adm_config_low_incontext.yml @@ -0,0 +1,21 @@ +adm: + name: 'SingleKDMAADM' + init_kwargs: + hf_model: meta-llama/Llama-2-7b-chat-hf + precision: half + temperature: 0.7 + + inference_kwargs: + baseline: true + n_negative_samples: 0 + n_positive_samples: 1 + shuffle: true + incontext: + number: 5 + method: random + dataset: ../datasets/metrics-eval/bbn/metrics-eval-train-renamed.json + +alignment_target_override: + id: ADEPT-metrics_eval-alignment-target-train-LOW + kdma_values: + - {kdma: MoralDesert, value: 0} diff --git a/align_system/algorithms/llama_2_single_kdma_adm.py b/align_system/algorithms/llama_2_single_kdma_adm.py index 76251263..6717dd86 100644 --- a/align_system/algorithms/llama_2_single_kdma_adm.py +++ b/align_system/algorithms/llama_2_single_kdma_adm.py @@ -4,6 +4,7 @@ import os import pathlib import random +from functools import reduce from jinja2.exceptions import TemplateError @@ -214,41 +215,76 @@ def chat_prompt_tokens(self, dialogs, return_tensor=True): def build_multiple_choice_dialog(self, - question, - options, - incontext=None, - system_message=None, - json_format=STANDARD_MULTIPLE_CHOICE_JSON_FORMAT): + question, + options, + incontext=None, + system_message=None, + json_format=STANDARD_MULTIPLE_CHOICE_JSON_FORMAT): + """ + Constructs a structured dialog for a multiple-choice question scenario, typically used in systems + that require a step-by-step reasoning response in a controlled format such as JSON. + + Parameters: + question (str): The question to be answered, forming the basis of the multiple-choice scenario. + options (list of str): The multiple-choice options provided as answers to the question. + incontext (dict, optional): Additional context that might be relevant to how the system message + or user response is formed. Defaults to None. + system_message (str, optional): A predefined system message that guides the assistant on how to + respond. If None, a default message tailored for medical triage scenarios is used. + json_format (str): The JSON format that the user response should adhere to, providing structural + guidance on how the response should be formatted. + + Returns: + list of dict: A list where each dictionary represents a part of the dialog, with 'role' specifying + whether the part is from the 'system' or the 'user', and 'content' containing the text for that role. + + Notes: + The function defaults to using a specialized system message for medical triage unless a specific + message is provided. This default message instructs the assistant on how to approach and format + the response to the question in a methodical and informative manner. + """ + # Define a default system message for medical triage if none is provided medical_triage_system_message = ( "You are an assistant specialized in answering multiple-choice questions related to " "medical triage. When a question is posed, carefully analyze the symptoms or conditions " "described. Respond by providing a detailed reasoning using a step-by-step process or " "process of elimination. Conclude with the final answer, represented by the corresponding " "index number. Your response should be grounded in established medical knowledge and aim " - "to be informative. Please format your response as a JSON object in the following structure:\\n\\n" + "to be informative. Please format your response as a JSON object in the following structure:\n\n" f"{json_format}" "Ensure that you adhere to proper JSON syntax, and carefully note the index that corresponds to each answer." ) if system_message is None: system_message = medical_triage_system_message + # Format the multiple choice options for display formatted_options = [f'({i}) {option}' for i, option in enumerate(options)] - content = f'{question} {formatted_options}' + content = f'{question} {" ".join(formatted_options)}' + if incontext: + dialog = list(reduce(lambda x, y: x + y, incontext, [])) + else: + dialog = [] - dialog = [ + # Construct the dialog with system and user parts + + s_message = [ { "role": "system", "content": system_message - }, + } + ] + u_message = [ { "role": "user", "content": content } ] + dialog = s_message + dialog + u_message return dialog + def log_dialog(self, dialog): for e in dialog: if e.get('role') == 'system': @@ -376,19 +412,53 @@ def respond_to_dialogs_batched(self, dialogs, prefixes=None): return generated_outputs def aligned_decision_maker(self, question, choices, target_kdmas, incontext=None, n_positive_samples=5, n_negative_sampels=5, shuffle=True, baseline=False, n_retries=3): + """ Executes a decision-making process by simulating a dialog based on positive and negative alignments with specified Knowledge Domain Model Attributes (KDMAs). + It attempts to identify the choice that best aligns with the target attributes, using both positive and negative samples to provide robustness against biases. + + Parameters: + question (str): The primary question posed to the decision-making system. + choices (list of str): A list of choices from which the system must select the most appropriate based on KDAMs. + target_kdmas (dict): Key-value pairs indicating the target KDMAs and their desired levels. Values indicate desired thresholds for alignment. + incontext (dict, optional): Additional context provided to the decision-making system, which may affect its responses. + n_positive_samples (int): Number of samples to process assuming positive alignment with the target KDMAs. + n_negative_samples (int): Number of samples to process assuming negative or inverse alignment with the target KDMAs. + shuffle (bool): If True, shuffle the choices to potentially reduce positional bias in the decision-making process. + baseline (bool): If True, use a baseline decision-making model that does not consider specific KDMAs. + n_retries (int): The number of retry attempts to parse a successful response from the decision-making process. + + Returns: + tuple: + responses (list): A list of dictionaries where each dictionary contains the response from the decision-making system, the reasoning behind it, and the index of the chosen answer. + inference_pairs (list): A list of dictionaries capturing detailed information about each inference attempt for analysis and debugging. + + Raises: + RuntimeError: If any specified KDAMs in `target_kdmas` are not supported by the system. + + Notes: + This function leverages logging to trace both aligned and misaligned dialogs, only the first of each type is logged for brevity. + """ + inference_pairs = [] + + + # Check if baseline is not used and handle unsupported KDMAs if not baseline: unsupported_kdmas = {kdma_remapping.get(k, k) for k in target_kdmas.keys()} - kdmas if len(unsupported_kdmas) > 0: raise RuntimeError(f"KDMA(s) {unsupported_kdmas} not supported.") + + # Prefix for logging reasoning prefix = '{"Reasoning": "Because' responses = [] + # Flags to ensure we log certain types of dialog once logged_aligned_dialog = False logged_inverse_misaligned_dialog = False + + # Generate responses for positive samples for _ in range(n_positive_samples): if baseline: system_message = load_system_message() @@ -399,24 +469,27 @@ def aligned_decision_maker(self, question, choices, target_kdmas, incontext=None for kdma, value in target_kdmas.items()} system_message = load_system_message(system_message_keys) - indecies = list(range(len(choices))) + # Shuffle choices if required + indices = list(range(len(choices))) if shuffle: - random.shuffle(indecies) - shuffled_choices = [choices[i] for i in indecies] + random.shuffle(indices) + shuffled_choices = [choices[i] for i in indices] + # Build dialog with the system message and shuffled choices dialog = self.build_multiple_choice_dialog( question, shuffled_choices, system_message=system_message, incontext=incontext) - + # Log aligned dialog once for clarity if not logged_aligned_dialog: log.debug("[bold]*ALIGNED DIALOG*[/bold]", extra={"markup": True}) self.log_dialog(dialog) logged_aligned_dialog = True + # Attempt to parse a valid response multiple times good_parse = False for i in range(n_retries): high_response, inference_pair = self.respond_to_dialog(dialog, prefix=prefix) @@ -428,42 +501,48 @@ def aligned_decision_maker(self, question, choices, target_kdmas, incontext=None except RuntimeError as e: pass + # Fallback parsing strategy if normal parsing fails if not good_parse: reasoning, answer_idx, parse_method = Llama2SingleKDMAADM.bert_similarity_parse(high_response, shuffled_choices) + # Ensure an answer was parsed successfully log.explain('CHOSEN ANSWER IDX %s %s', answer_idx, shuffled_choices) assert answer_idx is not None, f'Failed to parse answer index from generated output: {low_response}' + # Store response details responses.append({ 'response': high_response, 'reasoning': reasoning, 'answer_idx': answer_idx, - 'shuffle_indecies': indecies, + 'shuffle_indices': indices, 'alignment': system_message_keys, 'aligned': True, 'parse_method': parse_method, }) - + # Repeat process for negative samples with inverse KDAM logic for _ in range(n_negative_sampels): system_message_keys = {kdma: 'high' if not value > 5 else 'low' for kdma, value in target_kdmas.items()} - indecies = list(range(len(choices))) + indices = list(range(len(choices))) if shuffle: - random.shuffle(indecies) - shuffled_choices = [choices[i] for i in indecies] + random.shuffle(indices) + shuffled_choices = [choices[i] for i in indices] + # Build dialog with inverse logic inverse_misaligned_dialog = self.build_multiple_choice_dialog( question, shuffled_choices, system_message=load_system_message(system_message_keys)) + # Log the first occurrence of an inverse misaligned dialog if not logged_inverse_misaligned_dialog: log.debug("[bold]*INVERSE MISALIGNED DIALOG*[/bold]", extra={"markup": True}) self.log_dialog(inverse_misaligned_dialog) logged_inverse_misaligned_dialog = True + # Attempt response parsing with retries good_parse = False for i in range(n_retries): low_response, inference_pair = self.respond_to_dialog(inverse_misaligned_dialog, prefix=prefix) @@ -475,16 +554,18 @@ def aligned_decision_maker(self, question, choices, target_kdmas, incontext=None except RuntimeError as e: pass + # Fallback parsing strategy if normal parsing fails if not good_parse: reasoning, answer_idx, parse_method = Llama2SingleKDMAADM.bert_similarity_parse(low_response, shuffled_choices) assert answer_idx is not None, f'Failed to parse answer index from generated output: {low_response}' + # Store response details responses.append({ 'response': low_response, 'reasoning': reasoning, 'answer_idx': answer_idx, - 'shuffle_indecies': indecies, + 'shuffle_indices': indices, 'alignment': system_message_keys, 'aligned': False, 'parse_method': parse_method, @@ -495,6 +576,23 @@ def aligned_decision_maker(self, question, choices, target_kdmas, incontext=None @staticmethod def calculate_votes(responses, choices): + """ + Calculates voting scores for each choice based on a list of responses. Responses that align with the desired outcome increase the score of the selected choice. Misaligned responses distribute a penalty among other choices. + + Parameters: + responses (list of dicts): Each dictionary contains information about a single response, including: + - 'answer_idx' (int or str): The index of the chosen answer. + - 'shuffle_indices' (list of int, optional): If present, it represents the original indices of the choices after shuffling. + - 'aligned' (bool): Indicates whether the response is aligned (True) or misaligned (False) with the desired outcome. + choices (list of str): A list of choices available for voting. + + Returns: + list of float: A list of normalized vote scores for each choice, where higher scores represent greater alignment with the desired outcome. + + Notes: + - The function handles cases where 'answer_idx' may not be an integer or could be out of the valid range of choices. + - Scores are adjusted by the minimum score to ensure all are non-negative and are then normalized to sum to 1. + """ choice_votes = [0] * len(choices) for response in responses: answer_idx = response['answer_idx'] @@ -509,8 +607,8 @@ def calculate_votes(responses, choices): if answer_idx >= len(choices): continue - if 'shuffle_indecies' in response: - answer_idx = response['shuffle_indecies'][int(answer_idx)] + if 'shuffle_indices' in response: + answer_idx = response['shuffle_indices'][int(answer_idx)] aligned = response['aligned'] @@ -717,11 +815,50 @@ def correct_json(self, invalid_json, verbose=True): return None def run_aligned_decision_maker_with_voting( - self, prompt, choices, alignment_target, n_positive_samples=5, n_negative_samples=5, baseline=False, shuffle=False): + self, + prompt, + choices, + alignment_target, + incontext= None, + n_positive_samples=5, + n_negative_samples=5, + baseline=False, + shuffle=False): + """ Executes a decision-making process with voting based on alignment targets and user-provided choices. + This method incorporates a mechanism for evaluating the alignment of choices with a specified target + using a set of positive and negative samples. + + Parameters: + prompt (str): The input prompt to which the decision-making model responds. + choices (list): A list of possible choices for the decision-maker to evaluate. + alignment_target (str): A target alignment criterion that guides the decision-making process. + incontext (list[dict], optional): Additional contextual information to provide to the model. Defaults to None. + n_positive_samples (int): Number of positive samples to use for aligning the choices with the target. Defaults to 5. + n_negative_samples (int): Number of negative samples to use for the alignment evaluation. Defaults to 5. + baseline (bool): Flag to determine whether to use a baseline model for comparison. Defaults to False. + shuffle (bool): Option to shuffle the choices before processing. This can help in reducing bias. Defaults to False. + + Returns: + tuple: A tuple containing: + - reasoning (str or None): The reasoning behind the selected choice, if available. + - answer_idx (int): The index of the choice selected as most aligned. + - responses (list): Detailed responses from the model for each choice. + - inference_pairs (list): Raw data pairs used in the inference process. + + Raises: + Exception: Captures and logs any exception that occurs during the vote calculation, defaulting choice scores to None if an error occurs. + + Notes: + This method leverages internal logging to trace the detailed responses and the computation of choice scores. + It is essential to ensure proper initialization of the logging and handling mechanisms to capture and utilize + the detailed debug outputs effectively. + + """ responses, inference_pairs = self.aligned_decision_maker( prompt, choices, alignment_target, + incontext=incontext, baseline=baseline, n_positive_samples=n_positive_samples, n_negative_sampels=n_negative_samples, @@ -755,40 +892,69 @@ def run_aligned_decision_maker_with_voting( for r in responses: assert r['answer_idx'] is not None - assert int(r['answer_idx']) < len(r['shuffle_indecies']) + assert int(r['answer_idx']) < len(r['shuffle_indices']) - if r['shuffle_indecies'][int(r['answer_idx'])] == answer_idx: + if r['shuffle_indices'][int(r['answer_idx'])] == answer_idx: reasoning = r['reasoning'] break return reasoning, answer_idx, responses, inference_pairs - def format_single_incontext_prompt(self, sample, labels): - prompt = sample['scenario'] + + def format_single_incontext_prompt(self, sample, labels, target_kdma_values): + """ + Formats an in-context prompt for use in a dialogue system where an assistant provides reasoned answers based on provided scores and target values. + + Parameters: + sample (dict): A dictionary containing the scenario description and choices. + - 'scenario' (str): The main question or scenario description. + - 'state' (str, optional): Additional state information to append to the scenario. + - 'choices' (list of str): Possible choices for the scenario. + labels (list of dicts): A list where each dictionary contains scores associated with each choice. + target_kdma_values (dict): A dictionary mapping a target attribute to its desired value. + + Returns: + list of dicts: A list containing structured data for the user and assistant roles, formatted for a dialogue interaction. + - Each dictionary contains 'role' (str) indicating if it's 'user' or 'assistant' and 'content' (str) which is the text associated with that role. + + Notes: + This function assumes there is exactly one key in `target_kdma_values`. + The assistant's answer includes reasoning why a particular choice is preferred based on the minimal distance to the target value in `labels`. + """ + # Combine scenario and additional state information if available + question = sample['scenario'] if sample['state'] is not None: - prompt += f'\n{sample["state"]}' + question += f'\n{sample["state"]}' + # Format choices as enumerated options for display + formatted_options = [f'({i}) {option}' for i, option in enumerate(sample['choices'])] + content = f"{question} {' '.join(formatted_options)}" + # Calculate the distance of each choice's score to the target value + target = next(iter(target_kdma_values)) + dist_to_target = [abs(score[target] - target_kdma_values[target]) for score in labels] + correct_answer_idx = np.argmin(dist_to_target) - [f'({i}) {option}' for i, option in enumerate(options)] - { - "role": "user", - "content": system_message - }, - { - "role": "assistant", - "content": invalid_json + # Map the target attribute to its descriptive wording + target_wording_mapping = { + "moral_deservingness": "morally deserving choice.", + "maximization": "maximization choice." } - for choice, label in zip(sample['choices'],labels): - level = 'high' if list(label.values())[0] > 5 else 'low' - attribute = list(label.keys())[0].replace('_', ' ') - prompt += f' If you had a {level} {attribute}, you would select {choice}.' - - return prompt + # Determine the adjective based on the value being higher or lower than a threshold + adjective = 'more' if target_kdma_values[target] > 5 else 'less' + # Construct the reasoning text using the correct choice and its descriptor + answer_text = f'Because {sample["choices"][correct_answer_idx]} is the {adjective} {target_wording_mapping.get(target, "specified attribute")}' + answer = f'{{"Reasoning": "{answer_text}", "answer": {correct_answer_idx}}}' + # Structure the dialog with user and assistant roles + prompt = [ + {"role": "user", "content": content}, + {"role": "assistant", "content": answer} + ] - #TODO: add prompt completetion here for choices as well. + return prompt + def __call__(self, sample, target_kdma_values, **kwargs): @@ -805,13 +971,13 @@ def __call__(self, sample, target_kdma_values, **kwargs): incontext_prompts = [] # Read dataset - with open(kwargs['dataset']) as f: + with open(kwargs['incontext']['dataset']) as f: dataset = json.load(f) #sam has both info in first element and labels in second element for sam in dataset: # if sam[0]['probe_id'] != sample['probe_id']: - # TODO: add a way to prevent having the sample as a knn if loading itself + # TODO: add a way to prevent (or ensure) having the sample as a knn if loading itself possible_samples.append(sam) if len(possible_samples) < kwargs['incontext']['number']: @@ -821,22 +987,37 @@ def __call__(self, sample, target_kdma_values, **kwargs): if kwargs['incontext']['method'] == 'random': chosen_sample = random.sample(possible_samples, kwargs['incontext']['number']) + elif kwargs['incontext']['method'] == 'bert_similarity': + # Extract Strings for each situation + possible_samples_parse = [] + for s, _ in possible_samples: + question = s['scenario'] + if s['state'] is not None: + question += f'\n{s["state"]}' + possible_samples_parse.append(question) + + # Create similarity scores between incontext dataset and find topk indices + from bert_score import score + _, _, F1 = score([prompt]*len(possible_samples_parse), possible_samples_parse, lang='en') + _, indices = torch.topk(F1, kwargs['incontext']['number']) + + # Make list of the top k for creating prompts + chosen_sample = [] + for i in indices: + chosen_sample.append(possible_samples[i]) else: - raise(f'"{kwargs["incontext"]["method"]}" is not a valid incontext method. Please use "random", ') - - # incontext_prompt_start = ' Here are some examples of similar problems with their attributes. ' + raise(f'"{kwargs["incontext"]["method"]}" is not a valid incontext method. Please use "random or bert_similarity", ') - # extra_prompts = [incontext_prompt_start] - + incontext_prompts = [] ci = 1 for cs, cl in chosen_sample: - extra_prompts.append(f' Example {ci}' + self.format_single_incontext_prompt(cs, cl)) + incontext_prompts.append(self.format_single_incontext_prompt(cs, cl, target_kdma_values)) ci += 1 - extra_prompts.append(' Given these similar examples, please answer the question for the following scenario. ') + # extra_prompts.append(' Given these similar examples, please answer the question for the following scenario. ') - extra_prompts = ''.join(extra_prompts) + # extra_prompts = ''.join(extra_prompts) # prompt = extra_prompts + prompt # if 'retriever' in kwargs: @@ -883,7 +1064,7 @@ def __call__(self, sample, target_kdma_values, **kwargs): prompt, choices, alignment_target, - incontext=None, + incontext=incontext_prompts, n_positive_samples=kwargs.get('n_positive_samples', 5), n_negative_samples=kwargs.get('n_negative_samples', 5), baseline=kwargs.get('baseline', False), From ce57312eeeedb5a0d89ae4085114172b606c9910 Mon Sep 17 00:00:00 2001 From: Christopher Date: Mon, 20 May 2024 17:41:39 -0400 Subject: [PATCH 08/11] Added Incontext learning dataset update Adding new incontext pulled from dataset extracted by running on the TA3 system. --- .../algorithms/llama_2_single_kdma_adm.py | 50 ++++++++++++------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/align_system/algorithms/llama_2_single_kdma_adm.py b/align_system/algorithms/llama_2_single_kdma_adm.py index 6717dd86..00f77e64 100644 --- a/align_system/algorithms/llama_2_single_kdma_adm.py +++ b/align_system/algorithms/llama_2_single_kdma_adm.py @@ -900,16 +900,15 @@ def run_aligned_decision_maker_with_voting( return reasoning, answer_idx, responses, inference_pairs - def format_single_incontext_prompt(self, sample, labels, target_kdma_values): """ Formats an in-context prompt for use in a dialogue system where an assistant provides reasoned answers based on provided scores and target values. Parameters: sample (dict): A dictionary containing the scenario description and choices. - - 'scenario' (str): The main question or scenario description. - - 'state' (str, optional): Additional state information to append to the scenario. - - 'choices' (list of str): Possible choices for the scenario. + - 'prompt' (str): The main question or scenario description. + - 'choices' (list of dicts): Possible choices for the scenario. + - Each choice is a dictionary with an 'unstructured' key containing the choice text. labels (list of dicts): A list where each dictionary contains scores associated with each choice. target_kdma_values (dict): A dictionary mapping a target attribute to its desired value. @@ -921,18 +920,33 @@ def format_single_incontext_prompt(self, sample, labels, target_kdma_values): This function assumes there is exactly one key in `target_kdma_values`. The assistant's answer includes reasoning why a particular choice is preferred based on the minimal distance to the target value in `labels`. """ - # Combine scenario and additional state information if available - question = sample['scenario'] - if sample['state'] is not None: - question += f'\n{sample["state"]}' + # Mapping of target attributes to their corresponding score keys + kdma_name_map = { + 'moral_deservingness': 'MoralDesert', + 'maximization': 'maximization', + } + + # Extract the main question from the sample + question = sample['prompt'] # Format choices as enumerated options for display - formatted_options = [f'({i}) {option}' for i, option in enumerate(sample['choices'])] + formatted_options = [f'({i}) {option["unstructured"]}' for i, option in enumerate(sample['choices'])] content = f"{question} {' '.join(formatted_options)}" - # Calculate the distance of each choice's score to the target value + # Extract the target attribute (assuming there's only one key in target_kdma_values) target = next(iter(target_kdma_values)) - dist_to_target = [abs(score[target] - target_kdma_values[target]) for score in labels] + + # Calculate the distance of each choice's score to the target value + dist_to_target = [] + for score in labels: + if kdma_name_map[target] in score: + # Multiply by 10 to match the rest of the KDMA's score range + dist = abs(score[kdma_name_map[target]] * 10 - target_kdma_values[target]) + else: + dist = float('inf') # If the target attribute is not in the scores, assign an infinite distance + dist_to_target.append(dist) + + # Determine the index of the choice with the minimum distance to the target value correct_answer_idx = np.argmin(dist_to_target) # Map the target attribute to its descriptive wording @@ -943,8 +957,9 @@ def format_single_incontext_prompt(self, sample, labels, target_kdma_values): # Determine the adjective based on the value being higher or lower than a threshold adjective = 'more' if target_kdma_values[target] > 5 else 'less' + # Construct the reasoning text using the correct choice and its descriptor - answer_text = f'Because {sample["choices"][correct_answer_idx]} is the {adjective} {target_wording_mapping.get(target, "specified attribute")}' + answer_text = f'Because {sample["choices"][correct_answer_idx]["unstructured"]} is the {adjective} {target_wording_mapping.get(target, "specified attribute")}' answer = f'{{"Reasoning": "{answer_text}", "answer": {correct_answer_idx}}}' # Structure the dialog with user and assistant roles @@ -954,7 +969,6 @@ def format_single_incontext_prompt(self, sample, labels, target_kdma_values): ] return prompt - def __call__(self, sample, target_kdma_values, **kwargs): @@ -990,10 +1004,8 @@ def __call__(self, sample, target_kdma_values, **kwargs): elif kwargs['incontext']['method'] == 'bert_similarity': # Extract Strings for each situation possible_samples_parse = [] - for s, _ in possible_samples: - question = s['scenario'] - if s['state'] is not None: - question += f'\n{s["state"]}' + for s in possible_samples: + question = s['input']['prompt'] possible_samples_parse.append(question) # Create similarity scores between incontext dataset and find topk indices @@ -1011,8 +1023,8 @@ def __call__(self, sample, target_kdma_values, **kwargs): incontext_prompts = [] ci = 1 - for cs, cl in chosen_sample: - incontext_prompts.append(self.format_single_incontext_prompt(cs, cl, target_kdma_values)) + for cs in chosen_sample: + incontext_prompts.append(self.format_single_incontext_prompt(cs['input'], cs['label'], target_kdma_values)) ci += 1 # extra_prompts.append(' Given these similar examples, please answer the question for the following scenario. ') From 1d835cfc2e30abdb8585304a6d56a1f103cd1366 Mon Sep 17 00:00:00 2001 From: Christopher Date: Mon, 20 May 2024 17:42:23 -0400 Subject: [PATCH 09/11] Adding more about the incontext learning (adding to previous commit) --- .../algorithms/llama_2_single_kdma_adm.py | 66 ++++++++++--------- 1 file changed, 35 insertions(+), 31 deletions(-) diff --git a/align_system/algorithms/llama_2_single_kdma_adm.py b/align_system/algorithms/llama_2_single_kdma_adm.py index 00f77e64..deb85166 100644 --- a/align_system/algorithms/llama_2_single_kdma_adm.py +++ b/align_system/algorithms/llama_2_single_kdma_adm.py @@ -970,67 +970,72 @@ def format_single_incontext_prompt(self, sample, labels, target_kdma_values): return prompt - def __call__(self, sample, target_kdma_values, **kwargs): - """ Build the prompt and send to the LLM to ask for a single KDMA + """ + Build the prompt and send it to the LLM to ask for a single KDMA (Key Decision-Making Attribute). + Parameters: + sample (dict): A dictionary containing the scenario, state, probe, and choices. + - 'scenario' (str): The main scenario description. + - 'state' (str, optional): Additional state information to append to the scenario. + - 'probe' (str): The specific question or probe to be answered. + - 'choices' (list of str): Possible choices for the scenario. + target_kdma_values (dict): A dictionary mapping a target attribute to its desired value. + kwargs (dict): Additional keyword arguments for in-context learning, retrievers, labels, etc. + - 'incontext' (dict, optional): Configuration for in-context learning. + - 'dataset' (str): Path to the in-context dataset. + - 'number' (int): Number of in-context samples to use. + - 'method' (str): Method to select in-context samples ('random' or 'bert_similarity'). + - 'labels' (list of dicts, optional): A list where each dictionary contains scores associated with each choice. + - 'n_positive_samples' (int, optional): Number of positive samples for decision making. + - 'n_negative_samples' (int, optional): Number of negative samples for decision making. + - 'baseline' (bool, optional): Whether to use a baseline approach. + - 'shuffle' (bool, optional): Whether to shuffle the choices. + Returns: + dict: A dictionary containing the selected choice and additional information. + - 'choice' (int): The index of the selected choice. + - 'info' (dict): Additional information including reasoning, responses, and raw data. """ prompt = sample['scenario'] if sample['state'] is not None: prompt += f'\n{sample["state"]}' + incontext_prompts = [] + if 'incontext' in kwargs: possible_samples = [] - incontext_prompts = [] # Read dataset with open(kwargs['incontext']['dataset']) as f: dataset = json.load(f) - #sam has both info in first element and labels in second element + # Populate possible samples from the dataset for sam in dataset: - # if sam[0]['probe_id'] != sample['probe_id']: - # TODO: add a way to prevent (or ensure) having the sample as a knn if loading itself possible_samples.append(sam) if len(possible_samples) < kwargs['incontext']['number']: - raise RuntimeError(f'Not enough possible incontext samples to learn from here.' - f'Only {len(possible_samples)} samples while asking for' - f'{kwargs["incontext"]["number"]} in context samples') + raise RuntimeError(f'Not enough possible in-context samples to learn from. Only {len(possible_samples)} samples available while asking for {kwargs["incontext"]["number"]} in-context samples.') if kwargs['incontext']['method'] == 'random': chosen_sample = random.sample(possible_samples, kwargs['incontext']['number']) elif kwargs['incontext']['method'] == 'bert_similarity': - # Extract Strings for each situation - possible_samples_parse = [] - for s in possible_samples: - question = s['input']['prompt'] - possible_samples_parse.append(question) + # Extract strings for each situation + possible_samples_parse = [s['input']['prompt'] for s in possible_samples] - # Create similarity scores between incontext dataset and find topk indices + # Create similarity scores between the in-context dataset and find top-k indices from bert_score import score _, _, F1 = score([prompt]*len(possible_samples_parse), possible_samples_parse, lang='en') - _, indices = torch.topk(F1, kwargs['incontext']['number']) + _, indices = torch.topk(F1, kwargs['incontext']['number']) # Make list of the top k for creating prompts - chosen_sample = [] - for i in indices: - chosen_sample.append(possible_samples[i]) + chosen_sample = [possible_samples[i] for i in indices] else: - raise(f'"{kwargs["incontext"]["method"]}" is not a valid incontext method. Please use "random or bert_similarity", ') + raise ValueError(f'"{kwargs["incontext"]["method"]}" is not a valid in-context method. Please use "random" or "bert_similarity".') - - incontext_prompts = [] - ci = 1 + # Create in-context prompts for cs in chosen_sample: incontext_prompts.append(self.format_single_incontext_prompt(cs['input'], cs['label'], target_kdma_values)) - ci += 1 - - # extra_prompts.append(' Given these similar examples, please answer the question for the following scenario. ') - - # extra_prompts = ''.join(extra_prompts) - # prompt = extra_prompts + prompt # if 'retriever' in kwargs: # # retriever_prompt = "How would you treat the following injuries: {}".format(prompt) @@ -1055,10 +1060,9 @@ def __call__(self, sample, target_kdma_values, **kwargs): # prompt += f'\nGiven the scenario and documentation above.. {sample["probe"]}' # else: - prompt += f'\n{sample["probe"]}' + prompt += f'\n{sample["probe"]}' choices = sample['choices'] - labels = kwargs.get('labels', {}) alignment_target = None From d61baa9c914b7f4b4b5b3795747b2193b0b8edac Mon Sep 17 00:00:00 2001 From: Christopher Date: Mon, 20 May 2024 17:43:41 -0400 Subject: [PATCH 10/11] Adding configs for different testing (high, low, incontext, and baseline) to run. Also adding launch.json to help others with running on VScode --- .vscode/launch.json | 77 ++++++++++++++++++- .../single_kdma_adm_config_baseline.yml | 17 ++++ adm_configs/single_kdma_adm_config_high.yml | 2 +- .../single_kdma_adm_config_high_incontext.yml | 7 +- adm_configs/single_kdma_adm_config_low.yml | 2 +- .../single_kdma_adm_config_low_incontext.yml | 5 +- 6 files changed, 100 insertions(+), 10 deletions(-) create mode 100644 adm_configs/single_kdma_adm_config_baseline.yml diff --git a/.vscode/launch.json b/.vscode/launch.json index 9170e854..93ace822 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -20,7 +20,8 @@ "--loglevel", "DEBUG", "--logfile-path", "${workspaceFolder}/results/high_incontext/output.log", "--save-input-output-to-path", "${workspaceFolder}/results/high_incontext/input-output.json", - "--save-alignment-score-to-path", "${workspaceFolder}/results/high_incontext/output-scores.json" + "--save-alignment-score-to-path", "${workspaceFolder}/results/high_incontext/output-scores.json", + "--training-session" ], "env": { "CUDA_VISIBLE_DEVICES": "1" @@ -42,10 +43,80 @@ "--loglevel", "DEBUG", "--logfile-path", "${workspaceFolder}/results/low_incontext/output.log", "--save-input-output-to-path", "${workspaceFolder}/results/low_incontext/input-output.json", - "--save-alignment-score-to-path", "${workspaceFolder}/results/low_incontext/output-scores.json" + "--save-alignment-score-to-path", "${workspaceFolder}/results/low_incontext/output-scores.json", + "--training-session" ], "env": { - "CUDA_VISIBLE_DEVICES": "0" + "CUDA_VISIBLE_DEVICES": "1" + } + }, + { + "name": "High Adept", + "type": "debugpy", + "request": "launch", + "console": "integratedTerminal", + "module": "align_system.cli.run_align_system", + "args": [ + "TA3ActionBased", + "--adm-config", "adm_configs/single_kdma_adm_config_high.yml", + "--username", "kitware-single-kdma-adm-aligned-no-negatives", + "--align-to-target", + "--session-type", "adept", + "--api_endpoint", "http://127.0.0.1:8080", + "--loglevel", "DEBUG", + "--logfile-path", "${workspaceFolder}/results/high/output.log", + "--save-input-output-to-path", "${workspaceFolder}/results/high/input-output.json", + "--save-alignment-score-to-path", "${workspaceFolder}/results/high/output-scores.json", + "--training-session" + ], + "env": { + "CUDA_VISIBLE_DEVICES": "2" + } + }, + { + "name": "Low Adept", + "type": "debugpy", + "request": "launch", + "console": "integratedTerminal", + "module": "align_system.cli.run_align_system", + "args": [ + "TA3ActionBased", + "--adm-config", "adm_configs/single_kdma_adm_config_low.yml", + "--username", "kitware-single-kdma-adm-aligned-no-negatives", + "--align-to-target", + "--session-type", "adept", + "--api_endpoint", "http://127.0.0.1:8080", + "--loglevel", "DEBUG", + "--logfile-path", "${workspaceFolder}/results/low/output.log", + "--save-input-output-to-path", "${workspaceFolder}/results/low/input-output.json", + "--save-alignment-score-to-path", "${workspaceFolder}/results/low/output-scores.json", + "--training-session" + ], + "env": { + "CUDA_VISIBLE_DEVICES": "3" + } + }, + { + "name": "Baseline Adept", + "type": "debugpy", + "request": "launch", + "console": "integratedTerminal", + "module": "align_system.cli.run_align_system", + "args": [ + "TA3ActionBased", + "--adm-config", "adm_configs/single_kdma_adm_config_baseline.yml", + "--username", "kitware-single-kdma-adm-aligned-no-negatives", + "--align-to-target", + "--session-type", "adept", + "--api_endpoint", "http://127.0.0.1:8080", + "--loglevel", "DEBUG", + "--logfile-path", "${workspaceFolder}/results/baseline/output.log", + "--save-input-output-to-path", "${workspaceFolder}/results/baseline/input-output.json", + "--save-alignment-score-to-path", "${workspaceFolder}/results/baseline/output-scores.json", + "--training-session" + ], + "env": { + "CUDA_VISIBLE_DEVICES": "3" } } ] diff --git a/adm_configs/single_kdma_adm_config_baseline.yml b/adm_configs/single_kdma_adm_config_baseline.yml new file mode 100644 index 00000000..55fd28e2 --- /dev/null +++ b/adm_configs/single_kdma_adm_config_baseline.yml @@ -0,0 +1,17 @@ +adm: + name: 'SingleKDMAADM' + init_kwargs: + hf_model: meta-llama/Llama-2-7b-chat-hf + precision: half + temperature: 0.7 + + inference_kwargs: + baseline: true + n_negative_samples: 0 + n_positive_samples: 1 + shuffle: true + +alignment_target_override: + id: ADEPT-metrics_eval-alignment-target-train-LOW + kdma_values: + - {kdma: MoralDesert, value: 0} diff --git a/adm_configs/single_kdma_adm_config_high.yml b/adm_configs/single_kdma_adm_config_high.yml index 384427f8..646c27c2 100644 --- a/adm_configs/single_kdma_adm_config_high.yml +++ b/adm_configs/single_kdma_adm_config_high.yml @@ -6,7 +6,7 @@ adm: temperature: 0.7 inference_kwargs: - baseline: true + baseline: false n_negative_samples: 0 n_positive_samples: 1 shuffle: true diff --git a/adm_configs/single_kdma_adm_config_high_incontext.yml b/adm_configs/single_kdma_adm_config_high_incontext.yml index d564d779..51c9762b 100644 --- a/adm_configs/single_kdma_adm_config_high_incontext.yml +++ b/adm_configs/single_kdma_adm_config_high_incontext.yml @@ -6,14 +6,15 @@ adm: temperature: 0.7 inference_kwargs: - baseline: true + baseline: false n_negative_samples: 0 n_positive_samples: 1 shuffle: true incontext: number: 5 - method: random - dataset: ../datasets/metrics-eval/bbn/metrics-eval-train-renamed.json + method: bert_similarity + # dataset: ../datasets/metrics-eval/bbn/metrics-eval-train-renamed.json + dataset: /data/shared/samba/integrated_results_metrics_eval/captured_dataset_for_chris/baseline_adept_high-1715105775-input-output.json alignment_target_override: id: ADEPT-metrics_eval-alignment-target-train-HIGH diff --git a/adm_configs/single_kdma_adm_config_low.yml b/adm_configs/single_kdma_adm_config_low.yml index 55fd28e2..70a9d648 100644 --- a/adm_configs/single_kdma_adm_config_low.yml +++ b/adm_configs/single_kdma_adm_config_low.yml @@ -6,7 +6,7 @@ adm: temperature: 0.7 inference_kwargs: - baseline: true + baseline: false n_negative_samples: 0 n_positive_samples: 1 shuffle: true diff --git a/adm_configs/single_kdma_adm_config_low_incontext.yml b/adm_configs/single_kdma_adm_config_low_incontext.yml index 90a65b35..e8fb6567 100644 --- a/adm_configs/single_kdma_adm_config_low_incontext.yml +++ b/adm_configs/single_kdma_adm_config_low_incontext.yml @@ -6,14 +6,15 @@ adm: temperature: 0.7 inference_kwargs: - baseline: true + baseline: false n_negative_samples: 0 n_positive_samples: 1 shuffle: true incontext: number: 5 method: random - dataset: ../datasets/metrics-eval/bbn/metrics-eval-train-renamed.json + # dataset: ../datasets/metrics-eval/bbn/metrics-eval-train-renamed.json + dataset: /data/shared/samba/integrated_results_metrics_eval/captured_dataset_for_chris/baseline_adept_high-1715105775-input-output.json alignment_target_override: id: ADEPT-metrics_eval-alignment-target-train-LOW From 6f2ee925844803a5a7a731e29a5603186e27475e Mon Sep 17 00:00:00 2001 From: Christopher Date: Wed, 22 May 2024 13:47:00 -0400 Subject: [PATCH 11/11] Adding more configs --- .vscode/launch.json | 33 ++++++++++++++++--- .../single_kdma_adm_config_high_baseline.yml | 17 ++++++++++ ...> single_kdma_adm_config_low_baseline.yml} | 0 .../single_kdma_adm_config_low_incontext.yml | 2 +- 4 files changed, 46 insertions(+), 6 deletions(-) create mode 100644 adm_configs/single_kdma_adm_config_high_baseline.yml rename adm_configs/{single_kdma_adm_config_baseline.yml => single_kdma_adm_config_low_baseline.yml} (100%) diff --git a/.vscode/launch.json b/.vscode/launch.json index 93ace822..3b2439cb 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -97,22 +97,45 @@ } }, { - "name": "Baseline Adept", + "name": "High Baseline Adept", "type": "debugpy", "request": "launch", "console": "integratedTerminal", "module": "align_system.cli.run_align_system", "args": [ "TA3ActionBased", - "--adm-config", "adm_configs/single_kdma_adm_config_baseline.yml", + "--adm-config", "adm_configs/single_kdma_adm_config_high_baseline.yml", "--username", "kitware-single-kdma-adm-aligned-no-negatives", "--align-to-target", "--session-type", "adept", "--api_endpoint", "http://127.0.0.1:8080", "--loglevel", "DEBUG", - "--logfile-path", "${workspaceFolder}/results/baseline/output.log", - "--save-input-output-to-path", "${workspaceFolder}/results/baseline/input-output.json", - "--save-alignment-score-to-path", "${workspaceFolder}/results/baseline/output-scores.json", + "--logfile-path", "${workspaceFolder}/results/high_baseline/output.log", + "--save-input-output-to-path", "${workspaceFolder}/results/high_baseline/input-output.json", + "--save-alignment-score-to-path", "${workspaceFolder}/results/high_baseline/output-scores.json", + "--training-session" + ], + "env": { + "CUDA_VISIBLE_DEVICES": "3" + } + }, + { + "name": "Low Baseline Adept", + "type": "debugpy", + "request": "launch", + "console": "integratedTerminal", + "module": "align_system.cli.run_align_system", + "args": [ + "TA3ActionBased", + "--adm-config", "adm_configs/single_kdma_adm_config_low_baseline.yml", + "--username", "kitware-single-kdma-adm-aligned-no-negatives", + "--align-to-target", + "--session-type", "adept", + "--api_endpoint", "http://127.0.0.1:8080", + "--loglevel", "DEBUG", + "--logfile-path", "${workspaceFolder}/results/low_baseline/output.log", + "--save-input-output-to-path", "${workspaceFolder}/low_baseline/baseline/input-output.json", + "--save-alignment-score-to-path", "${workspaceFolder}/low_baseline/baseline/output-scores.json", "--training-session" ], "env": { diff --git a/adm_configs/single_kdma_adm_config_high_baseline.yml b/adm_configs/single_kdma_adm_config_high_baseline.yml new file mode 100644 index 00000000..384427f8 --- /dev/null +++ b/adm_configs/single_kdma_adm_config_high_baseline.yml @@ -0,0 +1,17 @@ +adm: + name: 'SingleKDMAADM' + init_kwargs: + hf_model: meta-llama/Llama-2-7b-chat-hf + precision: half + temperature: 0.7 + + inference_kwargs: + baseline: true + n_negative_samples: 0 + n_positive_samples: 1 + shuffle: true + +alignment_target_override: + id: ADEPT-metrics_eval-alignment-target-train-HIGH + kdma_values: + - {kdma: MoralDesert, value: 1} diff --git a/adm_configs/single_kdma_adm_config_baseline.yml b/adm_configs/single_kdma_adm_config_low_baseline.yml similarity index 100% rename from adm_configs/single_kdma_adm_config_baseline.yml rename to adm_configs/single_kdma_adm_config_low_baseline.yml diff --git a/adm_configs/single_kdma_adm_config_low_incontext.yml b/adm_configs/single_kdma_adm_config_low_incontext.yml index e8fb6567..a23452cb 100644 --- a/adm_configs/single_kdma_adm_config_low_incontext.yml +++ b/adm_configs/single_kdma_adm_config_low_incontext.yml @@ -12,7 +12,7 @@ adm: shuffle: true incontext: number: 5 - method: random + method: bert_similarity # dataset: ../datasets/metrics-eval/bbn/metrics-eval-train-renamed.json dataset: /data/shared/samba/integrated_results_metrics_eval/captured_dataset_for_chris/baseline_adept_high-1715105775-input-output.json