I am working on a script that runs an offsite prompt tuning model using a pre-trained Vicuna-7b model on GPU (CUDA). However, when I attempt to execute the code, I encounter the error shown below: it seems that not all of the tensors are being transferred to the same device.
I'm working on an LLM project; it currently runs on an HPC cluster with an A100 40GB GPU.
This is the train_opt.py file that I'm trying to run:
"""Offsite Prompt Tuning"""
import torch
from copy import deepcopy
import argparse, os
import numpy as np
import wandb
from transformers import set_seed, AutoModelForCausalLM, AutoTokenizer
from utils.utils import make_if_not_exist, str2bool
from utils.template import get_eval_template
from utils.dln import BackwardInstructGenerator
from utils.data import get_dataset
from utils.dp import LDGumbelMechanism, ExpMechanism
from utils.evaluate import Evaluator
CHECKPOINT_ROOT = './checkpoint'
def config_args(parser: argparse.ArgumentParser):
    parser.add_argument('--ape_mode', default='bwd', choices=['bwd', 'iid_ibwd'], type=str,
                        help="bwd: backward update by DLN1, will resample prompts based on the same sets of demos;\n"
                             "iid_ibwd: Will not iteratively update instruct but sample each instruct "
                             "independently. This is similar to bwd but will use different demos for each iteration.;\n"
                             "Note to use iid_ibwd for dp")
    parser.add_argument('--seed', default=0, type=int, help='random seed')
    parser.add_argument('--data', default='sst2', choices=['sst2', 'trec', 'disaster', 'mpqa'])
    parser.add_argument('--model', default='lmsys/vicuna-7b-v1.3', help="model for generating prompts.")
    parser.add_argument('--instruct_type', default='vicuna', type=str, help='instruction format.')
    parser.add_argument('--batch_size', default=1, type=int, help="batch size for evaluation")
    parser.add_argument('--holdout_ratio', default=0.01, type=float, help='ratio of training data to be held out for validation.')
    parser.add_argument('--test_ratio', default=1., type=float, help='ratio of testing data to be used.')
    # prompt generation
    parser.add_argument('--steps', default=1, type=int, help='num of iterations of generation. Will iteratively update prompts.')
    parser.add_argument('--num_prompt', default=5, type=int, help='num of prompt to generate')
    parser.add_argument('--num_demos', default=5, type=int, help='num of demos to be used for prompt generation or ICL.')
    parser.add_argument('--balance_demos', default=False, type=str2bool, help='balance demos in meta-prompts. Do not use this for DP cases.')
    parser.add_argument('--max_new_tokens', default=128, type=int, help='max num of tokens for prompt')
    parser.add_argument('--ensemble_gen', default=False, type=str2bool, help='ensemble all meta prompts.')
    parser.add_argument('--ensemble_num', default=205, type=int, help='num of demo subsets for ensemble.')
    parser.add_argument('--gen_batch_size', default=1, type=int, help='batch size when generating prompts.')
    parser.add_argument('--gen_temp', default=0.9, type=float,
                        help='generation temperature on sampling tokens. 1 means no temp. Smaller values means less variance.')
    parser.add_argument('--rep_penalty', default=1., type=float,
                        help='repetition penalty. Larger value, less repetition.')
    parser.add_argument('--rm_eval_item_name', type=str2bool, default=False, help='Remove `Input:` in prompts. Required by DLN-1')
    # dp
    parser.add_argument('--dp_eps', default=None, type=float,
                        help='eps for DP. Recommend value: target_total_eps / max_new_token')
    parser.add_argument('--dp_delta', default=None, type=float,
                        help='delta for DP. Recommend value: target_total_delta / max_new_token')
    parser.add_argument('--target_eps', default=None, type=float,
                        help='target total eps for DP before generation stops.')
    parser.add_argument('--tokenwise_gen', default=False, type=str2bool,
                        help='generate prompt token by token. For each token, the batch of demos will be resampled.')
def render_runname(args):
    """Render args into a single string."""
    args.run_name = f"{args.data}/{args.ape_mode}/{args.model.replace('/', '_')}/" + f"s{args.seed}_pt{args.num_prompt}"
    if args.steps != 1:
        args.run_name += f"_st{args.steps}"
    if args.max_new_tokens != 1024:
        args.run_name += f"_mt{args.max_new_tokens}"
    if args.ensemble_gen:
        args.run_name += f'_ens-gen-{args.ensemble_num}'
        if args.tokenwise_gen:
            args.run_name += f'-twg'
    else:
        assert not args.tokenwise_gen, "Not allow tokenwise_gen for non-ensemble generation. Actually, it is unnecessary and inefficient."
    if args.balance_demos:
        args.run_name += '_bal'
    if args.rm_eval_item_name:
        args.run_name += '_rm-ein'
    # sampling
    args.run_name += f'_temp{args.gen_temp}'
    if args.rep_penalty != 1.:
        args.run_name += f'_rp{args.rep_penalty}'
    # dp
    if args.dp_eps is not None:
        args.run_name += f'_dp{args.dp_eps}d{args.dp_delta}_max{args.target_eps}'
        assert args.ape_mode in ["iid_ibwd"], "Require ape_mode to be `iid_ibwd` which implement the subsampling."
    print(f"Run name: {args.run_name}")
    args.save_path = os.path.join(CHECKPOINT_ROOT, args.run_name)
    make_if_not_exist(args.save_path)
    args.save_file = os.path.join(args.save_path, f"auto_prompt.pth")
def print_estimate_dp(dp_engine, val_dp_engine):
    print(f"DP configurations:")
    print(f"- generation dp")
    # estimate max dp expense based on estimated queries.
    if args.dp_delta * args.subsampling_rate * args.num_prompt * args.max_new_tokens > args.target_delta:
        print(f" !!WARNING To run the full generation, dp_delta should be smaller "
              f"than {args.target_delta / (args.subsampling_rate * args.num_prompt * args.max_new_tokens)}")
        print(f" Estimate max eps by reducing num of gen tokens from "
              f"{args.num_prompt * args.max_new_tokens} to {int(args.target_delta / args.subsampling_rate / args.dp_delta)}")
        eps, delta = dp_engine.get_dp_expense(int(args.target_delta / args.subsampling_rate / args.dp_delta))
    else:
        eps, delta = dp_engine.get_dp_expense(args.num_prompt * args.max_new_tokens)
    print(f" estimated max eps: eps={eps:.3f} delta={delta:g}")
    # DP for validation
    print(f"- val dp")
    print(f" noise scale: {1 / args.target_eps:.4f}")
    val_eps, val_delta = val_dp_engine.get_dp_expense(1, 1)
    print(f" estimated val eps: eps={val_eps:.3f} delta={val_delta:g}")
def main(arg_list=None):
    parser = argparse.ArgumentParser()
    # device
    parser.add_argument('--device', default='cuda')
    parser.add_argument('--no_wandb', action='store_true', help='disable wandb')
    parser.add_argument('--skip_eval', action='store_true', help='skip eval on holdout (and ranking) to save time')
    config_args(parser)
    global args
    if arg_list is not None:
        args = parser.parse_args(arg_list)
    else:
        args = parser.parse_args()
    set_seed(args.seed)
    rng = np.random.RandomState(args.seed)
    render_runname(args)
    make_if_not_exist(args.save_path)
    wandb.init(project='dp-opt',
               name=args.run_name, config=vars(args),
               mode='offline' if args.no_wandb else 'online')
    save_dict = {'config': vars(args)}
    # load data
    dataset, label_words = get_dataset(args.data, args.holdout_ratio, args.test_ratio, rng)
    # config DP
    if args.dp_eps is not None:
        n_sample = len(dataset['train'])
        dp_batch_size = args.ensemble_num * args.num_demos
        args.target_delta = 1 / n_sample
        args.subsampling_rate = dp_batch_size / n_sample
        wandb.config['target_delta'] = args.target_delta
        wandb.config['subsampling_rate'] = args.subsampling_rate
        # DP for training
        dp_engine = LDGumbelMechanism(
            args.dp_eps, args.dp_delta,
            target_eps=args.target_eps, target_delta=args.target_delta,
            subsampling_rate=args.subsampling_rate,
            fail_mode='retry')
        val_dp_engine = ExpMechanism(args.target_eps,
                                     target_eps=args.target_eps, target_delta=args.target_delta)
        print_estimate_dp(dp_engine, val_dp_engine)
    else:
        dp_engine = None
        val_dp_engine = None
    # Load model
    model_args = {'revision': 'main'}
    if args.device == 'cuda':
        model_args['device_map'] = 'cuda'
        model_args['torch_dtype'] = torch.float16
    model = AutoModelForCausalLM.from_pretrained('./vicuna-7b-v1.3/',
                                                 #args.model,
                                                 low_cpu_mem_usage=True,
                                                 offload_folder="offload",
                                                 **model_args)
    tokenizer = AutoTokenizer.from_pretrained('./vicuna-7b-v1.3/',
                                              #args.model,
                                              use_fast=False,
                                              revision='main',
                                              device_map='cuda')
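    # (side note I added while debugging: as far as I know, AutoTokenizer.from_pretrained
    #  does not use a device_map argument -- tokenizers stay on CPU -- so this kwarg is
    #  most likely ignored)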
    if 'gpt2' in args.model or 'llama' in args.model.lower():
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = 'left'
    tokenizer.truncation_side = 'left'
    disable_att_mask = ('llama' in args.model) or ('vicuna' in args.model)  # llama may have bugs on logits
    # Prepare evaluator
    instruct_type, eval_template, init_instruct = get_eval_template(
        args.model, args.data, add_item_name=not args.rm_eval_item_name, instruct_type=args.instruct_type)
    evaluator = Evaluator(eval_template, label_words, model, tokenizer, dataset, args.batch_size, device=args.device)
    # Prepare instruction generator.
    if args.ape_mode in ['bwd', 'iid_ibwd']:
        instruct_generator = BackwardInstructGenerator(
            model, tokenizer, args.device, args.max_new_tokens,
            label_words, instruct_type, ensemble_gen=args.ensemble_gen,
            disable_att_mask=disable_att_mask, gen_batch_size=args.gen_batch_size,
            gen_temperature=args.gen_temp,
            rep_penalty=args.rep_penalty,
            dp_engine=dp_engine,
            balance_demos=args.balance_demos,
            tokenwise_gen=args.tokenwise_gen,
        )
    else:
        raise NotImplementedError(f'ape_mode: {args.ape_mode}')
    best_holdout_acc = 0
    global_best_holdout_test_acc = 0
    do_early_stop_cnt = 2 if args.steps > 1 else 1
    best_save_dict = {}
    for step in range(args.steps):
        print("\n\n" + "="*20 + "\n" + "="*3 + f" Step {step} " + "="*3 + "\n" + "="*20 + "\n")
        # Step 1: generate prompts
        if args.ape_mode == 'bwd':
            # each instruction will be sampled from the same demonstration set.
            generated_instructs, used_demos = instruct_generator.generate_instruct_bwd(
                init_instruct, args.num_demos, dataset['train'], rng, evaluator,
                num_prompt=args.num_prompt,
                num_meta_prompt=args.ensemble_num if args.ensemble_gen else None
            )
        elif args.ape_mode == 'iid_ibwd':
            # each instruction will be sampled from the non-overlap (iid) demonstration sets.
            generated_instructs, used_demos = instruct_generator.iterative_generate(
                init_instruct, args.num_demos, dataset['train'], rng, evaluator,
                num_prompt=args.num_prompt,
                num_meta_prompt=args.ensemble_num if args.ensemble_gen else None,
                verbose=not args.ensemble_gen,
                iid_instruct=True,  # will make each instruct generated independently.
            )
        else:
            raise NotImplementedError(f"Unknown ape_mode: {args.ape_mode}")
        # process generated instructions.
        assert len(generated_instructs) >= 1, "Fail to generate instructions."
        if val_dp_engine is None:
            save_dict['all generated_instructs'] = generated_instructs
            unique_instructs = list(set(generated_instructs))
            if len(unique_instructs) < len(generated_instructs):
                print(f"Found duplicated instructs with {len(unique_instructs)}/{len(generated_instructs)} unique prompts.")
                print(f"Removed duplicated instructs.")
            generated_instructs = unique_instructs
        # log & save
        if dp_engine is not None:
            final_eps, final_delta = dp_engine.get_dp_expense()
            wandb.summary['final eps'] = final_eps
            wandb.summary['final delta'] = final_delta
            print(f"Final DP: eps={final_eps:.3f}, delta={final_delta}")
        else:
            final_eps, final_delta = 0., 0.
        save_dict['generated_instructs'] = generated_instructs
        save_dict['used_demos'] = used_demos
        torch.save(save_dict, args.save_file)
        print(f"save results => {args.save_file}")
        if not args.skip_eval or args.steps > 1:
            # Step 2: Evaluate and find the best instruct
            instruct_metrics, save_dict = evaluator.find_best_instruct(generated_instructs, save_dict, dp_engine=val_dp_engine)
            best_holdout_idx = save_dict['best_holdout_idx']
            step_best_holdout_acc = instruct_metrics['holdout acc'][best_holdout_idx]
            if val_dp_engine is not None:
                val_eps, val_delta = val_dp_engine.get_dp_expense()
                if val_eps > final_eps:
                    final_eps = val_eps
                if val_delta > final_delta:
                    final_delta = val_delta
                wandb.summary['final eps'] = final_eps
                wandb.summary['final delta'] = final_delta
                print(f"Final DP w/ val: eps={final_eps}, delta: {final_delta}")
            # save
            torch.save(save_dict, args.save_file)
            print(f"save results => {args.save_file}")
            if step_best_holdout_acc > best_holdout_acc:
                print(f"best holdout acc changed: from {best_holdout_acc} to {step_best_holdout_acc}")
                print(f"\nUpdate instruct from\n[START]{init_instruct}[END]")
                init_instruct = generated_instructs[best_holdout_idx]
                print(f"to the selected instruct\n[START]{init_instruct}[END]")
                best_holdout_acc = step_best_holdout_acc
                global_best_holdout_test_acc = save_dict['best_holdout_test_acc']
                best_save_dict = deepcopy(save_dict)
            else:
                print(f"\nUnchanged best instruct:\n[START]{init_instruct}[END]")
                print(f" Unchanged best_holdout_acc: {best_holdout_acc}")
                do_early_stop_cnt -= 1
                print(f"Will early stop in {do_early_stop_cnt} more unchanged.")
            # move to last
            torch.save(save_dict, os.path.join(args.save_path, f"last.pth"))
            # restore the best to the default file.
            torch.save(best_save_dict, args.save_file)
            wandb.log({
                'step': step,
                'best_holdout_acc': best_holdout_acc,
                'step_best_holdout_acc': step_best_holdout_acc,
                'step_test_acc': save_dict['best_holdout_test_acc'],
                'global_best_holdout_test_acc': global_best_holdout_test_acc,
            })
            if do_early_stop_cnt == 0:
                print(f"Early stop at step {step}/{args.steps}...")
                break
    return best_save_dict, save_dict


if __name__ == '__main__':
    main()
The error I'm facing is: RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!
Entire terminal output:
Run name: sst2/iid_ibwd/lmsys_vicuna-7b-v1.3/s0_pt40_mt50_ens-gen-205_temp1.1
Loading checkpoint shards: 100%|██████████| 2/2 [00:57<00:00, 28.88s/it]
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message
====================
=== Step 0 ===
====================
dln-fwd: 0%| | 0/66676 [00:00<?, ?it/s]Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
dln-fwd: 100%|██████████| 66676/66676 [34:37<00:00, 32.09it/s, acc: 0.858]
[Iter 0/40] generating prompt
>>> Meta prompt #0:
[START]A student is completing a task that requires producing a text output from a text input. The student receives an instruction that describes how to produce the output given each input. The student has made some errors. Your task is to improve the instruction such that the student can fix the errors.
This was the instruction.
Instruction: Classify the input text as positive or negative.
# Student successes
Input: it's almost impossible not to be swept away by the sheer beauty of his images .
Correct Output: positive
Input: a sick , twisted sort of way
Correct Output: negative
Input: an elegant film with often surprising twists and an intermingling of naiveté and sophistication
Correct Output: positive
Input: is trying to dupe the viewer into taking it all as very important simply because the movie is ugly to look at and not a hollywood product
Correct Output: negative
# Student errors
Input: on a 10-year delay
Student Output: negative
Correct Ouput: positive
Improve the instruction to fix the student errors. Clarify the instruction by adding few words or a short sentence. Be concise
Improved Instruction: [APE][END]
Traceback (most recent call last):
File "/lustre06/project/6097055/ombh/DP-OPT/train_opt.py", line 301, in <module>
main()
File "/lustre06/project/6097055/ombh/DP-OPT/train_opt.py", line 216, in main
generated_instructs, used_demos = instruct_generator.iterative_generate(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/lustre06/project/6097055/ombh/DP-OPT/utils/dln.py", line 242, in iterative_generate
_generated_instructs, _used_demos = self.generate_instruct_bwd(cur_instruct, num_demos, dataset, rng, evaluator, num_prompt=1, num_meta_prompt=num_meta_prompt, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/lustre06/project/6097055/ombh/DP-OPT/utils/dln.py", line 280, in generate_instruct_bwd
return self._generate_instruct_bwd(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/lustre06/project/6097055/ombh/DP-OPT/utils/dln.py", line 409, in _generate_instruct_bwd
instructs = self.forward_generate_prompt(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/lustre06/project/6097055/ombh/DP-OPT/utils/dln.py", line 128, in forward_generate_prompt
smp_output_ids = ensemble_generate(
^^^^^^^^^^^^^^^^^^
File "/lustre06/project/6097055/ombh/DP-OPT/utils/ensemble.py", line 84, in ensemble_generate
output_ids = greedy_search(model, input_ids, attention_mask, eos_token_id, pad_token_id,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/lustre06/project/6097055/ombh/DP-OPT/utils/ensemble.py", line 191, in greedy_search
next_token_scores = logits_processor(cur_input_ids, next_token_logits)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ombh/envs/DP-OPT/lib/python3.11/site-packages/transformers/generation/logits_process.py", line 104, in __call__
scores = processor(input_ids, scores)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ombh/envs/DP-OPT/lib/python3.11/site-packages/transformers/generation/logits_process.py", line 163, in __call__
eos_token_mask = isin_mps_friendly(vocab_tensor, self.eos_token_id)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ombh/envs/DP-OPT/lib/python3.11/site-packages/transformers/pytorch_utils.py", line 328, in isin_mps_friendly
return torch.isin(elements, test_elements)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument test_elements in method wrapper_CUDA_isin_Tensor_Tensor)
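For reference, the failing call in the last frame is torch.isin, and a minimal standalone snippet (hypothetical, not taken from my project) that reproduces this class of mismatch looks like this:

import torch

# torch.isin requires both tensors to be on the same device
# (assumes a CUDA device is available).
vocab_tensor = torch.arange(10, device='cuda')   # lives on cuda:0
eos_token_id = torch.tensor([2])                 # stays on cpu
torch.isin(vocab_tensor, eos_token_id)           # RuntimeError: Expected all tensors to be on the same device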
What could be causing this error? How should I proceed with debugging or fixing this issue?
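One workaround I'm considering (not sure whether it fixes the root cause or just the symptom) is to push any eos_token_id tensors held by the logits processors onto the model's device before my custom greedy_search in utils/ensemble.py calls them. move_processor_eos_to_device below is a hypothetical helper I would add myself, not something provided by transformers:

import torch

def move_processor_eos_to_device(logits_processor, device):
    # Some logits processors (e.g. the min-length ones) keep eos_token_id as a
    # tensor; if that tensor was created on CPU while the logits sit on cuda:0,
    # torch.isin inside the processor fails with the error above.
    for processor in logits_processor:
        eos = getattr(processor, 'eos_token_id', None)
        if isinstance(eos, torch.Tensor) and eos.device != torch.device(device):
            processor.eos_token_id = eos.to(device)

# e.g. in utils/ensemble.py, before the decoding loop that calls
# logits_processor(cur_input_ids, next_token_logits):
# move_processor_eos_to_device(logits_processor, model.device)

Would something like this be a reasonable fix, or does the mismatch point to a problem with how I load the model (device_map) or with the transformers version I'm using?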