Named Entity Recognition: the bert_softmax model
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
import os
os.chdir('/content/drive/MyDrive/chinese task/CLUENER2020')

# Install dependencies
!pip install transformers datasets seqeval
import os
import json
import logging
import numpy as np
import pandas as pd
import config
# Load the preprocessed .npz datasets
# Without allow_pickle=True, recent NumPy versions (where it defaults to False) raise "Object arrays cannot be loaded when allow_pickle=False".
train_data=np.load('./data/train.npz',allow_pickle=True)
val_data=np.load('./data/dev.npz',allow_pickle=True)
test_data=np.load('./data/test.npz',allow_pickle=True)

test_data.files

Load the data from the .npz files into pandas and replace the string labels with numeric ids so they can be fed to the model.

# Convert to DataFrames
import pandas as pd
# sample(frac=1.0) shuffles the training set
train_df=pd.concat([pd.DataFrame(train_data['words'],columns=['words']),
                    pd.DataFrame(train_data['labels'],columns=['labels'])],axis=1).sample(frac=1.0).rename(columns={'labels':'labels0'})
# The validation and test sets do not need shuffling
val_df=pd.concat([pd.DataFrame(val_data['words'],columns=['words']),
                  pd.DataFrame(val_data['labels'],columns=['labels'])],axis=1).rename(columns={'labels':'labels0'})

test_df=pd.concat([pd.DataFrame(test_data['words'],columns=['words']),
                   pd.DataFrame(test_data['labels'],columns=['labels'])],axis=1).rename(columns={'labels':'labels0'})


# Convert the BIOS string labels of the train/validation sets to integer ids; words and labels are already aligned at this point
def trans(labels):
    labels=list(labels)
    nums=[]
    for label in labels:
        nums.append(config.label2id[label])
    return nums

train_df['labels0']=train_df['labels0'].map(lambda x: trans(x))
val_df['labels0']=val_df['labels0'].map(lambda x: trans(x))

test_df['labels0']=test_df['labels0'].map(lambda x: trans(x))
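For instance (assuming config.label2id matches the label2id mapping listed further below, where B-name is 7 and I-name is 17):

print(trans(['B-name', 'I-name', 'O']))  # expected: [7, 17, 0]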
val_df
words labels0
0 [彭, 小, 军, 认, 为, ,, 国, 内, 银, 行, 现, 在, 走, 的, 是, ... [7, 17, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
1 [温, 格, 的, 球, 队, 终, 于, 又, 踢, 了, 一, 场, 经, 典, 的, ... [7, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
2 [突, 袭, 黑, 暗, 雅, 典, 娜, 》, 中, R, i, d, d, i, c, ... [4, 14, 14, 14, 14, 14, 14, 14, 0, 7, 17, 17, ...
3 [郑, 阿, 姨, 就, 赶, 到, 文, 汇, 路, 排, 队, 拿, 钱, ,, 希, ... [0, 0, 0, 0, 0, 0, 1, 11, 11, 0, 0, 0, 0, 0, 0...
4 [我, 想, 站, 在, 雪, 山, 脚, 下, 你, 会, 被, 那, 巍, 峨, 的, ... [0, 0, 0, 0, 10, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0...
... ... ...
1338 [在, 这, 个, 非, 常, 喜, 庆, 的, 日, 子, 里, ,, 我, 们, 首, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1339 [姜, 哲, 中, :, 公, 共, 之, 敌, 1, -, 1, 》, 、, 《, 神, ... [6, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16...
1340 [目, 前, ,, 日, 本, 松, 山, 海, 上, 保, 安, 部, 正, 在, 就, ... [0, 0, 0, 5, 15, 15, 15, 15, 15, 15, 15, 15, 0...
1341 [也, 就, 是, 说, 英, 国, 人, 在, 世, 博, 会, 上, 的, 英, 国, ... [0, 0, 0, 0, 0, 0, 0, 0, 10, 20, 20, 0, 0, 0, ...
1342 [另, 外, 意, 大, 利, 的, P, l, a, y, G, e, n, e, r, ... [0, 0, 0, 0, 0, 0, 2, 12, 12, 12, 12, 12, 12, ...

1343 rows × 2 columns

word_ids maps every subtoken position to the index of the word it came from, with special tokens mapped to None. With this list we can align the subtokens with the original words and their labels, and fill the [CLS] and [SEP] positions with -100. Positions labelled -100 are ignored by the cross-entropy loss (its default ignore_index is -100), not by the softmax itself.
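A minimal sketch of what word_ids() looks like for a pre-tokenized input (the checkpoint name "bert-base-chinese" is only an illustrative choice here; any fast tokenizer behaves the same way):

from transformers import AutoTokenizer

# Illustrative fast tokenizer; the notebook itself loads config.roberta_model further below.
toy_tok = AutoTokenizer.from_pretrained("bert-base-chinese")
enc = toy_tok(["彭", "小", "军", "认", "为"], is_split_into_words=True)
print(enc.tokens())    # ['[CLS]', '彭', '小', '军', '认', '为', '[SEP]']
print(enc.word_ids())  # [None, 0, 1, 2, 3, 4, None] -- None marks [CLS]/[SEP]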

"""
将word_ids值为none的部分,即特殊符号[cls]和[sep]位置的标签转化为-100。
我们有两种对齐label的方式:

1.label_all_tokens=True,多个subtokens对齐一个word,对齐一个label
2.label_all_tokens=False,多个subtokens的第一个subtoken对齐word,对齐一个label,其他subtokens直接赋予-100.
"""
label_all_tokens=True
def tokenize_and_align_labels(examples):
    tokenized_inputs=tokenizer(examples["words"],truncation=True,is_split_into_words=True)  # tokenize the pre-split words

    pad_labels = []  # aligned label lists
    for i,label in enumerate(examples['labels0']):
        word_ids=tokenized_inputs.word_ids(batch_index=i)  # word_ids of the i-th encoded example
        previous_word_idx=None
        label_ids=[]
        for word_idx in word_ids:
            # Special tokens have a word_id of None; set their label to -100 so the loss function ignores them automatically.
            if word_idx is None:
                label_ids.append(-100)
            # The first token of each word gets the word's label (a word split into several subwords still has a single word_idx).
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # The remaining tokens of a word get either the word's label or -100, depending on the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        pad_labels.append(label_ids)

    tokenized_inputs["labels"] = pad_labels
    return tokenized_inputs

# For Chinese input the "words" are single characters, so word_ids is mainly needed to mask the special tokens, i.e. set their labels to -100.
from datasets import Dataset
from transformers import AutoTokenizer
# Be sure to use AutoTokenizer here: it returns a fast tokenizer that supports word_ids(); the slow BertTokenizer complains that word_ids is unavailable.

trains_ds=Dataset.from_pandas(train_df)
val_ds=Dataset.from_pandas(val_df)
test_ds=Dataset.from_pandas(test_df)

tokenizer=AutoTokenizer.from_pretrained(config.roberta_model,do_lower_case=True)


tokenized_trains_ds=trains_ds.map(tokenize_and_align_labels,batched=True)
tokenized_val_ds=val_ds.map(tokenize_and_align_labels,batched=True)
tokenized_test_ds=test_ds.map(tokenize_and_align_labels,batched=True)
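As a quick sanity check (not in the original notebook), the first and last label of a mapped example should both be -100, i.e. the [CLS] and [SEP] positions:

sample = tokenized_val_ds[0]
print(sample["labels"][:5], sample["labels"][-1])
assert sample["labels"][0] == -100 and sample["labels"][-1] == -100  # special tokens are masked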
# Load the model
import torch
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained('hfl/chinese-roberta-wwm-ext-large',num_labels=31)

device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
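A quick way to confirm the shape of the classification head that the next section walks through (a small check, not part of the original notebook):

# For roberta-wwm-ext-large the head is Linear(1024 -> 31): one logit per BIOS label.
print(model.classifier)
print(model.config.num_labels)  # 31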

Inside the BertForTokenClassification task head (explanatory code, not meant to be run)

When computing the loss, padded positions are ignored (only positions with attention_mask == 1 count). The steps are:

sequence_output=outputs[0]               # first element of the BERT output: the hidden vector of every token
sequence_output=self.dropout(sequence_output)
logits=self.classifier(sequence_output)  # linear projection, e.g. torch.Size([3,52,1024]) -> torch.Size([3,52,31])
loss_fct = CrossEntropyLoss()            # cross-entropy loss; softmax is applied internally

if attention_mask is not None:
    # 1. Flatten the attention mask; attention_mask==1 gives a 1-D boolean mask.
    active_loss=attention_mask.view(-1)==1  # torch.Size([156]); True marks the valid (non-padding) positions

    # 2. Flatten the logits.
    active_logits=logits.view(-1,self.num_labels)  # torch.Size([3,52,31]) -> torch.Size([156,31])

    # 3. torch.where keeps the labels where mask==1 and replaces the rest with the loss ignore index.
    active_labels=torch.where(active_loss,labels.view(-1),
                              torch.tensor(loss_fct.ignore_index).type_as(labels))  # torch.Size([156]); padded positions now hold ignore_index
    """torch.tensor(loss_fct.ignore_index).type_as(labels) is a scalar ignore_index value that broadcasts against labels.
    torch.where takes values from the 2nd argument where the condition (active_loss) is True, and from the 3rd argument otherwise."""
    # 4. Compute the loss between active_logits and active_labels.
    loss=loss_fct(active_logits,active_labels)
else:
    loss=loss_fct(logits.view(-1,self.num_labels),labels.view(-1))

"""Note, however, that only padding positions have attention_mask==0; the [CLS] and [SEP] at the start and end of each sentence would still contribute to the loss. That is why we set their labels to -100 ourselves."""
label2id = {
"O": 0,
"B-address": 1,
"B-book": 2,
"B-company": 3,
'B-game': 4,
'B-government': 5,
'B-movie': 6,
'B-name': 7,
'B-organization': 8,
'B-position': 9,
'B-scene': 10,
"I-address": 11,
"I-book": 12,
"I-company": 13,
'I-game': 14,
'I-government': 15,
'I-movie': 16,
'I-name': 17,
'I-organization': 18,
'I-position': 19,
'I-scene': 20,
"S-address": 21,
"S-book": 22,
"S-company": 23,
'S-game': 24,
'S-government': 25,
'S-movie': 26,
'S-name': 27,
'S-organization': 28,
'S-position': 29,
'S-scene': 30
}

# Tag names in id order (dict preserves insertion order), used to map predicted ids back to BIOS tags.
label_list = list(label2id.keys())
label_list
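The reverse mapping is handy for turning predicted ids back into tag strings (a small helper, not used by the notebook itself, which indexes label_list directly):

id2label = {i: label for label, i in label2id.items()}
assert id2label[7] == 'B-name' and label_list[7] == 'B-name'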

Set up the seqeval evaluation method. A few points to note:

  • Take the argmax over the predicted class scores.
  • Convert the numeric ids back to BIOS labels: besides the overall scores, seqeval can then report per-class metrics. (If you pass plain integers it prints a warning at runtime, though it still runs.)
  • Skip the positions labelled -100, i.e. the special tokens.
  • Combined with the token-classification head above, this excludes both the padded positions and the special tokens from the loss/metric computation.
from datasets import load_metric
metric=load_metric("seqeval")
import numpy as np

def compute_metrics(p):
    predictions,labels = p
    predictions = np.argmax(predictions,axis=2)

    # Drop the positions of special tokens and convert the numeric ids back to the 31 NER tag strings.
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions,references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
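As a quick illustration of the structure seqeval returns (the toy tag sequences below are made up), metric.compute yields both the overall scores and one dict per entity type, which is what the per-class table further down relies on:

# Illustrative only: two short tag sequences in BIOS format.
toy_preds  = [["B-name", "I-name", "O"], ["B-address", "I-address", "O"]]
toy_labels = [["B-name", "I-name", "O"], ["B-address", "O", "O"]]
toy_results = metric.compute(predictions=toy_preds, references=toy_labels)
print(toy_results["overall_f1"])  # overall span-level F1
print(toy_results["name"])        # per-class dict with precision/recall/f1/number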
batch_size=32
metric_name="f1"
# Data collator: dynamically pads both the inputs and the labels in each batch; without padding, sequences of different lengths could not be stacked into one tensor.
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer)
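A small sketch of what the collator does with two toy features of different lengths (the input_ids values are arbitrary); the labels are padded with -100, so the padded positions are also ignored by the loss:

features = [
    {"input_ids": [101, 1, 2, 102], "labels": [-100, 7, 17, -100]},
    {"input_ids": [101, 3, 102],    "labels": [-100, 0, -100]},
]
batch = data_collator(features)
print(batch["input_ids"].shape)  # torch.Size([2, 4]) -- padded to the longest sequence in the batch
print(batch["labels"])           # the shorter example's labels are padded with -100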

from transformers import TrainingArguments,Trainer
args=TrainingArguments(
    "bert_softmax",
    evaluation_strategy="epoch",
    #save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=8,
    weight_decay=0.01,
    metric_for_best_model=metric_name,  # metric used to pick the best checkpoint (typically used together with load_best_model_at_end=True)
)

trainer=Trainer(model,args,
                train_dataset=tokenized_trains_ds,
                eval_dataset=tokenized_val_ds,
                data_collator=data_collator,
                tokenizer=tokenizer,
                compute_metrics=compute_metrics)
# Train
trainer.train()
The following columns in the training set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: words, labels0, __index_level_0__.
***** Running training *****
  Num examples = 10748
  Num Epochs = 8
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 2688




[2688/2688 1:34:02, Epoch 8/8]

Epoch  Training Loss  Validation Loss  Precision  Recall    F1        Accuracy
1      No log         0.205783         0.684743   0.793294  0.735032  0.938082
2      0.302400       0.205716         0.716806   0.806641  0.759075  0.939276
3      0.117300       0.213287         0.736779   0.798177  0.766250  0.941265
4      0.117300       0.244457         0.735330   0.791341  0.762308  0.939952
5      0.056000       0.275058         0.743161   0.795898  0.768626  0.941146
6      0.031100       0.302491         0.738582   0.800130  0.768125  0.941663
7      0.031100       0.326065         0.739182   0.806315  0.771291  0.942957
8      0.015800       0.336456         0.741374   0.804362  0.771585  0.941882

The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: words, labels0.
***** Running Evaluation *****
  Num examples = 1343
  Batch size = 32
Saving model checkpoint to bert_softmax/checkpoint-336
Configuration saved in bert_softmax/checkpoint-336/config.json
Model weights saved in bert_softmax/checkpoint-336/pytorch_model.bin
tokenizer config file saved in bert_softmax/checkpoint-336/tokenizer_config.json
Special tokens file saved in bert_softmax/checkpoint-336/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: words, labels0.
***** Running Evaluation *****
  Num examples = 1343
  Batch size = 32
Saving model checkpoint to bert_softmax/checkpoint-672
Configuration saved in bert_softmax/checkpoint-672/config.json
Model weights saved in bert_softmax/checkpoint-672/pytorch_model.bin
tokenizer config file saved in bert_softmax/checkpoint-672/tokenizer_config.json
Special tokens file saved in bert_softmax/checkpoint-672/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: words, labels0.
***** Running Evaluation *****
  Num examples = 1343
  Batch size = 32
Saving model checkpoint to bert_softmax/checkpoint-1008
Configuration saved in bert_softmax/checkpoint-1008/config.json
Model weights saved in bert_softmax/checkpoint-1008/pytorch_model.bin
tokenizer config file saved in bert_softmax/checkpoint-1008/tokenizer_config.json
Special tokens file saved in bert_softmax/checkpoint-1008/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: words, labels0.
***** Running Evaluation *****
  Num examples = 1343
  Batch size = 32
Saving model checkpoint to bert_softmax/checkpoint-1344
Configuration saved in bert_softmax/checkpoint-1344/config.json
Model weights saved in bert_softmax/checkpoint-1344/pytorch_model.bin
tokenizer config file saved in bert_softmax/checkpoint-1344/tokenizer_config.json
Special tokens file saved in bert_softmax/checkpoint-1344/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: words, labels0.
***** Running Evaluation *****
  Num examples = 1343
  Batch size = 32
Saving model checkpoint to bert_softmax/checkpoint-1680
Configuration saved in bert_softmax/checkpoint-1680/config.json
Model weights saved in bert_softmax/checkpoint-1680/pytorch_model.bin
tokenizer config file saved in bert_softmax/checkpoint-1680/tokenizer_config.json
Special tokens file saved in bert_softmax/checkpoint-1680/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: words, labels0.
***** Running Evaluation *****
  Num examples = 1343
  Batch size = 32
Saving model checkpoint to bert_softmax/checkpoint-2016
Configuration saved in bert_softmax/checkpoint-2016/config.json
Model weights saved in bert_softmax/checkpoint-2016/pytorch_model.bin
tokenizer config file saved in bert_softmax/checkpoint-2016/tokenizer_config.json
Special tokens file saved in bert_softmax/checkpoint-2016/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: words, labels0.
***** Running Evaluation *****
  Num examples = 1343
  Batch size = 32
Saving model checkpoint to bert_softmax/checkpoint-2352
Configuration saved in bert_softmax/checkpoint-2352/config.json
Model weights saved in bert_softmax/checkpoint-2352/pytorch_model.bin
tokenizer config file saved in bert_softmax/checkpoint-2352/tokenizer_config.json
Special tokens file saved in bert_softmax/checkpoint-2352/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: words, labels0.
***** Running Evaluation *****
  Num examples = 1343
  Batch size = 32
Saving model checkpoint to bert_softmax/checkpoint-2688
Configuration saved in bert_softmax/checkpoint-2688/config.json
Model weights saved in bert_softmax/checkpoint-2688/pytorch_model.bin
tokenizer config file saved in bert_softmax/checkpoint-2688/tokenizer_config.json
Special tokens file saved in bert_softmax/checkpoint-2688/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)







TrainOutput(global_step=2688, training_loss=0.09796489925966376, metrics={'train_runtime': 5645.1208, 'train_samples_per_second': 15.232, 'train_steps_per_second': 0.476, 'total_flos': 8072824637823936.0, 'train_loss': 0.09796489925966376, 'epoch': 8.0})

To get precision/recall/F1 for each individual class, simply feed the predictions into the same evaluation metric:

# Evaluate
trainer.evaluate()
import torch
torch.save(model.state_dict(),"./bert_softmax/bert_lstm_softmax_model")
# trainer.predict returns (predictions, label_ids, metrics)
predictions,labels,metrics=trainer.predict(tokenized_val_ds)
predictions=np.argmax(predictions,axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p,l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results=metric.compute(predictions=true_predictions,references=true_labels)
results
# Sort the per-class results for inspection
result_df=pd.DataFrame(results)
result_df.stack().unstack(0).sort_values(by=['f1'])
precision recall f1 number
address 0.556627 0.619303 0.586294 373.000000
scene 0.684211 0.746411 0.713959 209.000000
overall_precision 0.741374 0.741374 0.741374 0.741374
organization 0.713592 0.801090 0.754814 367.000000
book 0.743902 0.792208 0.767296 154.000000
overall_f1 0.771585 0.771585 0.771585 0.771585
position 0.753813 0.799076 0.775785 433.000000
company 0.752427 0.820106 0.784810 378.000000
government 0.738516 0.846154 0.788679 247.000000
overall_recall 0.804362 0.804362 0.804362 0.804362
game 0.808050 0.884746 0.844660 295.000000
movie 0.858108 0.841060 0.849498 151.000000
name 0.848671 0.892473 0.870021 465.000000
overall_accuracy 0.941882 0.941882 0.941882 0.941882
# Predict on the validation set and compare against the labels
predictions,labels,metrics=trainer.predict(tokenized_val_ds,metric_key_prefix="test")
pred=np.argmax(predictions,axis=2)  # pred is a 2-D array (examples x tokens), so the next line turns it into a list of rows
preds=[x for x in pred]
val_df['preds']=pd.Series(preds)
val_df.to_csv('./bert_softmax/val_1220.csv')
val_df
The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: words, labels0.
***** Running Prediction *****
  Num examples = 1343
  Batch size = 32
[42/42 31:38]
words labels0 preds
0 [彭, 小, 军, 认, 为, ,, 国, 内, 银, 行, 现, 在, 走, 的, 是, ... [7, 17, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0... [0, 7, 17, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
1 [温, 格, 的, 球, 队, 终, 于, 又, 踢, 了, 一, 场, 经, 典, 的, ... [7, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... [0, 7, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
2 [突, 袭, 黑, 暗, 雅, 典, 娜, 》, 中, R, i, d, d, i, c, ... [4, 14, 14, 14, 14, 14, 14, 14, 0, 7, 17, 17, ... [0, 4, 14, 14, 14, 14, 14, 14, 14, 0, 7, 17, 1...
3 [郑, 阿, 姨, 就, 赶, 到, 文, 汇, 路, 排, 队, 拿, 钱, ,, 希, ... [0, 0, 0, 0, 0, 0, 1, 11, 11, 0, 0, 0, 0, 0, 0... [0, 0, 0, 0, 0, 0, 0, 1, 11, 11, 0, 0, 0, 0, 0...
4 [我, 想, 站, 在, 雪, 山, 脚, 下, 你, 会, 被, 那, 巍, 峨, 的, ... [0, 0, 0, 0, 10, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0... [0, 0, 0, 0, 0, 10, 20, 0, 0, 0, 0, 0, 0, 0, 0...
... ... ... ...
1338 [在, 这, 个, 非, 常, 喜, 庆, 的, 日, 子, 里, ,, 我, 们, 首, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1339 [姜, 哲, 中, :, 公, 共, 之, 敌, 1, -, 1, 》, 、, 《, 神, ... [6, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16... [0, 7, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16,...
1340 [目, 前, ,, 日, 本, 松, 山, 海, 上, 保, 安, 部, 正, 在, 就, ... [0, 0, 0, 5, 15, 15, 15, 15, 15, 15, 15, 15, 0... [0, 0, 0, 0, 5, 15, 15, 15, 15, 15, 15, 15, 15...
1341 [也, 就, 是, 说, 英, 国, 人, 在, 世, 博, 会, 上, 的, 英, 国, ... [0, 0, 0, 0, 0, 0, 0, 0, 10, 20, 20, 0, 0, 0, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 18, 18, 0, 0, 0...
1342 [另, 外, 意, 大, 利, 的, P, l, a, y, G, e, n, e, r, ... [0, 0, 0, 0, 0, 0, 2, 12, 12, 12, 12, 12, 12, ... [0, 0, 0, 0, 0, 0, 0, 2, 12, 12, 12, 12, 12, 1...

1343 rows × 3 columns

# Scratch cell inspecting the raw prediction output; no need to run this part.
# The predictions come back as a 2-D array, which cannot be turned into a Series directly; put straight into a DataFrame, every token would become its own column (52 columns in total).
import pandas as pd
df=pd.Series(a)  # note: `a` is not defined in this notebook; this cell is purely illustrative
print(pred)
print(df)
[[ 0  8  0 ...  0  0  0]
 [ 0  5 15 ... 15 15  0]
 [ 0  0  0 ...  0 14  0]
 ...
 [ 0  0  0 ... 12  0 12]
 [ 0  0  0 ... 16  0  0]
 [ 0  8 18 ...  0  0  0]]
0       [29, 12, 1, 1, 42, 1, 1, 42, 1, 1, 23, 14, 4, ...
1       [28, 13, 17, 17, 1, 1, 17, 13, 17, 7, 13, 14, ...
2       [3, 3, 18, 6, 6, 6, 6, 5, 6, 6, 6, 4, 16, 16, ...
3       [5, 13, 22, 45, 39, 45, 10, 10, 24, 40, 10, 14...
4       [32, 20, 14, 20, 14, 20, 14, 14, 41, 33, 20, 2...
                              ...                        
1340    [28, 43, 12, 24, 3, 31, 4, 31, 31, 24, 43, 32,...
1341    [22, 7, 33, 3, 10, 10, 46, 33, 10, 10, 23, 8, ...
1342    [26, 39, 18, 18, 45, 40, 18, 14, 18, 3, 3, 44,...
1343    [2, 23, 46, 46, 46, 40, 46, 40, 40, 10, 46, 24...
1344    [32, 10, 41, 33, 41, 34, 41, 41, 1, 41, 33, 13...
Length: 1345, dtype: object
# Predict on the test set with the trainer and save the results
predictions,labels,metrics=trainer.predict(tokenized_test_ds,metric_key_prefix="test")
pred=np.argmax(predictions,axis=2)  # argmax over the label dimension, as above
preds=[x for x in pred]
pd.DataFrame({'label':preds}).to_csv('./bert_softmax/submit1220.csv',index=None)
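If you also want the test predictions back as BIOS tag strings aligned to the original characters, one option is the following sketch (not part of the original pipeline; it re-tokenizes an example to recover its word_ids, and ids_to_tags is a hypothetical helper):

# Map one row of predicted ids back to tags, skipping special tokens and repeated subword positions.
def ids_to_tags(pred_row, word_ids):
    tags, prev = [], None
    for p, w in zip(pred_row, word_ids):
        if w is not None and w != prev:
            tags.append(label_list[p])
        prev = w
    return tags

i = 0
word_ids = tokenizer(list(test_df['words'].iloc[i]), is_split_into_words=True, truncation=True).word_ids()
print(ids_to_tags(pred[i], word_ids))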