Named Entity Recognition — the bert_lstm_crf model
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
import os
os.chdir('/content/drive/MyDrive/chinese task/CLUENER2020')
# Install dependencies
!pip install transformers datasets pytorch-crf seqeval
import os
import json
import logging
import numpy as np
import pandas as pd
import config

from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
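The local config module imported above is not shown in this post; it supplies config.label2id and config.roberta_model used below. A minimal sketch of what such a config.py could contain (the tag order, the S- tags and the checkpoint name are assumptions inferred from the CLUENER label set and the 1024-dim hidden size used later):

# config.py — hypothetical sketch, not the author's actual file
ENTITY_TYPES = ["address", "book", "company", "game", "government",
                "movie", "name", "organization", "position", "scene"]
# BIOS tagging scheme: "O" plus B-/I-/S- tags for each entity type (31 labels in total)
labels = (["O"]
          + [f"B-{t}" for t in ENTITY_TYPES]
          + [f"I-{t}" for t in ENTITY_TYPES]
          + [f"S-{t}" for t in ENTITY_TYPES])
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label2id.items()}
# Path or hub name of the pretrained Chinese RoBERTa checkpoint (assumed to be a "large" model)
roberta_model = "hfl/chinese-roberta-wwm-ext-large"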
# Load the preprocessed .npz datasets
# Without allow_pickle=True this raises "Object arrays cannot be loaded when allow_pickle=False"; newer NumPy versions default to False.
train_data=np.load('./data/train.npz',allow_pickle=True)
val_data=np.load('./data/dev.npz',allow_pickle=True)
test_data=np.load('./data/test.npz',allow_pickle=True)

test_data.files
['words', 'labels']
# Convert to DataFrame format
import pandas as pd
# Shuffle the training set (sample with frac=1.0 returns all rows in random order)
train_df=pd.concat([pd.DataFrame(train_data['words'],columns=['words']),
pd.DataFrame(train_data['labels'],columns=['labels'])],axis=1).sample(frac=1.0)
# The validation and test sets are not shuffled
val_df=pd.concat([pd.DataFrame(val_data['words'],columns=['words']),
pd.DataFrame(val_data['labels'],columns=['labels'])],axis=1)  # predictions are compared against these labels later, so do not shuffle

test_df=pd.concat([pd.DataFrame(test_data['words'],columns=['words']),
pd.DataFrame(test_data['labels'],columns=['labels'])],axis=1)
# Small-sample smoke test (uncomment to use)
#train_df=train_df.iloc[:1000]
#val_df=val_df.iloc[:500]


# Convert the BIOS labels of the train/val sets to integer indices; words and labels are already aligned at this point
def trans(labels):
    labels = list(labels)
    nums = []
    for label in labels:
        nums.append(config.label2id[label])
    return nums
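For example, assuming the label2id mapping sketched earlier, a two-character person name followed by plain text converts roughly like this (the exact ids are whatever config.label2id assigns):

print(trans(['B-name', 'I-name', 'O']))  # e.g. [7, 17, 0]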

train_df['labels']=train_df['labels'].map(lambda x: trans(x))
val_df['labels']=val_df['labels'].map(lambda x: trans(x))

test_df['labels']=test_df['labels'].map(lambda x: trans(x))
val_df
words labels
0 [彭, 小, 军, 认, 为, ,, 国, 内, 银, 行, 现, 在, 走, 的, 是, ... [7, 17, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
1 [温, 格, 的, 球, 队, 终, 于, 又, 踢, 了, 一, 场, 经, 典, 的, ... [7, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
2 [突, 袭, 黑, 暗, 雅, 典, 娜, 》, 中, R, i, d, d, i, c, ... [4, 14, 14, 14, 14, 14, 14, 14, 0, 7, 17, 17, ...
3 [郑, 阿, 姨, 就, 赶, 到, 文, 汇, 路, 排, 队, 拿, 钱, ,, 希, ... [0, 0, 0, 0, 0, 0, 1, 11, 11, 0, 0, 0, 0, 0, 0...
4 [我, 想, 站, 在, 雪, 山, 脚, 下, 你, 会, 被, 那, 巍, 峨, 的, ... [0, 0, 0, 0, 10, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0...
... ... ...
1338 [在, 这, 个, 非, 常, 喜, 庆, 的, 日, 子, 里, ,, 我, 们, 首, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1339 [姜, 哲, 中, :, 公, 共, 之, 敌, 1, -, 1, 》, 、, 《, 神, ... [6, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16...
1340 [目, 前, ,, 日, 本, 松, 山, 海, 上, 保, 安, 部, 正, 在, 就, ... [0, 0, 0, 5, 15, 15, 15, 15, 15, 15, 15, 15, 0...
1341 [也, 就, 是, 说, 英, 国, 人, 在, 世, 博, 会, 上, 的, 英, 国, ... [0, 0, 0, 0, 0, 0, 0, 0, 10, 20, 20, 0, 0, 0, ...
1342 [另, 外, 意, 大, 利, 的, P, l, a, y, G, e, n, e, r, ... [0, 0, 0, 0, 0, 0, 2, 12, 12, 12, 12, 12, 12, ...

1343 rows × 2 columns

Load the pandas data into 🤗 datasets for tokenization; this makes it convenient to pad the labels directly afterwards.

If the labels are left unprocessed, their lengths differ when loaded into the DataLoader; handling that with a collate function is cumbersome.
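For reference, the collate-function route dismissed above would look roughly like this (a sketch only, assuming each raw sample keeps its unpadded label list):

import torch
from torch.nn.utils.rnn import pad_sequence

def collate_with_label_padding(batch):
    # Dynamically pad each batch's label lists to that batch's maximum length
    # instead of pre-padding every sample to the global maximum.
    labels = [torch.tensor(item['labels']) for item in batch]
    padded_labels = pad_sequence(labels, batch_first=True, padding_value=-1)
    # input_ids / attention_mask / token_type_ids would need the same treatment
    return padded_labels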

from datasets import Dataset
from transformers import AutoTokenizer
# Be sure to use AutoTokenizer here; with BertTokenizer it complains that there is no word_ids method. (word_ids ended up unused anyway.)
trains_ds=Dataset.from_pandas(train_df)
val_ds=Dataset.from_pandas(val_df)
test_ds=Dataset.from_pandas(test_df)

tokenizer=AutoTokenizer.from_pretrained(config.roberta_model,do_lower_case=True)

#tokenized_inputs=tokenizer(trains_ds["words"],padding=True,truncation=True,is_split_into_words=True)  # why does this direct call fail?
tokenized_trains_ds=trains_ds.map(lambda examples:tokenizer(examples['words'],is_split_into_words=True,truncation=True,padding=True),batched=True)
tokenized_val_ds=val_ds.map(lambda examples:tokenizer(examples['words'],is_split_into_words=True,truncation=True,padding=True),batched=True)
tokenized_test_ds=test_ds.map(lambda examples:tokenizer(examples['words'],is_split_into_words=True,truncation=True,padding=True),batched=True)
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
# Results computed on the tokenized datasets cannot be written back into the Dataset objects, so they are written into the pandas DataFrames instead.
# Pad the labels to the same length as input_ids (the longest sentence has 52 tokens, so in practice everything is padded to 52)

def padding(data, pad):
    pad_labels = []
    for ds in data:
        labels = ds['labels']
        mask = ds['attention_mask']
        label_ids = [pad]  # placeholder for the [CLS] token

        pad_length = len(mask)
        label_length = len(labels)

        label_ids = label_ids + labels + [pad]*(pad_length - label_length - 1)
        pad_labels.append(label_ids)
    return pad_labels
#tokenized_trains_ds["pad_labels"]=pad_labels# Column 2 named labels expected length 10748 but got length 1000
"""
train_df['mask_labels']=padding(tokenized_trains_ds,-100)
val_df['mask_labels']=padding(tokenized_val_ds,-100)
test_df['mask_labels']=padding(tokenized_test_ds,-100)"""

train_df['pad_labels']=padding(tokenized_trains_ds,-1)
val_df['pad_labels']=padding(tokenized_val_ds,-1)
test_df['pad_labels']=padding(tokenized_test_ds,-1)
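A quick sanity check of the alignment (illustrative only): every padded label row should be exactly as long as its attention_mask, with -1 at the [CLS] slot that the tokenizer added.

sample = tokenized_val_ds[0]
padded = padding([sample], -1)[0]
assert len(padded) == len(sample['attention_mask'])
assert padded[0] == -1  # the [CLS] position carries no BIOS label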
val_df
words labels pad_labels
0 [彭, 小, 军, 认, 为, ,, 国, 内, 银, 行, 现, 在, 走, 的, 是, ... [7, 17, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0... [-1, 7, 17, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1 [温, 格, 的, 球, 队, 终, 于, 又, 踢, 了, 一, 场, 经, 典, 的, ... [7, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... [-1, 7, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
2 [突, 袭, 黑, 暗, 雅, 典, 娜, 》, 中, R, i, d, d, i, c, ... [4, 14, 14, 14, 14, 14, 14, 14, 0, 7, 17, 17, ... [-1, 4, 14, 14, 14, 14, 14, 14, 14, 0, 7, 17, ...
3 [郑, 阿, 姨, 就, 赶, 到, 文, 汇, 路, 排, 队, 拿, 钱, ,, 希, ... [0, 0, 0, 0, 0, 0, 1, 11, 11, 0, 0, 0, 0, 0, 0... [-1, 0, 0, 0, 0, 0, 0, 1, 11, 11, 0, 0, 0, 0, ...
4 [我, 想, 站, 在, 雪, 山, 脚, 下, 你, 会, 被, 那, 巍, 峨, 的, ... [0, 0, 0, 0, 10, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0... [-1, 0, 0, 0, 0, 10, 20, 0, 0, 0, 0, 0, 0, 0, ...
... ... ... ...
1338 [在, 这, 个, 非, 常, 喜, 庆, 的, 日, 子, 里, ,, 我, 们, 首, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
1339 [姜, 哲, 中, :, 公, 共, 之, 敌, 1, -, 1, 》, 、, 《, 神, ... [6, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16... [-1, 6, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16...
1340 [目, 前, ,, 日, 本, 松, 山, 海, 上, 保, 安, 部, 正, 在, 就, ... [0, 0, 0, 5, 15, 15, 15, 15, 15, 15, 15, 15, 0... [-1, 0, 0, 0, 5, 15, 15, 15, 15, 15, 15, 15, 1...
1341 [也, 就, 是, 说, 英, 国, 人, 在, 世, 博, 会, 上, 的, 英, 国, ... [0, 0, 0, 0, 0, 0, 0, 0, 10, 20, 20, 0, 0, 0, ... [-1, 0, 0, 0, 0, 0, 0, 0, 0, 10, 20, 20, 0, 0,...
1342 [另, 外, 意, 大, 利, 的, P, l, a, y, G, e, n, e, r, ... [0, 0, 0, 0, 0, 0, 2, 12, 12, 12, 12, 12, 12, ... [-1, 0, 0, 0, 0, 0, 0, 2, 12, 12, 12, 12, 12, ...

1343 rows × 3 columns

batch_size=16

# Assemble the train/validation/test splits
from sklearn.model_selection import train_test_split
from datasets import Dataset
from torch.nn.utils.rnn import pad_sequence


train_data,train_label,val_data,val_label=train_df['words'].iloc[:],train_df['pad_labels'].iloc[:],val_df['words'].iloc[:],val_df['pad_labels'].iloc[:]

test_data,test_label=(test_df['words'].iloc[:],test_df['pad_labels'].iloc[:])

# stratify=train_df['label'].iloc[:] raised: "The least populated class in y has only 1 member, which is too few.
# The minimum number of groups for any class cannot be less than 2." Presumably there are too few samples per class for stratified splitting.

# Data preprocessing

tokenizer=AutoTokenizer.from_pretrained(config.roberta_model,do_lower_case=True)
train_encoding=tokenizer(list(train_data),is_split_into_words=True,truncation=True,padding=True,return_tensors='pt')  # training split
val_encoding=tokenizer(list(val_data),is_split_into_words=True,truncation=True,padding=True,return_tensors='pt')  # validation split
test_encoding=tokenizer(list(test_data),is_split_into_words=True,truncation=True,padding=True,return_tensors='pt')  # test set
# Load into DataLoaders
# Dataset class

from torch.utils.data import Dataset, DataLoader,TensorDataset
import torch
class XFeiDataset(Dataset):
    def __init__(self, encodings, pad_labels):
        self.encodings = encodings
        self.pad_labels = pad_labels

    # Fetch a single sample
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['pad_labels'] = torch.tensor(self.pad_labels[idx])
        item['mask'] = (item['pad_labels'] != -1)
        return item

    def __len__(self):
        return len(self.pad_labels)

#def collate_fn

train_dataset=XFeiDataset(train_encoding,list(train_label))
val_dataset=XFeiDataset(val_encoding,list(val_label))
test_dataset=XFeiDataset(test_encoding,list(test_label))


from torch.utils.data import Dataset,DataLoader,TensorDataset

train_loader=DataLoader(train_dataset,batch_size=batch_size,shuffle=True)
val_loader=DataLoader(val_dataset,batch_size=batch_size,shuffle=False)
test_loader=DataLoader(test_dataset,batch_size=batch_size,shuffle=False)  # never shuffle the test data (learned that the hard way)
#for i in val_loader:
#    print(i)  # each batch is a 5-item dict: input_ids, attention_mask, token_type_ids, plus pad_labels and mask
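One batch can be inspected to confirm what the loaders yield (shapes below are illustrative; the sequence length depends on the tokenizer output):

batch = next(iter(val_loader))
print({k: tuple(v.shape) for k, v in batch.items()})
# e.g. {'input_ids': (16, 54), 'attention_mask': (16, 54), 'token_type_ids': (16, 54),
#       'pad_labels': (16, 54), 'mask': (16, 54)}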
from transformers import BertModel
from torch.nn.utils.rnn import pad_sequence
# Build the BERT-based model
from transformers import BertConfig
import torch.nn as nn
from torch.nn import LSTM
from torch.nn import functional as F
from torchcrf import CRF

num_labels=31
dropout=0.1
# dropout=0.1: after epoch 1, precision 0.68 | recall 0.72 | f1 0.70 | acc 0.93
# dropout=0.2 gives a larger training loss: after epoch 1, precision 0.50 | recall 0.60 | f1 0.54 | acc 0.90

class Bert_LSTM(nn.Module):
    def __init__(self):
        super(Bert_LSTM, self).__init__()
        self.num_labels = num_labels
        self.dropout = nn.Dropout(dropout)
        self.bert = BertModel.from_pretrained(config.roberta_model)
        for param in self.bert.parameters():
            param.requires_grad = True  # fine-tune all BERT parameters
        self.classifier = nn.Linear(1024, self.num_labels)
        self.crf = CRF(num_labels, batch_first=True)

        self.bilstm = nn.LSTM(
            input_size=1024,
            hidden_size=512,
            batch_first=True,
            num_layers=2,
            dropout=0.5,
            bidirectional=True)

    def forward(self, batch_seqs, batch_seq_masks, batch_seq_segments, pad_labels, mask):

        output = self.bert(input_ids=batch_seqs, attention_mask=batch_seq_masks, token_type_ids=batch_seq_segments)
        # pooler_output = output.pooler_output
        last_hidden_state = output.last_hidden_state
        last_hidden_state = self.dropout(last_hidden_state)
        # Only output.last_hidden_state works here; unpacking as
        # sequence_output, pooler_output = self.bert(**kwargs) fails with
        # "str object has no attribute ..." (likely a transformers version issue).
        # When in doubt, print the output object to inspect its fields.

        lstm_output, (hn, cn) = self.bilstm(last_hidden_state)
        # lstm_output holds the hidden state of every time step; hn/cn are the last hidden/cell states
        lstm_output = self.dropout(lstm_output)

        # Emission scores for each token
        logits = self.classifier(lstm_output)
        # Drop the [CLS] position: the first mask value must not be False, otherwise the CRF raises an error
        logits, pad_labels, mask = logits[:, 1:, :], pad_labels[:, 1:], mask[:, 1:]
        loss = self.crf(logits, pad_labels, mask) * (-1)

        return logits, loss
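The CRF layer comes from the pytorch-crf package. As a standalone reminder of its interface (made-up shapes, unrelated to the model above): the forward call returns the log-likelihood of the given tag sequence, which is negated to get a loss, and decode returns the best tag path for each sentence.

import torch
from torchcrf import CRF

demo_crf = CRF(num_tags=5, batch_first=True)
emissions = torch.randn(2, 7, 5)                # (batch, seq_len, num_tags)
tags = torch.randint(0, 5, (2, 7))
demo_mask = torch.ones(2, 7, dtype=torch.bool)  # the first timestep must be unmasked
nll = -demo_crf(emissions, tags, mask=demo_mask)          # negative log-likelihood, summed over the batch
best_paths = demo_crf.decode(emissions, mask=demo_mask)   # list of 2 best tag sequences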
# Instantiate the model (optionally load a saved checkpoint)
model=Bert_LSTM()
#model.load_state_dict(torch.load("/content/drive/MyDrive/chinese task/CLUENER2020/model/bert_lstm_crf_model"))
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Optimizer setup
epoch=10
lr=3e-5

from transformers import AdamW,get_scheduler

train_steps_per_epoch=len(train_loader)
num_training_steps=train_steps_per_epoch*epoch

# Collect the parameters of each sub-module
bert_parameters=list(model.bert.named_parameters())
lstm_parameters=list(model.bilstm.named_parameters())
classifier_parameters=list(model.classifier.named_parameters())
no_decay=['bias','LayerNorm.weight']

# Separate learning rates for BERT, the LSTM and the linear classifier; the latter two use 3x BERT's rate
optimizer_grouped_parameters = [
    {'params': [p for n, p in bert_parameters if not any(nd in n for nd in no_decay)],
     'lr': lr, 'weight_decay': 0.01},
    {'params': [p for n, p in bert_parameters if any(nd in n for nd in no_decay)],
     'lr': lr, 'weight_decay': 0.0},
    {'params': [p for n, p in lstm_parameters if not any(nd in n for nd in no_decay)],
     'lr': lr*3, 'weight_decay': 0.01},
    {'params': [p for n, p in lstm_parameters if any(nd in n for nd in no_decay)],
     'lr': lr*3, 'weight_decay': 0.0},
    {'params': [p for n, p in classifier_parameters if not any(nd in n for nd in no_decay)],
     'lr': lr*3, 'weight_decay': 0.01},
    {'params': [p for n, p in classifier_parameters if any(nd in n for nd in no_decay)],
     'lr': lr*3, 'weight_decay': 0.0}]

optimizer=AdamW(optimizer_grouped_parameters,lr=lr,eps=1e-8)
# Linearly decaying learning rate schedule
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps)
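A quick check that the grouping worked (illustrative): the optimizer should now hold six parameter groups, with the LSTM and classifier groups at three times BERT's learning rate.

for i, group in enumerate(optimizer.param_groups):
    print(i, group['lr'], group['weight_decay'])
# expected: groups 0-1 at 3e-5, groups 2-5 at 9e-5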
# Pad the crf.decode predictions with 0 and convert them to a tensor
def pad_result(data, pad_labels):
    pad_pred = []
    max_len = pad_labels.shape[1]
    for pred in data:
        pad_length = max_len - len(pred)

        label_ids = pred + [0]*pad_length
        pad_pred.append(label_ids)

    return pad_pred
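For instance (made-up inputs): crf.decode returns variable-length lists, and pad_result right-pads them with 0 so they can be stacked into a single tensor.

fake_labels = torch.zeros(2, 5)  # stands in for pad_labels[:, 1:]
print(pad_result([[7, 17, 17], [3, 13, 0, 0, 0]], fake_labels))
# [[7, 17, 17, 0, 0], [3, 13, 0, 0, 0]]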

# pred=torch.tensor(pred) must stay outside the function, otherwise it fails because tensors have no append

# Evaluation metric
from datasets import load_metric
metric=load_metric("seqeval")
import numpy as np

label_list= [label for label,id in list(config.label2id.items())]

def compute_metrics(y_pred, y_true):
    predictions, labels = y_pred, y_true

    # Drop positions at special/padding tokens (label == -1) so they are not compared
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -1]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -1]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return results
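seqeval computes entity-level metrics from lists of label strings, one list per sentence. A tiny illustration with hypothetical tags:

example_preds = [["B-name", "I-name", "O", "B-address"]]
example_refs  = [["B-name", "I-name", "O", "O"]]
print(metric.compute(predictions=example_preds, references=example_refs))
# per-type and overall precision/recall/f1, computed at the entity level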

# Training and evaluation loop
import time
import numpy as np
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score
from torch.nn import functional as F
from torchcrf import CRF
# Progress bar
from tqdm.auto import tqdm

num_training_steps=train_steps_per_epoch*epoch

progress_bar=tqdm(range(num_training_steps))

def train_and_eval(epoch):
    best_f1 = 0  # best validation f1 so far; only the best checkpoint is saved
    for i in range(epoch):
        # ---- training ----
        start = time.time()
        model.train()
        print("***** Running training epoch {} *****".format(i+1))
        train_loss_sum = 0.0
        for idx, batch in enumerate(train_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            pad_labels = batch['pad_labels'].to(device)
            mask = batch['mask'].to(device)

            # Forward pass: compute logits and loss
            logits, loss = model(input_ids, attention_mask, token_type_ids, pad_labels, mask)
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

            train_loss_sum += loss.item()
            if (idx+1) % (len(train_loader)//5) == 0:  # print only five times per epoch
                print("Epoch {:03d} | Step {:04d}/{:04d} | Loss {:.4f} | Time {:.4f} | Learning rate = {} \n".format(
                    i+1, idx+1, len(train_loader), train_loss_sum/(idx+1), time.time()-start, optimizer.state_dict()['param_groups'][0]['lr']))

        # ---- validation ----
        model.eval()
        y_pred, y_true = [], []
        total_eval_loss = 0
        total_eval_accuracy, total, acc = 0, 0, 0

        for batch in val_loader:
            with torch.no_grad():  # no gradients needed in this block

                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                token_type_ids = batch['token_type_ids'].to(device)
                pad_labels = batch['pad_labels'].to(device)
                mask = batch['mask'].to(device)
                logits, loss = model(input_ids, attention_mask, token_type_ids, pad_labels, mask)  # already excludes the first ([CLS]) position

                total_eval_loss += loss.item()

                pad_labels, mask = pad_labels[:, 1:].to(device), mask[:, 1:].to(device)
                pred = model.crf.decode(logits, mask)
                pred = torch.tensor(pad_result(pred, pad_labels)).to(device)  # pad predictions to batch_size x (max_length-1), then convert to a tensor
                acc += (pred[mask] == pad_labels[mask]).sum().item()  # count accuracy only on unmasked tokens; including masked positions inflates it
                total += mask.sum().item()
                total_eval_accuracy = acc/total

                y_pred.extend(pred.cpu().numpy().tolist())  # append every batch's results to the overall lists
                y_true.extend(pad_labels.cpu().numpy().tolist())
        results = compute_metrics(y_pred, y_true)
        f1 = results["overall_f1"]

        if f1 > best_f1:
            best_f1 = f1
            torch.save(model.state_dict(), "./bert_lstm_crf/blf_model")

        print("precision {:.2f}|recall {:.2f}|f1 {:.4f}|acc {:.2f}".format(results["overall_precision"], results["overall_recall"], results["overall_f1"], results["overall_accuracy"]))
        print("Average val loss:%.2f" % (total_eval_loss), "sklearn_acc:%.2f" % (total_eval_accuracy))
        print("time costed={}s \n".format(round(time.time()-start, 5)))
        print("-------------------------------")

Compare this against BERT's plain token-classification head.

train_and_eval(epoch)
"""
预测值不包括实际值会报错
UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))"""
***** Running training epoch 1 *****


/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:13: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  del sys.path[0]


Epoch 001 | Step 0134/0672 | Loss 850.4533 | Time 160.0524 | Learning rate = 2.9401785714285713e-05 

Epoch 001 | Step 0268/0672 | Loss 755.6008 | Time 321.1079 | Learning rate = 2.880357142857143e-05 

Epoch 001 | Step 0402/0672 | Loss 693.5408 | Time 482.2713 | Learning rate = 2.8205357142857143e-05 

Epoch 001 | Step 0536/0672 | Loss 626.1115 | Time 643.2802 | Learning rate = 2.7607142857142855e-05 

Epoch 001 | Step 0670/0672 | Loss 554.6280 | Time 804.1686 | Learning rate = 2.7008928571428574e-05 

precision 0.55|recall 0.65|f1 0.59|acc 0.90
Average val loss:16898.68 sklearn_acc:0.90
time costed=847.54825s 

-------------------------------
***** Running training epoch 2 *****
Epoch 002 | Step 0134/0672 | Loss 189.6032 | Time 161.3824 | Learning rate = 2.6401785714285714e-05 

Epoch 002 | Step 0268/0672 | Loss 174.3528 | Time 321.8987 | Learning rate = 2.580357142857143e-05 

Epoch 002 | Step 0402/0672 | Loss 166.6937 | Time 482.3897 | Learning rate = 2.5205357142857145e-05 

Epoch 002 | Step 0536/0672 | Loss 160.9221 | Time 643.1670 | Learning rate = 2.4607142857142857e-05 

Epoch 002 | Step 0670/0672 | Loss 155.2287 | Time 803.8956 | Learning rate = 2.4008928571428572e-05 

precision 0.70|recall 0.78|f1 0.73|acc 0.94
Average val loss:10865.32 sklearn_acc:0.94
time costed=846.85732s 

-------------------------------
***** Running training epoch 3 *****
Epoch 003 | Step 0134/0672 | Loss 104.7770 | Time 161.5656 | Learning rate = 2.3401785714285716e-05 

Epoch 003 | Step 0268/0672 | Loss 104.1795 | Time 322.5888 | Learning rate = 2.2803571428571428e-05 

Epoch 003 | Step 0402/0672 | Loss 102.1001 | Time 483.3302 | Learning rate = 2.2205357142857143e-05 

Epoch 003 | Step 0536/0672 | Loss 100.6387 | Time 644.2039 | Learning rate = 2.1607142857142858e-05 

Epoch 003 | Step 0670/0672 | Loss 100.2379 | Time 804.9622 | Learning rate = 2.100892857142857e-05 

precision 0.71|recall 0.79|f1 0.75|acc 0.94
Average val loss:10455.55 sklearn_acc:0.94
time costed=847.52307s 

-------------------------------
***** Running training epoch 4 *****
Epoch 004 | Step 0134/0672 | Loss 65.6758 | Time 161.4387 | Learning rate = 2.0401785714285714e-05 

Epoch 004 | Step 0268/0672 | Loss 70.1131 | Time 321.9805 | Learning rate = 1.980357142857143e-05 

Epoch 004 | Step 0402/0672 | Loss 74.5657 | Time 482.5696 | Learning rate = 1.920535714285714e-05 

Epoch 004 | Step 0536/0672 | Loss 77.0551 | Time 643.0697 | Learning rate = 1.860714285714286e-05 

Epoch 004 | Step 0670/0672 | Loss 77.2134 | Time 803.5144 | Learning rate = 1.800892857142857e-05 

precision 0.72|recall 0.79|f1 0.75|acc 0.94
Average val loss:10545.26 sklearn_acc:0.94
time costed=846.01025s 

-------------------------------
***** Running training epoch 5 *****
Epoch 005 | Step 0134/0672 | Loss 62.8429 | Time 161.6965 | Learning rate = 1.7401785714285716e-05 

Epoch 005 | Step 0268/0672 | Loss 61.3453 | Time 322.8604 | Learning rate = 1.680357142857143e-05 

Epoch 005 | Step 0402/0672 | Loss 59.3090 | Time 484.0892 | Learning rate = 1.6205357142857143e-05 

Epoch 005 | Step 0536/0672 | Loss 58.5966 | Time 645.2598 | Learning rate = 1.5607142857142858e-05 

Epoch 005 | Step 0670/0672 | Loss 57.8091 | Time 806.5770 | Learning rate = 1.5008928571428572e-05 

precision 0.73|recall 0.81|f1 0.76|acc 0.94
Average val loss:11918.01 sklearn_acc:0.94
time costed=849.09279s 

-------------------------------
***** Running training epoch 6 *****
Epoch 006 | Step 0134/0672 | Loss 40.8853 | Time 161.7673 | Learning rate = 1.4401785714285716e-05 

Epoch 006 | Step 0268/0672 | Loss 40.4048 | Time 322.7583 | Learning rate = 1.3803571428571427e-05 

Epoch 006 | Step 0402/0672 | Loss 40.0783 | Time 483.8099 | Learning rate = 1.3205357142857143e-05 

Epoch 006 | Step 0536/0672 | Loss 39.7375 | Time 644.7180 | Learning rate = 1.2607142857142858e-05 

Epoch 006 | Step 0670/0672 | Loss 40.0972 | Time 805.9105 | Learning rate = 1.2008928571428573e-05 

precision 0.74|recall 0.81|f1 0.78|acc 0.94
Average val loss:11576.13 sklearn_acc:0.94
time costed=848.65729s 

-------------------------------
***** Running training epoch 7 *****
Epoch 007 | Step 0134/0672 | Loss 28.1865 | Time 161.6997 | Learning rate = 1.1401785714285714e-05 

Epoch 007 | Step 0268/0672 | Loss 29.4536 | Time 322.6295 | Learning rate = 1.0803571428571429e-05 

Epoch 007 | Step 0402/0672 | Loss 29.7340 | Time 483.5822 | Learning rate = 1.0205357142857144e-05 

Epoch 007 | Step 0536/0672 | Loss 30.0218 | Time 644.7393 | Learning rate = 9.607142857142856e-06 

Epoch 007 | Step 0670/0672 | Loss 29.1320 | Time 805.7908 | Learning rate = 9.008928571428571e-06 

precision 0.75|recall 0.80|f1 0.78|acc 0.94
Average val loss:13548.30 sklearn_acc:0.94
time costed=848.7225s 

-------------------------------
***** Running training epoch 8 *****
Epoch 008 | Step 0134/0672 | Loss 19.5674 | Time 162.0373 | Learning rate = 8.401785714285715e-06 

Epoch 008 | Step 0268/0672 | Loss 20.5346 | Time 323.2465 | Learning rate = 7.803571428571429e-06 

Epoch 008 | Step 0402/0672 | Loss 20.5707 | Time 484.4724 | Learning rate = 7.205357142857143e-06 

Epoch 008 | Step 0536/0672 | Loss 21.3377 | Time 645.7742 | Learning rate = 6.607142857142857e-06 

Epoch 008 | Step 0670/0672 | Loss 21.3365 | Time 806.9678 | Learning rate = 6.008928571428572e-06 

precision 0.75|recall 0.80|f1 0.78|acc 0.94
Average val loss:14625.68 sklearn_acc:0.94
time costed=850.00101s 

-------------------------------
***** Running training epoch 9 *****
Epoch 009 | Step 0134/0672 | Loss 15.8689 | Time 162.2782 | Learning rate = 5.4017857142857145e-06 

Epoch 009 | Step 0268/0672 | Loss 15.7494 | Time 323.9999 | Learning rate = 4.803571428571428e-06 

Epoch 009 | Step 0402/0672 | Loss 16.1067 | Time 485.5929 | Learning rate = 4.205357142857143e-06 

Epoch 009 | Step 0536/0672 | Loss 15.9825 | Time 647.0968 | Learning rate = 3.6071428571428573e-06 

Epoch 009 | Step 0670/0672 | Loss 15.6784 | Time 808.7016 | Learning rate = 3.0089285714285717e-06 

precision 0.74|recall 0.81|f1 0.77|acc 0.94
Average val loss:15238.61 sklearn_acc:0.94
time costed=851.86068s 

-------------------------------
***** Running training epoch 10 *****
Epoch 010 | Step 0134/0672 | Loss 12.5352 | Time 162.6241 | Learning rate = 2.401785714285714e-06 

Epoch 010 | Step 0268/0672 | Loss 12.0666 | Time 327.8486 | Learning rate = 1.8035714285714286e-06 

Epoch 010 | Step 0402/0672 | Loss 12.0815 | Time 492.9378 | Learning rate = 1.205357142857143e-06 

Epoch 010 | Step 0536/0672 | Loss 12.2230 | Time 658.1718 | Learning rate = 6.071428571428572e-07 

Epoch 010 | Step 0670/0672 | Loss 11.8025 | Time 823.4604 | Learning rate = 8.92857142857143e-09 

precision 0.75|recall 0.80|f1 0.77|acc 0.94
Average val loss:15777.25 sklearn_acc:0.94
time costed=868.35004s 

-------------------------------





#torch.save(model.state_dict(),"./bert_lstm_crf/finall_blf_model")
model.load_state_dict(torch.load("./bert_lstm_crf/blf_model"))
model.to(device)
# Prediction function
def predict(model, data_loader):  # note: naming this parameter `data` caused errors when predicting with a reloaded checkpoint
    model.eval()
    y_pred, y_true, predictions = [], [], []  # padded predictions, true labels, and un-padded prediction lists
    for batch in data_loader:
        with torch.no_grad():  # no gradients needed in this block

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            pad_labels = batch['pad_labels'].to(device)  # labels padded with -1, used to compute logits and loss
            mask = batch['mask'].to(device)  # mask matrix, used for logits/loss and for CRF decoding
            logits, loss = model(input_ids, attention_mask, token_type_ids, pad_labels, mask)

            pad_labels, mask = pad_labels[:, 1:].to(device), mask[:, 1:].to(device)
            prediction = model.crf.decode(logits, mask)  # decoded predictions for the real tokens only

            # The predictions are plain lists; pad them before adding to the overall list used by compute_metrics
            pad_pred = torch.tensor(pad_result(prediction, pad_labels)).to(device)
            y_pred.extend(pad_pred.cpu().numpy().tolist())
            y_true.extend(pad_labels.cpu().numpy().tolist())  # pad_labels hold the true labels

            predictions.extend(prediction)  # un-padded predictions, used for the final submission

    return y_pred, y_true, predictions
# Predict on the validation set and inspect per-entity-type metrics
y_pred,y_true,predictions=predict(model,val_loader)
results=compute_metrics(y_pred,y_true)
# Sort the results for inspection
result_df=pd.DataFrame(results)
result_df.stack().unstack(0).sort_values(by=['f1'])
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:13: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  del sys.path[0]
precision recall f1 number
address 0.555288 0.619303 0.585551 373.000000
scene 0.697115 0.693780 0.695444 209.000000
overall_precision 0.747256 0.747256 0.747256 0.747256
organization 0.721805 0.784741 0.751958 367.000000
overall_f1 0.771725 0.771725 0.771725 0.771725
government 0.739130 0.825911 0.780115 247.000000
position 0.774554 0.801386 0.787741 433.000000
book 0.775000 0.805195 0.789809 154.000000
company 0.759124 0.825397 0.790875 378.000000
overall_recall 0.797852 0.797852 0.797852 0.797852
movie 0.807947 0.807947 0.807947 151.000000
game 0.791925 0.864407 0.826580 295.000000
name 0.865031 0.909677 0.886792 465.000000
overall_accuracy 0.938997 0.938997 0.938997 0.938997
# Show the validation set's true labels next to the predictions
val_df['preds']=pd.Series(predictions)
val_df.to_csv('./bert_lstm_crf/val_1221.csv')
val_df=val_df.drop(["pad_labels"],axis=1)
val_df
words labels preds
0 [彭, 小, 军, 认, 为, ,, 国, 内, 银, 行, 现, 在, 走, 的, 是, ... [7, 17, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0... [7, 17, 17, 0, 0, 0, 3, 13, 13, 13, 0, 0, 0, 0...
1 [温, 格, 的, 球, 队, 终, 于, 又, 踢, 了, 一, 场, 经, 典, 的, ... [7, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... [7, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
2 [突, 袭, 黑, 暗, 雅, 典, 娜, 》, 中, R, i, d, d, i, c, ... [4, 14, 14, 14, 14, 14, 14, 14, 0, 7, 17, 17, ... [6, 16, 16, 16, 16, 16, 16, 16, 0, 7, 17, 17, ...
3 [郑, 阿, 姨, 就, 赶, 到, 文, 汇, 路, 排, 队, 拿, 钱, ,, 希, ... [0, 0, 0, 0, 0, 0, 1, 11, 11, 0, 0, 0, 0, 0, 0... [0, 0, 0, 0, 0, 0, 1, 11, 11, 0, 0, 0, 0, 0, 0...
4 [我, 想, 站, 在, 雪, 山, 脚, 下, 你, 会, 被, 那, 巍, 峨, 的, ... [0, 0, 0, 0, 10, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
... ... ... ...
1338 [在, 这, 个, 非, 常, 喜, 庆, 的, 日, 子, 里, ,, 我, 们, 首, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1339 [姜, 哲, 中, :, 公, 共, 之, 敌, 1, -, 1, 》, 、, 《, 神, ... [6, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16... [6, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16...
1340 [目, 前, ,, 日, 本, 松, 山, 海, 上, 保, 安, 部, 正, 在, 就, ... [0, 0, 0, 5, 15, 15, 15, 15, 15, 15, 15, 15, 0... [0, 0, 0, 5, 15, 15, 15, 15, 15, 15, 15, 15, 0...
1341 [也, 就, 是, 说, 英, 国, 人, 在, 世, 博, 会, 上, 的, 英, 国, ... [0, 0, 0, 0, 0, 0, 0, 0, 10, 20, 20, 0, 0, 0, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 20...
1342 [另, 外, 意, 大, 利, 的, P, l, a, y, G, e, n, e, r, ... [0, 0, 0, 0, 0, 0, 2, 12, 12, 12, 12, 12, 12, ... [0, 0, 1, 11, 11, 0, 2, 12, 12, 12, 12, 12, 12...

1343 rows × 3 columns

Having compared the model's validation-set predictions with the original labels above, generate predictions for the test set.

y_pred,y_true,predictions=predict(model,test_loader)
pd.DataFrame({'label':predictions}).to_csv('./bert_lstm_crf/submit1222.csv',index=None)
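The submission above stores raw id sequences. If label strings are preferred for inspection, the ids can be mapped back through the inverse of config.label2id (a hypothetical post-processing step, not part of the original notebook):

id2label = {v: k for k, v in config.label2id.items()}
pred_tags = [[id2label[i] for i in seq] for seq in predictions]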
Author: zhxnlp
Link: https://zhxnlp.github.io/2021/11/15/CLUENER 细粒度命名实体识别/bert_lstm_crf/
Copyright: Unless otherwise stated, all posts on this blog are licensed under CC BY-NC-SA 4.0. Please credit zhxnlpのBlog when reposting.