from google.colab import drive
drive.mount("/content/drive")
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
# Install dependencies
import os
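The install cell itself was collapsed in the export; a plausible reconstruction, assuming the libraries imported later in the notebook (`transformers`, `datasets`, pytorch-crf for the CRF layer, seqeval for the metrics):

```python
# Hypothetical reconstruction of the collapsed install cell.
!pip install transformers datasets pytorch-crf seqeval
```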
# Load the preprocessed .npz dataset
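A minimal sketch of the loading step; the archive path is a placeholder:

```python
import numpy as np

# The .npz holds two object arrays of ragged lists, hence allow_pickle=True.
data = np.load("/content/drive/MyDrive/ner_data.npz", allow_pickle=True)  # placeholder path
print(data.files)
```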
['words', 'labels']
# Convert to a DataFrame
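A sketch of the conversion, using `data` from the load above:

```python
import pandas as pd

# One row per sentence: the character list and its per-character tag ids.
df = pd.DataFrame({"words": data["words"], "labels": data["labels"]})
df
```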
| | words | labels |
|---|---|---|
| 0 | [彭, 小, 军, 认, 为, ,, 国, 内, 银, 行, 现, 在, 走, 的, 是, ... | [7, 17, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0... |
| 1 | [温, 格, 的, 球, 队, 终, 于, 又, 踢, 了, 一, 场, 经, 典, 的, ... | [7, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... |
| 2 | [突, 袭, 黑, 暗, 雅, 典, 娜, 》, 中, R, i, d, d, i, c, ... | [4, 14, 14, 14, 14, 14, 14, 14, 0, 7, 17, 17, ... |
| 3 | [郑, 阿, 姨, 就, 赶, 到, 文, 汇, 路, 排, 队, 拿, 钱, ,, 希, ... | [0, 0, 0, 0, 0, 0, 1, 11, 11, 0, 0, 0, 0, 0, 0... |
| 4 | [我, 想, 站, 在, 雪, 山, 脚, 下, 你, 会, 被, 那, 巍, 峨, 的, ... | [0, 0, 0, 0, 10, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0... |
| ... | ... | ... |
| 1338 | [在, 这, 个, 非, 常, 喜, 庆, 的, 日, 子, 里, ,, 我, 们, 首, ... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... |
| 1339 | [姜, 哲, 中, :, 公, 共, 之, 敌, 1, -, 1, 》, 、, 《, 神, ... | [6, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16... |
| 1340 | [目, 前, ,, 日, 本, 松, 山, 海, 上, 保, 安, 部, 正, 在, 就, ... | [0, 0, 0, 5, 15, 15, 15, 15, 15, 15, 15, 15, 0... |
| 1341 | [也, 就, 是, 说, 英, 国, 人, 在, 世, 博, 会, 上, 的, 英, 国, ... | [0, 0, 0, 0, 0, 0, 0, 0, 10, 20, 20, 0, 0, 0, ... |
| 1342 | [另, 外, 意, 大, 利, 的, P, l, a, y, G, e, n, e, r, ... | [0, 0, 0, 0, 0, 0, 2, 12, 12, 12, 12, 12, 12, ... |
1343 rows × 2 columns
Load the pandas data into `datasets` and encode it there, which makes it convenient to pad the labels directly afterwards.
If the labels are left unprocessed, their lengths are inconsistent when they later go into the DataLoader; handling that in a collate function would be too cumbersome.
from datasets import Dataset
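The encoding cell was collapsed; a sketch under the assumption of a `bert-base-chinese` fast tokenizer. Calling it with `truncation=True` but no explicit `max_length` is what produces the warning below:

```python
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-chinese")  # assumed checkpoint

def encode(batch):
    # The sentences are already split into characters, so pass them through
    # with is_split_into_words=True and let the tokenizer pad everything.
    return tokenizer(batch["words"], is_split_into_words=True,
                     truncation=True, padding="max_length")

dataset = Dataset.from_pandas(df).map(encode, batched=True)
```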
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
# Results computed on the encoded dataset cannot be written back into the datasets object, so they are written into the pandas DataFrame instead.
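A sketch of how the `pad_labels` column below could be built: shift every sequence right by one for [CLS], and fill with -1 (the leading -1 is read off the column itself; that -1 also fills the tail is an assumption):

```python
# max_len is the padded sequence length produced by the tokenizer above.
max_len = len(dataset["input_ids"][0])
df["pad_labels"] = df["labels"].apply(
    lambda l: ([-1] + list(l) + [-1] * max_len)[:max_len]
)
```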
| | words | labels | pad_labels |
|---|---|---|---|
| 0 | [彭, 小, 军, 认, 为, ,, 国, 内, 银, 行, 现, 在, 走, 的, 是, ... | [7, 17, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0... | [-1, 7, 17, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... |
| 1 | [温, 格, 的, 球, 队, 终, 于, 又, 踢, 了, 一, 场, 经, 典, 的, ... | [7, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... | [-1, 7, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0... |
| 2 | [突, 袭, 黑, 暗, 雅, 典, 娜, 》, 中, R, i, d, d, i, c, ... | [4, 14, 14, 14, 14, 14, 14, 14, 0, 7, 17, 17, ... | [-1, 4, 14, 14, 14, 14, 14, 14, 14, 0, 7, 17, ... |
| 3 | [郑, 阿, 姨, 就, 赶, 到, 文, 汇, 路, 排, 队, 拿, 钱, ,, 希, ... | [0, 0, 0, 0, 0, 0, 1, 11, 11, 0, 0, 0, 0, 0, 0... | [-1, 0, 0, 0, 0, 0, 0, 1, 11, 11, 0, 0, 0, 0, ... |
| 4 | [我, 想, 站, 在, 雪, 山, 脚, 下, 你, 会, 被, 那, 巍, 峨, 的, ... | [0, 0, 0, 0, 10, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0... | [-1, 0, 0, 0, 0, 10, 20, 0, 0, 0, 0, 0, 0, 0, ... |
| ... | ... | ... | ... |
| 1338 | [在, 这, 个, 非, 常, 喜, 庆, 的, 日, 子, 里, ,, 我, 们, 首, ... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | [-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... |
| 1339 | [姜, 哲, 中, :, 公, 共, 之, 敌, 1, -, 1, 》, 、, 《, 神, ... | [6, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16... | [-1, 6, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16... |
| 1340 | [目, 前, ,, 日, 本, 松, 山, 海, 上, 保, 安, 部, 正, 在, 就, ... | [0, 0, 0, 5, 15, 15, 15, 15, 15, 15, 15, 15, 0... | [-1, 0, 0, 0, 5, 15, 15, 15, 15, 15, 15, 15, 1... |
| 1341 | [也, 就, 是, 说, 英, 国, 人, 在, 世, 博, 会, 上, 的, 英, 国, ... | [0, 0, 0, 0, 0, 0, 0, 0, 10, 20, 20, 0, 0, 0, ... | [-1, 0, 0, 0, 0, 0, 0, 0, 0, 10, 20, 20, 0, 0,... |
| 1342 | [另, 外, 意, 大, 利, 的, P, l, a, y, G, e, n, e, r, ... | [0, 0, 0, 0, 0, 0, 2, 12, 12, 12, 12, 12, 12, ... | [-1, 0, 0, 0, 0, 0, 0, 2, 12, 12, 12, 12, 12, ... |
1343 rows × 3 columns
batch_size = 16
# Load into DataLoaders and preprocess
#for i in val_loader:
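The DataLoader cell was collapsed; a minimal sketch for the training split (the validation/test loaders would be built the same way from their own splits):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

# Stack input ids, attention masks and padded labels into one tensor dataset.
tensor_ds = TensorDataset(
    torch.tensor(dataset["input_ids"]),
    torch.tensor(dataset["attention_mask"]),
    torch.tensor(list(df["pad_labels"])),
)
train_loader = DataLoader(tensor_ds, batch_size=batch_size, shuffle=True)
```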
from transformers import BertModel
# Load the model
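The model cell was collapsed. Given the `bert_lstm_crf` save path and the `crf.decode` comment below, it is presumably BERT → BiLSTM → linear → CRF; a sketch under those assumptions, with `num_labels = 21` inferred from the tag ids 0–20 (O plus B-/I- tags for ten entity types) and an arbitrary LSTM width:

```python
import torch
import torch.nn as nn
from torchcrf import CRF  # from the pytorch-crf package

class BertLstmCrf(nn.Module):
    def __init__(self, num_labels=21, lstm_hidden=128):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-chinese")  # assumed checkpoint
        self.lstm = nn.LSTM(self.bert.config.hidden_size, lstm_hidden,
                            batch_first=True, bidirectional=True)
        self.classifier = nn.Linear(2 * lstm_hidden, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask, labels=None):
        hidden = self.bert(input_ids, attention_mask=attention_mask).last_hidden_state
        emissions = self.classifier(self.lstm(hidden)[0])
        mask = attention_mask.bool()
        if labels is not None:
            # CRF tags must be valid ids, so map the -1 special-token
            # positions to 0 (the O tag) before scoring. With pytorch-crf's
            # default reduction='sum', the negative log-likelihood is summed
            # over the batch, matching the large loss values in the logs.
            return -self.crf(emissions, labels.clamp(min=0), mask=mask)
        return self.crf.decode(emissions, mask=mask)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertLstmCrf().to(device)
```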
# Define the optimizer
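The logged learning rates are consistent with a linear decay from 3e-5 to 0 over 10 × 672 steps with no warmup, so a plausible sketch is:

```python
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

epochs = 10
total_steps = epochs * len(train_loader)        # 10 x 672 = 6720
optimizer = AdamW(model.parameters(), lr=3e-5)  # peak LR inferred from the logs
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps)
```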
# Pad the crf.decode predictions with 0, then convert them to a tensor
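A sketch of that padding helper:

```python
def pad_preds(paths, max_len):
    # crf.decode returns ragged Python lists (one per sentence); pad each
    # with 0 so the whole batch stacks into a rectangular tensor.
    return torch.tensor([p + [0] * (max_len - len(p)) for p in paths])
```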
# Write the training and validation loop
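A condensed sketch of the training half; the running-average loss and the log every 134 steps mirror the printed format (the validation half, which produces the precision/recall/f1/acc lines, is omitted):

```python
import time

def train_and_eval(epoch):
    model.train()
    start, running = time.time(), 0.0
    for step, (input_ids, attention_mask, labels) in enumerate(train_loader):
        loss = model(input_ids.to(device), attention_mask.to(device),
                     labels.to(device))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
        running += loss.item()
        if step % 134 == 0 and step > 0:
            print(f"Epoch {epoch:03d} | Step {step:04d}/{len(train_loader):04d} "
                  f"| Loss {running / (step + 1):.4f} "
                  f"| Time {time.time() - start:.4f} "
                  f"| Learning rate = {scheduler.get_last_lr()[0]}")
    # Validation (omitted): decode the val set, pad the paths with pad_preds,
    # then report precision/recall/f1/acc and the average validation loss.
```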
Compare this with BERT's plain token-classification head.
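For reference, that baseline is the built-in token-classification head, which swaps the BiLSTM+CRF for a single linear layer trained with token-level cross-entropy:

```python
from transformers import BertForTokenClassification

# Same assumed checkpoint and label count as the CRF model above.
baseline = BertForTokenClassification.from_pretrained(
    "bert-base-chinese", num_labels=21)
```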
for epoch in range(1, epochs + 1):
    train_and_eval(epoch)
***** Running training epoch 1 *****
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:13: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
del sys.path[0]
Epoch 001 | Step 0134/0672 | Loss 850.4533 | Time 160.0524 | Learning rate = 2.9401785714285713e-05
Epoch 001 | Step 0268/0672 | Loss 755.6008 | Time 321.1079 | Learning rate = 2.880357142857143e-05
Epoch 001 | Step 0402/0672 | Loss 693.5408 | Time 482.2713 | Learning rate = 2.8205357142857143e-05
Epoch 001 | Step 0536/0672 | Loss 626.1115 | Time 643.2802 | Learning rate = 2.7607142857142855e-05
Epoch 001 | Step 0670/0672 | Loss 554.6280 | Time 804.1686 | Learning rate = 2.7008928571428574e-05
precision 0.55|recall 0.65|f1 0.59|acc 0.90
Average val loss:16898.68 sklearn_acc:0.90
time costed=847.54825s
-------------------------------
***** Running training epoch 2 *****
Epoch 002 | Step 0134/0672 | Loss 189.6032 | Time 161.3824 | Learning rate = 2.6401785714285714e-05
Epoch 002 | Step 0268/0672 | Loss 174.3528 | Time 321.8987 | Learning rate = 2.580357142857143e-05
Epoch 002 | Step 0402/0672 | Loss 166.6937 | Time 482.3897 | Learning rate = 2.5205357142857145e-05
Epoch 002 | Step 0536/0672 | Loss 160.9221 | Time 643.1670 | Learning rate = 2.4607142857142857e-05
Epoch 002 | Step 0670/0672 | Loss 155.2287 | Time 803.8956 | Learning rate = 2.4008928571428572e-05
precision 0.70|recall 0.78|f1 0.73|acc 0.94
Average val loss:10865.32 sklearn_acc:0.94
time costed=846.85732s
-------------------------------
***** Running training epoch 3 *****
Epoch 003 | Step 0134/0672 | Loss 104.7770 | Time 161.5656 | Learning rate = 2.3401785714285716e-05
Epoch 003 | Step 0268/0672 | Loss 104.1795 | Time 322.5888 | Learning rate = 2.2803571428571428e-05
Epoch 003 | Step 0402/0672 | Loss 102.1001 | Time 483.3302 | Learning rate = 2.2205357142857143e-05
Epoch 003 | Step 0536/0672 | Loss 100.6387 | Time 644.2039 | Learning rate = 2.1607142857142858e-05
Epoch 003 | Step 0670/0672 | Loss 100.2379 | Time 804.9622 | Learning rate = 2.100892857142857e-05
precision 0.71|recall 0.79|f1 0.75|acc 0.94
Average val loss:10455.55 sklearn_acc:0.94
time costed=847.52307s
-------------------------------
***** Running training epoch 4 *****
Epoch 004 | Step 0134/0672 | Loss 65.6758 | Time 161.4387 | Learning rate = 2.0401785714285714e-05
Epoch 004 | Step 0268/0672 | Loss 70.1131 | Time 321.9805 | Learning rate = 1.980357142857143e-05
Epoch 004 | Step 0402/0672 | Loss 74.5657 | Time 482.5696 | Learning rate = 1.920535714285714e-05
Epoch 004 | Step 0536/0672 | Loss 77.0551 | Time 643.0697 | Learning rate = 1.860714285714286e-05
Epoch 004 | Step 0670/0672 | Loss 77.2134 | Time 803.5144 | Learning rate = 1.800892857142857e-05
precision 0.72|recall 0.79|f1 0.75|acc 0.94
Average val loss:10545.26 sklearn_acc:0.94
time costed=846.01025s
-------------------------------
***** Running training epoch 5 *****
Epoch 005 | Step 0134/0672 | Loss 62.8429 | Time 161.6965 | Learning rate = 1.7401785714285716e-05
Epoch 005 | Step 0268/0672 | Loss 61.3453 | Time 322.8604 | Learning rate = 1.680357142857143e-05
Epoch 005 | Step 0402/0672 | Loss 59.3090 | Time 484.0892 | Learning rate = 1.6205357142857143e-05
Epoch 005 | Step 0536/0672 | Loss 58.5966 | Time 645.2598 | Learning rate = 1.5607142857142858e-05
Epoch 005 | Step 0670/0672 | Loss 57.8091 | Time 806.5770 | Learning rate = 1.5008928571428572e-05
precision 0.73|recall 0.81|f1 0.76|acc 0.94
Average val loss:11918.01 sklearn_acc:0.94
time costed=849.09279s
-------------------------------
***** Running training epoch 6 *****
Epoch 006 | Step 0134/0672 | Loss 40.8853 | Time 161.7673 | Learning rate = 1.4401785714285716e-05
Epoch 006 | Step 0268/0672 | Loss 40.4048 | Time 322.7583 | Learning rate = 1.3803571428571427e-05
Epoch 006 | Step 0402/0672 | Loss 40.0783 | Time 483.8099 | Learning rate = 1.3205357142857143e-05
Epoch 006 | Step 0536/0672 | Loss 39.7375 | Time 644.7180 | Learning rate = 1.2607142857142858e-05
Epoch 006 | Step 0670/0672 | Loss 40.0972 | Time 805.9105 | Learning rate = 1.2008928571428573e-05
precision 0.74|recall 0.81|f1 0.78|acc 0.94
Average val loss:11576.13 sklearn_acc:0.94
time costed=848.65729s
-------------------------------
***** Running training epoch 7 *****
Epoch 007 | Step 0134/0672 | Loss 28.1865 | Time 161.6997 | Learning rate = 1.1401785714285714e-05
Epoch 007 | Step 0268/0672 | Loss 29.4536 | Time 322.6295 | Learning rate = 1.0803571428571429e-05
Epoch 007 | Step 0402/0672 | Loss 29.7340 | Time 483.5822 | Learning rate = 1.0205357142857144e-05
Epoch 007 | Step 0536/0672 | Loss 30.0218 | Time 644.7393 | Learning rate = 9.607142857142856e-06
Epoch 007 | Step 0670/0672 | Loss 29.1320 | Time 805.7908 | Learning rate = 9.008928571428571e-06
precision 0.75|recall 0.80|f1 0.78|acc 0.94
Average val loss:13548.30 sklearn_acc:0.94
time costed=848.7225s
-------------------------------
***** Running training epoch 8 *****
Epoch 008 | Step 0134/0672 | Loss 19.5674 | Time 162.0373 | Learning rate = 8.401785714285715e-06
Epoch 008 | Step 0268/0672 | Loss 20.5346 | Time 323.2465 | Learning rate = 7.803571428571429e-06
Epoch 008 | Step 0402/0672 | Loss 20.5707 | Time 484.4724 | Learning rate = 7.205357142857143e-06
Epoch 008 | Step 0536/0672 | Loss 21.3377 | Time 645.7742 | Learning rate = 6.607142857142857e-06
Epoch 008 | Step 0670/0672 | Loss 21.3365 | Time 806.9678 | Learning rate = 6.008928571428572e-06
precision 0.75|recall 0.80|f1 0.78|acc 0.94
Average val loss:14625.68 sklearn_acc:0.94
time costed=850.00101s
-------------------------------
***** Running training epoch 9 *****
Epoch 009 | Step 0134/0672 | Loss 15.8689 | Time 162.2782 | Learning rate = 5.4017857142857145e-06
Epoch 009 | Step 0268/0672 | Loss 15.7494 | Time 323.9999 | Learning rate = 4.803571428571428e-06
Epoch 009 | Step 0402/0672 | Loss 16.1067 | Time 485.5929 | Learning rate = 4.205357142857143e-06
Epoch 009 | Step 0536/0672 | Loss 15.9825 | Time 647.0968 | Learning rate = 3.6071428571428573e-06
Epoch 009 | Step 0670/0672 | Loss 15.6784 | Time 808.7016 | Learning rate = 3.0089285714285717e-06
precision 0.74|recall 0.81|f1 0.77|acc 0.94
Average val loss:15238.61 sklearn_acc:0.94
time costed=851.86068s
-------------------------------
***** Running training epoch 10 *****
Epoch 010 | Step 0134/0672 | Loss 12.5352 | Time 162.6241 | Learning rate = 2.401785714285714e-06
Epoch 010 | Step 0268/0672 | Loss 12.0666 | Time 327.8486 | Learning rate = 1.8035714285714286e-06
Epoch 010 | Step 0402/0672 | Loss 12.0815 | Time 492.9378 | Learning rate = 1.205357142857143e-06
Epoch 010 | Step 0536/0672 | Loss 12.2230 | Time 658.1718 | Learning rate = 6.071428571428572e-07
Epoch 010 | Step 0670/0672 | Loss 11.8025 | Time 823.4604 | Learning rate = 8.92857142857143e-09
precision 0.75|recall 0.80|f1 0.77|acc 0.94
Average val loss:15777.25 sklearn_acc:0.94
time costed=868.35004s
-------------------------------
Note: if the predicted labels do not include some label present in the ground truth, the metric call raises a warning: "UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior."
#torch.save(model.state_dict(), "./bert_lstm_crf/finall_blf_model")
# Write the predict function
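A sketch matching the call signature used at the end of this section (`y_pred, y_true, predictions = predict(...)`):

```python
def predict(model, loader):
    # Returns flat predicted ids, flat gold ids, and per-sentence decoded paths.
    model.eval()
    y_pred, y_true, predictions = [], [], []
    with torch.no_grad():
        for input_ids, attention_mask, labels in loader:
            paths = model(input_ids.to(device), attention_mask.to(device))
            predictions.extend(paths)
            for path, gold in zip(paths, labels):
                y_pred.extend(path)
                # decode already trims each path to its mask length; trim gold
                # to match, mapping -1 special-token positions to 0 as in training.
                y_true.extend(gold[:len(path)].clamp(min=0).tolist())
    return y_pred, y_true, predictions
```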
# Predict on the validation set and inspect the metrics for each token category
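The table below matches the output shape of the HuggingFace seqeval metric: per-entity rows plus scalar `overall_*` keys. A sketch, where `id2label`, `pred_paths` and `true_paths` are hypothetical names for the id-to-BIO-string mapping and the per-sentence tag sequences:

```python
from datasets import load_metric

# id2label maps tag ids to BIO strings such as "B-address"/"I-address";
# seqeval then scores whole entity spans rather than single tokens.
metric = load_metric("seqeval")
results = metric.compute(
    predictions=[[id2label[i] for i in path] for path in pred_paths],
    references=[[id2label[i] for i in path] for path in true_paths],
)
pd.DataFrame(results).T
```

Because the `overall_*` values are scalars, pandas broadcasts them across every column, which is why those rows repeat a single number in the table.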
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:13: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
del sys.path[0]
| | precision | recall | f1 | number |
|---|---|---|---|---|
| address | 0.555288 | 0.619303 | 0.585551 | 373.000000 |
| scene | 0.697115 | 0.693780 | 0.695444 | 209.000000 |
| overall_precision | 0.747256 | 0.747256 | 0.747256 | 0.747256 |
| organization | 0.721805 | 0.784741 | 0.751958 | 367.000000 |
| overall_f1 | 0.771725 | 0.771725 | 0.771725 | 0.771725 |
| government | 0.739130 | 0.825911 | 0.780115 | 247.000000 |
| position | 0.774554 | 0.801386 | 0.787741 | 433.000000 |
| book | 0.775000 | 0.805195 | 0.789809 | 154.000000 |
| company | 0.759124 | 0.825397 | 0.790875 | 378.000000 |
| overall_recall | 0.797852 | 0.797852 | 0.797852 | 0.797852 |
| movie | 0.807947 | 0.807947 | 0.807947 | 151.000000 |
| game | 0.791925 | 0.864407 | 0.826580 | 295.000000 |
| name | 0.865031 | 0.909677 | 0.886792 | 465.000000 |
| overall_accuracy | 0.938997 | 0.938997 | 0.938997 | 0.938997 |
# Display the validation-set ground-truth labels alongside the predictions
| | words | labels | preds |
|---|---|---|---|
| 0 | [彭, 小, 军, 认, 为, ,, 国, 内, 银, 行, 现, 在, 走, 的, 是, ... | [7, 17, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0... | [7, 17, 17, 0, 0, 0, 3, 13, 13, 13, 0, 0, 0, 0... |
| 1 | [温, 格, 的, 球, 队, 终, 于, 又, 踢, 了, 一, 场, 经, 典, 的, ... | [7, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... | [7, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... |
| 2 | [突, 袭, 黑, 暗, 雅, 典, 娜, 》, 中, R, i, d, d, i, c, ... | [4, 14, 14, 14, 14, 14, 14, 14, 0, 7, 17, 17, ... | [6, 16, 16, 16, 16, 16, 16, 16, 0, 7, 17, 17, ... |
| 3 | [郑, 阿, 姨, 就, 赶, 到, 文, 汇, 路, 排, 队, 拿, 钱, ,, 希, ... | [0, 0, 0, 0, 0, 0, 1, 11, 11, 0, 0, 0, 0, 0, 0... | [0, 0, 0, 0, 0, 0, 1, 11, 11, 0, 0, 0, 0, 0, 0... |
| 4 | [我, 想, 站, 在, 雪, 山, 脚, 下, 你, 会, 被, 那, 巍, 峨, 的, ... | [0, 0, 0, 0, 10, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... |
| ... | ... | ... | ... |
| 1338 | [在, 这, 个, 非, 常, 喜, 庆, 的, 日, 子, 里, ,, 我, 们, 首, ... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... |
| 1339 | [姜, 哲, 中, :, 公, 共, 之, 敌, 1, -, 1, 》, 、, 《, 神, ... | [6, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16... | [6, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16... |
| 1340 | [目, 前, ,, 日, 本, 松, 山, 海, 上, 保, 安, 部, 正, 在, 就, ... | [0, 0, 0, 5, 15, 15, 15, 15, 15, 15, 15, 15, 0... | [0, 0, 0, 5, 15, 15, 15, 15, 15, 15, 15, 15, 0... |
| 1341 | [也, 就, 是, 说, 英, 国, 人, 在, 世, 博, 会, 上, 的, 英, 国, ... | [0, 0, 0, 0, 0, 0, 0, 0, 10, 20, 20, 0, 0, 0, ... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 20... |
| 1342 | [另, 外, 意, 大, 利, 的, P, l, a, y, G, e, n, e, r, ... | [0, 0, 0, 0, 0, 0, 2, 12, 12, 12, 12, 12, 12, ... | [0, 0, 1, 11, 11, 0, 2, 12, 12, 12, 12, 12, 12... |
1343 rows × 3 columns
Use the model to predict the validation-set results and compare them against the original labels.
y_pred, y_true, predictions = predict(model, test_loader)
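The side-by-side table above can then be assembled from these outputs; a sketch, with `val_df` as a hypothetical validation DataFrame:

```python
# Drop the [CLS] position and trim each decoded path to its sentence length,
# then show ground truth and prediction next to each other.
val_df["preds"] = [p[1:len(w) + 1] for p, w in zip(predictions, val_df["words"])]
val_df[["words", "labels", "preds"]]
```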