From 3e6f6f85b4d68ffd987d794bccf1aa08fe752e85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=85=B7=E7=9B=96?= <10061212+wh1051899053@user.noreply.gitee.com> Date: Wed, 1 Dec 2021 07:51:09 +0000 Subject: [PATCH 01/11] =?UTF-8?q?=E7=AC=AC=E4=B8=80=E6=AC=A1=E6=8F=90?= =?UTF-8?q?=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- glove_bilstm.ipynb | 597 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 597 insertions(+) create mode 100644 glove_bilstm.ipynb diff --git a/glove_bilstm.ipynb b/glove_bilstm.ipynb new file mode 100644 index 0000000..787f9d5 --- /dev/null +++ b/glove_bilstm.ipynb @@ -0,0 +1,597 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "fe8db7fb", + "metadata": {}, + "source": [ + "安装包" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e831646d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: http://repo.myhuaweicloud.com/repository/pypi/simple\n", + "Collecting gensim\n", + " Downloading http://repo.myhuaweicloud.com/repository/pypi/packages/9f/44/985c6291f160aca1257dae9b5bb62d91d0f61f12014297a2fa80e6464be1/gensim-4.1.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)\n", + "\u001b[K |████████████████████████████████| 24.1 MB 30.0 MB/s eta 0:00:016 MB 30.0 MB/s eta 0:00:01B 30.0 MB/s eta 0:00:01███████▉| 23.9 MB 30.0 MB/s eta 0:00:01\n", + "\u001b[?25hRequirement already satisfied: scipy>=0.18.1 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages (from gensim) (1.7.2)\n", + "Collecting smart-open>=1.8.1\n", + " Downloading http://repo.myhuaweicloud.com/repository/pypi/packages/cd/11/05f68ea934c24ade38e95ac30a38407767787c4e3db1776eae4886ad8c95/smart_open-5.2.1-py3-none-any.whl (58 kB)\n", + "\u001b[K |████████████████████████████████| 58 kB 26.0 MB/s eta 0:00:01\n", + "\u001b[?25hRequirement already satisfied: numpy>=1.17.0 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages (from gensim) (1.21.4)\n", + "Installing collected packages: smart-open, gensim\n", + "Successfully installed gensim-4.1.2 smart-open-5.2.1\n", + "\u001b[33mWARNING: You are using pip version 21.0.1; however, version 21.3 is available.\n", + "You should consider upgrading via the '/home/ma-user/anaconda3/envs/MindSpore/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" + ] + } + ], + "source": [ + "!pip install gensim" + ] + }, + { + "cell_type": "markdown", + "id": "ad7c54d6", + "metadata": {}, + "source": [ + "导入包" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "44aa6bcd", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import math\n", + "from itertools import chain\n", + "import gensim\n", + "import numpy as np\n", + "from mindspore.mindrecord import FileWriter" + ] + }, + { + "cell_type": "markdown", + "id": "74543d4c", + "metadata": {}, + "source": [ + "### 1. 
读入数据" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "5a2b601c", + "metadata": {}, + "outputs": [], + "source": [ + "# 定义读入数据的函数\n", + "def read_imdb(path, seg='train'):\n", + " labels = ['pos', 'neg']\n", + " data = []\n", + " for label in labels:\n", + " # rf:./aclImdb/seg/label.txt, eg:./aclImdb/train/pos.txt\n", + " with open(os.path.join(path, seg, label) + '.txt', 'r', encoding='utf8') as rf:\n", + " for review in rf.readlines():\n", + " review = review.replace('\\n', '')\n", + " if label == 'pos':\n", + " data.append([review, 1])\n", + " elif label == 'neg':\n", + " data.append([review, 0])\n", + " return data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "df68cfff", + "metadata": {}, + "outputs": [], + "source": [ + "imdb_data_path='./aclImdb'\n", + "raw_data_train = read_imdb(imdb_data_path, seg='train')\n", + "raw_data_test = read_imdb(imdb_data_path, seg='test')" + ] + }, + { + "cell_type": "markdown", + "id": "2204a189", + "metadata": {}, + "source": [ + "### 2. 分词" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c3619ef5", + "metadata": {}, + "outputs": [], + "source": [ + "# 定义分词函数\n", + "def tokenize_samples(raw_data):\n", + " tokenized_data = []\n", + " for review in raw_data:\n", + " tokenized_data.append([tok.lower() for tok in review.split()])\n", + " return tokenized_data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "eaa71af9", + "metadata": {}, + "outputs": [], + "source": [ + "# tokenized_data:[[word,word,word,...],...]\n", + "tokenized_data_train = tokenize_samples([review for review, _ in raw_data_train])\n", + "tokenized_data_test = tokenize_samples([review for review, _ in raw_data_test])" + ] + }, + { + "cell_type": "markdown", + "id": "4d8e86a7", + "metadata": {}, + "source": [ + "### 3. word2idx" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "bb31338d", + "metadata": {}, + "outputs": [], + "source": [ + "# 建立word->index的字典,用作后面将文字转化为数字序列\n", + "vocab = set(chain(*tokenized_data_train))\n", + "word_to_idx = {word: i+1 for i, word in enumerate(vocab)}\n", + "word_to_idx[''] = 0" + ] + }, + { + "cell_type": "markdown", + "id": "51c1ed8e", + "metadata": {}, + "source": [ + "### 4. embedding" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ce4f1a73", + "metadata": {}, + "outputs": [], + "source": [ + "# !sed -i '1i\\400000 100' ./glove.6B.100d.txt\n", + "def load_embeddings(glove_file_path, word_to_idx, embed_size=100):\n", + " word2vector = gensim.models.KeyedVectors.load_word2vec_format(\n", + " glove_file_path, binary=False, encoding='utf-8')\n", + " assert embed_size == word2vector.vector_size\n", + " embeddings = np.zeros((len(word_to_idx), embed_size)).astype(np.float32)\n", + " for word, idx in word_to_idx.items():\n", + " try:\n", + " embeddings[idx, :] = word2vector.get_vector(word)\n", + " except KeyError:\n", + " continue\n", + " return embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "899ce46a", + "metadata": {}, + "outputs": [], + "source": [ + "embeddings = load_embeddings('./glove/glove.6B.100d.txt', word_to_idx)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d3e93498", + "metadata": {}, + "outputs": [], + "source": [ + "np.savetxt(\"./cache/weight.txt\", embeddings)" + ] + }, + { + "cell_type": "markdown", + "id": "b0a6045f", + "metadata": {}, + "source": [ + "### 5. 
encode" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d83aaf7c", + "metadata": {}, + "outputs": [], + "source": [ + "def encode_samples(tokenized_samples, word_to_idx):\n", + " \"\"\"\n", + " tokenized_samples: [[word, word, ...]]\n", + " word_to_idx: {word:idx, word:idx, ...}\n", + " features: [[idx, idx, ...], [idx, idx, ...], ...]\n", + " \"\"\"\n", + " features = []\n", + " for sample in tokenized_samples:\n", + " feature = []\n", + " for token in sample:\n", + " feature.append(word_to_idx.get(token, 0))\n", + " features.append(feature)\n", + " return features" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "fd203f67", + "metadata": {}, + "outputs": [], + "source": [ + "def pad_samples(features, maxlen=500, pad=0):\n", + " padded_features = []\n", + " for feature in features:\n", + " if len(feature) >= maxlen:\n", + " padded_feature = feature[:maxlen]\n", + " else:\n", + " padded_feature = feature\n", + " while len(padded_feature) < maxlen:\n", + " padded_feature.append(pad)\n", + " padded_features.append(padded_feature)\n", + " return padded_features" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "5dae973a", + "metadata": {}, + "outputs": [], + "source": [ + "y_train = np.array([label for _, label in raw_data_train]).astype(np.int32)\n", + "y_test = np.array([label for _, label in raw_data_test]).astype(np.int32)\n", + "\n", + "X_train = np.array(pad_samples(encode_samples(tokenized_data_train, word_to_idx))).astype(np.int32)\n", + "X_test = np.array(pad_samples(encode_samples(tokenized_data_test, word_to_idx))).astype(np.int32)" + ] + }, + { + "cell_type": "markdown", + "id": "324b1291", + "metadata": {}, + "source": [ + "### 6. convert to mindrecord" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "80a0e3b2", + "metadata": {}, + "outputs": [], + "source": [ + "def get_json_data_list(X, y):\n", + " data_list = []\n", + " for i, (feature, label) in enumerate(zip(X, y)):\n", + " data_json = {\"id\": i, \"feature\": feature.reshape(-1), \"label\": int(label)}\n", + " data_list.append(data_json)\n", + " return data_list" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "a9440916", + "metadata": {}, + "outputs": [], + "source": [ + "def convert_np_to_mindrecord(X_train, y_train, X_test, y_test, mindrecord_save_path=\"./mindrecord\"):\n", + " schema_json = {\"id\": {\"type\": \"int32\"},\n", + " \"label\": {\"type\": \"int32\"},\n", + " \"feature\": {\"type\": \"int32\", \"shape\": [-1]}}\n", + " writer = FileWriter(os.path.join(mindrecord_save_path, \"aclImdb_train.mindrecord\"), shard_num=4)\n", + " data_train = get_json_data_list(X_train, y_train)\n", + " writer.add_schema(schema_json, \"nlp_schema\")\n", + " writer.add_index([\"id\", \"label\"])\n", + " writer.write_raw_data(data_train)\n", + " writer.commit()\n", + " \n", + " writer = FileWriter(os.path.join(mindrecord_save_path, \"aclImdb_test.mindrecord\"), shard_num=4)\n", + " data_test = get_json_data_list(X_test, y_test)\n", + " writer.add_schema(schema_json, \"nlp_schema\")\n", + " writer.add_index([\"id\", \"label\"])\n", + " writer.write_raw_data(data_test)\n", + " writer.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "8a3dad17", + "metadata": {}, + "outputs": [], + "source": [ + "convert_np_to_mindrecord(X_train, y_train, X_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "e6952e97", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "aclImdb_test.mindrecord0 aclImdb_train.mindrecord0\n", + "aclImdb_test.mindrecord0.db aclImdb_train.mindrecord0.db\n", + "aclImdb_test.mindrecord1 aclImdb_train.mindrecord1\n", + "aclImdb_test.mindrecord1.db aclImdb_train.mindrecord1.db\n", + "aclImdb_test.mindrecord2 aclImdb_train.mindrecord2\n", + "aclImdb_test.mindrecord2.db aclImdb_train.mindrecord2.db\n", + "aclImdb_test.mindrecord3 aclImdb_train.mindrecord3\n", + "aclImdb_test.mindrecord3.db aclImdb_train.mindrecord3.db\n" + ] + } + ], + "source": [ + "!ls ./mindrecord" + ] + }, + { + "cell_type": "markdown", + "id": "500a2411", + "metadata": {}, + "source": [ + "### 7. 创建数据集" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "9108f4e0", + "metadata": {}, + "outputs": [], + "source": [ + "import mindspore.dataset as mds" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "edc4efdd", + "metadata": {}, + "outputs": [], + "source": [ + "def create_dataset(base_path, batch_size, num_epochs, is_train):\n", + " columns_list = [\"feature\", \"label\"]\n", + " num_consumer = 4\n", + " if is_train:\n", + " path = os.path.join(base_path, \"aclImdb_train.mindrecord0\")\n", + " else:\n", + " path = os.path.join(base_path, \"aclImdb_test.mindrecord0\")\n", + " dataset = mds.MindDataset(path, columns_list=[\"feature\", \"label\"], num_parallel_workers=4)\n", + " dataset = dataset.shuffle(buffer_size=dataset.get_dataset_size())\n", + " dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)\n", + " dataset = dataset.repeat(count=num_epochs)\n", + " return dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "7782b64e", + "metadata": {}, + "outputs": [], + "source": [ + "dataset_train = create_dataset(\"./mindrecord\", batch_size=32, num_epochs=10, is_train=True)" + ] + }, + { + "cell_type": "markdown", + "id": "452b266e", + "metadata": {}, + "source": [ + "### 8. 
定义模型并训练" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "a6f692fd", + "metadata": {}, + "outputs": [], + "source": [ + "from mindspore import Tensor, nn, Model, context, Parameter\n", + "from mindspore.common.initializer import initializer\n", + "from mindspore.ops import operations as P\n", + "from mindspore.nn import Accuracy,LSTM\n", + "from mindspore.train.callback import LossMonitor, CheckpointConfig, ModelCheckpoint, TimeMonitor\n", + "from lstm import SentimentNet" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "50bcb227", + "metadata": {}, + "outputs": [], + "source": [ + "embedding_tabel = np.loadtxt(os.path.join(\"./cache\", \"weight.txt\")).astype(np.float32)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "ad0cec2f", + "metadata": {}, + "outputs": [], + "source": [ + "network = SentimentNet(vocab_size=embedding_tabel.shape[0],\n", + " embed_size=100,\n", + " num_hiddens=100,\n", + " num_layers=2,\n", + " bidirectional=False,\n", + " num_classes=2,\n", + " weight=Tensor(embedding_tabel),\n", + " batch_size=32)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "18e4ad24", + "metadata": {}, + "outputs": [], + "source": [ + "loss = nn.SoftmaxCrossEntropyWithLogits(reduction='mean', sparse=True)\n", + "opt = nn.Momentum(network.trainable_params(), 0.1, 0.9)\n", + "loss_callback = LossMonitor(per_print_times=3000)\n", + "model = Model(network, loss, opt, {'acc': Accuracy()})\n", + "config_ck = CheckpointConfig(save_checkpoint_steps=390, keep_checkpoint_max=10)\n", + "checkpoint_cb = ModelCheckpoint(prefix=\"lstm\", directory=\"./model\", config=config_ck)\n", + "time_cb = TimeMonitor(data_size=dataset_train.get_dataset_size())" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "9fec8d6b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch: 1 step: 3000, loss is 0.6625426\n", + "epoch: 1 step: 6000, loss is 0.6949122\n", + "epoch time: 300260.071 ms, per step time: 38.446 ms\n", + "epoch: 2 step: 1190, loss is 0.7159466\n", + "epoch: 2 step: 4190, loss is 0.66605085\n", + "epoch: 2 step: 7190, loss is 0.59723616\n", + "epoch time: 227865.086 ms, per step time: 29.176 ms\n", + "epoch: 3 step: 2380, loss is 0.63499504\n", + "epoch: 3 step: 5380, loss is 0.6277052\n", + "epoch time: 228694.479 ms, per step time: 29.282 ms\n", + "epoch: 4 step: 570, loss is 0.5825846\n", + "epoch: 4 step: 3570, loss is 0.6407242\n", + "epoch: 4 step: 6570, loss is 0.6360452\n", + "epoch time: 228434.188 ms, per step time: 29.249 ms\n", + "epoch: 5 step: 1760, loss is 0.618327\n", + "epoch: 5 step: 4760, loss is 0.34937367\n", + "epoch: 5 step: 7760, loss is 0.3124014\n", + "epoch time: 228633.055 ms, per step time: 29.274 ms\n", + "epoch: 6 step: 2950, loss is 0.6037954\n", + "epoch: 6 step: 5950, loss is 0.49445567\n", + "epoch time: 227237.265 ms, per step time: 29.096 ms\n", + "epoch: 7 step: 1140, loss is 0.19343969\n", + "epoch: 7 step: 4140, loss is 0.2573592\n", + "epoch: 7 step: 7140, loss is 0.11479706\n", + "epoch time: 228353.065 ms, per step time: 29.239 ms\n", + "epoch: 8 step: 2330, loss is 0.32086658\n", + "epoch: 8 step: 5330, loss is 0.20143133\n", + "epoch time: 228068.243 ms, per step time: 29.202 ms\n", + "epoch: 9 step: 520, loss is 0.081819385\n", + "epoch: 9 step: 3520, loss is 0.3052325\n", + "epoch: 9 step: 6520, loss is 0.40288585\n", + "epoch time: 227941.615 ms, per step time: 29.186 ms\n", + "epoch: 10 step: 1710, loss 
is 0.012856578\n", + "epoch: 10 step: 4710, loss is 0.06560053\n", + "epoch: 10 step: 7710, loss is 0.07241162\n", + "epoch time: 228342.392 ms, per step time: 29.237 ms\n" + ] + } + ], + "source": [ + "from mindspore import context\n", + "context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target=\"GPU\")\n", + "model.train(10, dataset_train, callbacks=[time_cb, checkpoint_cb, loss_callback], dataset_sink_mode=False)" + ] + }, + { + "cell_type": "markdown", + "id": "0089756b", + "metadata": {}, + "source": [ + "### 9. 评估模型" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "eb6d15da", + "metadata": {}, + "outputs": [], + "source": [ + "dataset_test = create_dataset(\"./mindrecord\", batch_size=32, num_epochs=10, is_train=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "d89f2d0b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "accuracy:{'acc': 0.813644366197183}\n" + ] + } + ], + "source": [ + "acc = model.eval(dataset_test)\n", + "print(\"accuracy:{}\".format(acc))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5c1d85a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "MindSpore", + "language": "python", + "name": "mindspore" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} -- Gitee From e9bff42ecde06576fa51c112bce8f73936f3acb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=85=B7=E7=9B=96?= <10061212+wh1051899053@user.noreply.gitee.com> Date: Wed, 1 Dec 2021 07:51:29 +0000 Subject: [PATCH 02/11] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20gl?= =?UTF-8?q?ove=5Fbilstm.ipynb?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- glove_bilstm.ipynb | 597 --------------------------------------------- 1 file changed, 597 deletions(-) delete mode 100644 glove_bilstm.ipynb diff --git a/glove_bilstm.ipynb b/glove_bilstm.ipynb deleted file mode 100644 index 787f9d5..0000000 --- a/glove_bilstm.ipynb +++ /dev/null @@ -1,597 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "fe8db7fb", - "metadata": {}, - "source": [ - "安装包" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "e831646d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Looking in indexes: http://repo.myhuaweicloud.com/repository/pypi/simple\n", - "Collecting gensim\n", - " Downloading http://repo.myhuaweicloud.com/repository/pypi/packages/9f/44/985c6291f160aca1257dae9b5bb62d91d0f61f12014297a2fa80e6464be1/gensim-4.1.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)\n", - "\u001b[K |████████████████████████████████| 24.1 MB 30.0 MB/s eta 0:00:016 MB 30.0 MB/s eta 0:00:01B 30.0 MB/s eta 0:00:01███████▉| 23.9 MB 30.0 MB/s eta 0:00:01\n", - "\u001b[?25hRequirement already satisfied: scipy>=0.18.1 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages (from gensim) (1.7.2)\n", - "Collecting smart-open>=1.8.1\n", - " Downloading http://repo.myhuaweicloud.com/repository/pypi/packages/cd/11/05f68ea934c24ade38e95ac30a38407767787c4e3db1776eae4886ad8c95/smart_open-5.2.1-py3-none-any.whl (58 kB)\n", - "\u001b[K 
|████████████████████████████████| 58 kB 26.0 MB/s eta 0:00:01\n", - "\u001b[?25hRequirement already satisfied: numpy>=1.17.0 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages (from gensim) (1.21.4)\n", - "Installing collected packages: smart-open, gensim\n", - "Successfully installed gensim-4.1.2 smart-open-5.2.1\n", - "\u001b[33mWARNING: You are using pip version 21.0.1; however, version 21.3 is available.\n", - "You should consider upgrading via the '/home/ma-user/anaconda3/envs/MindSpore/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" - ] - } - ], - "source": [ - "!pip install gensim" - ] - }, - { - "cell_type": "markdown", - "id": "ad7c54d6", - "metadata": {}, - "source": [ - "导入包" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "44aa6bcd", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import math\n", - "from itertools import chain\n", - "import gensim\n", - "import numpy as np\n", - "from mindspore.mindrecord import FileWriter" - ] - }, - { - "cell_type": "markdown", - "id": "74543d4c", - "metadata": {}, - "source": [ - "### 1. 读入数据" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "5a2b601c", - "metadata": {}, - "outputs": [], - "source": [ - "# 定义读入数据的函数\n", - "def read_imdb(path, seg='train'):\n", - " labels = ['pos', 'neg']\n", - " data = []\n", - " for label in labels:\n", - " # rf:./aclImdb/seg/label.txt, eg:./aclImdb/train/pos.txt\n", - " with open(os.path.join(path, seg, label) + '.txt', 'r', encoding='utf8') as rf:\n", - " for review in rf.readlines():\n", - " review = review.replace('\\n', '')\n", - " if label == 'pos':\n", - " data.append([review, 1])\n", - " elif label == 'neg':\n", - " data.append([review, 0])\n", - " return data" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "df68cfff", - "metadata": {}, - "outputs": [], - "source": [ - "imdb_data_path='./aclImdb'\n", - "raw_data_train = read_imdb(imdb_data_path, seg='train')\n", - "raw_data_test = read_imdb(imdb_data_path, seg='test')" - ] - }, - { - "cell_type": "markdown", - "id": "2204a189", - "metadata": {}, - "source": [ - "### 2. 分词" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "c3619ef5", - "metadata": {}, - "outputs": [], - "source": [ - "# 定义分词函数\n", - "def tokenize_samples(raw_data):\n", - " tokenized_data = []\n", - " for review in raw_data:\n", - " tokenized_data.append([tok.lower() for tok in review.split()])\n", - " return tokenized_data" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "eaa71af9", - "metadata": {}, - "outputs": [], - "source": [ - "# tokenized_data:[[word,word,word,...],...]\n", - "tokenized_data_train = tokenize_samples([review for review, _ in raw_data_train])\n", - "tokenized_data_test = tokenize_samples([review for review, _ in raw_data_test])" - ] - }, - { - "cell_type": "markdown", - "id": "4d8e86a7", - "metadata": {}, - "source": [ - "### 3. word2idx" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "bb31338d", - "metadata": {}, - "outputs": [], - "source": [ - "# 建立word->index的字典,用作后面将文字转化为数字序列\n", - "vocab = set(chain(*tokenized_data_train))\n", - "word_to_idx = {word: i+1 for i, word in enumerate(vocab)}\n", - "word_to_idx[''] = 0" - ] - }, - { - "cell_type": "markdown", - "id": "51c1ed8e", - "metadata": {}, - "source": [ - "### 4. 
embedding" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "ce4f1a73", - "metadata": {}, - "outputs": [], - "source": [ - "# !sed -i '1i\\400000 100' ./glove.6B.100d.txt\n", - "def load_embeddings(glove_file_path, word_to_idx, embed_size=100):\n", - " word2vector = gensim.models.KeyedVectors.load_word2vec_format(\n", - " glove_file_path, binary=False, encoding='utf-8')\n", - " assert embed_size == word2vector.vector_size\n", - " embeddings = np.zeros((len(word_to_idx), embed_size)).astype(np.float32)\n", - " for word, idx in word_to_idx.items():\n", - " try:\n", - " embeddings[idx, :] = word2vector.get_vector(word)\n", - " except KeyError:\n", - " continue\n", - " return embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "899ce46a", - "metadata": {}, - "outputs": [], - "source": [ - "embeddings = load_embeddings('./glove/glove.6B.100d.txt', word_to_idx)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "d3e93498", - "metadata": {}, - "outputs": [], - "source": [ - "np.savetxt(\"./cache/weight.txt\", embeddings)" - ] - }, - { - "cell_type": "markdown", - "id": "b0a6045f", - "metadata": {}, - "source": [ - "### 5. encode" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "d83aaf7c", - "metadata": {}, - "outputs": [], - "source": [ - "def encode_samples(tokenized_samples, word_to_idx):\n", - " \"\"\"\n", - " tokenized_samples: [[word, word, ...]]\n", - " word_to_idx: {word:idx, word:idx, ...}\n", - " features: [[idx, idx, ...], [idx, idx, ...], ...]\n", - " \"\"\"\n", - " features = []\n", - " for sample in tokenized_samples:\n", - " feature = []\n", - " for token in sample:\n", - " feature.append(word_to_idx.get(token, 0))\n", - " features.append(feature)\n", - " return features" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "fd203f67", - "metadata": {}, - "outputs": [], - "source": [ - "def pad_samples(features, maxlen=500, pad=0):\n", - " padded_features = []\n", - " for feature in features:\n", - " if len(feature) >= maxlen:\n", - " padded_feature = feature[:maxlen]\n", - " else:\n", - " padded_feature = feature\n", - " while len(padded_feature) < maxlen:\n", - " padded_feature.append(pad)\n", - " padded_features.append(padded_feature)\n", - " return padded_features" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "5dae973a", - "metadata": {}, - "outputs": [], - "source": [ - "y_train = np.array([label for _, label in raw_data_train]).astype(np.int32)\n", - "y_test = np.array([label for _, label in raw_data_test]).astype(np.int32)\n", - "\n", - "X_train = np.array(pad_samples(encode_samples(tokenized_data_train, word_to_idx))).astype(np.int32)\n", - "X_test = np.array(pad_samples(encode_samples(tokenized_data_test, word_to_idx))).astype(np.int32)" - ] - }, - { - "cell_type": "markdown", - "id": "324b1291", - "metadata": {}, - "source": [ - "### 6. 
convert to mindrecord" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "80a0e3b2", - "metadata": {}, - "outputs": [], - "source": [ - "def get_json_data_list(X, y):\n", - " data_list = []\n", - " for i, (feature, label) in enumerate(zip(X, y)):\n", - " data_json = {\"id\": i, \"feature\": feature.reshape(-1), \"label\": int(label)}\n", - " data_list.append(data_json)\n", - " return data_list" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "a9440916", - "metadata": {}, - "outputs": [], - "source": [ - "def convert_np_to_mindrecord(X_train, y_train, X_test, y_test, mindrecord_save_path=\"./mindrecord\"):\n", - " schema_json = {\"id\": {\"type\": \"int32\"},\n", - " \"label\": {\"type\": \"int32\"},\n", - " \"feature\": {\"type\": \"int32\", \"shape\": [-1]}}\n", - " writer = FileWriter(os.path.join(mindrecord_save_path, \"aclImdb_train.mindrecord\"), shard_num=4)\n", - " data_train = get_json_data_list(X_train, y_train)\n", - " writer.add_schema(schema_json, \"nlp_schema\")\n", - " writer.add_index([\"id\", \"label\"])\n", - " writer.write_raw_data(data_train)\n", - " writer.commit()\n", - " \n", - " writer = FileWriter(os.path.join(mindrecord_save_path, \"aclImdb_test.mindrecord\"), shard_num=4)\n", - " data_test = get_json_data_list(X_test, y_test)\n", - " writer.add_schema(schema_json, \"nlp_schema\")\n", - " writer.add_index([\"id\", \"label\"])\n", - " writer.write_raw_data(data_test)\n", - " writer.commit()" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "8a3dad17", - "metadata": {}, - "outputs": [], - "source": [ - "convert_np_to_mindrecord(X_train, y_train, X_test, y_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "e6952e97", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "aclImdb_test.mindrecord0 aclImdb_train.mindrecord0\n", - "aclImdb_test.mindrecord0.db aclImdb_train.mindrecord0.db\n", - "aclImdb_test.mindrecord1 aclImdb_train.mindrecord1\n", - "aclImdb_test.mindrecord1.db aclImdb_train.mindrecord1.db\n", - "aclImdb_test.mindrecord2 aclImdb_train.mindrecord2\n", - "aclImdb_test.mindrecord2.db aclImdb_train.mindrecord2.db\n", - "aclImdb_test.mindrecord3 aclImdb_train.mindrecord3\n", - "aclImdb_test.mindrecord3.db aclImdb_train.mindrecord3.db\n" - ] - } - ], - "source": [ - "!ls ./mindrecord" - ] - }, - { - "cell_type": "markdown", - "id": "500a2411", - "metadata": {}, - "source": [ - "### 7. 
创建数据集" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "9108f4e0", - "metadata": {}, - "outputs": [], - "source": [ - "import mindspore.dataset as mds" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "edc4efdd", - "metadata": {}, - "outputs": [], - "source": [ - "def create_dataset(base_path, batch_size, num_epochs, is_train):\n", - " columns_list = [\"feature\", \"label\"]\n", - " num_consumer = 4\n", - " if is_train:\n", - " path = os.path.join(base_path, \"aclImdb_train.mindrecord0\")\n", - " else:\n", - " path = os.path.join(base_path, \"aclImdb_test.mindrecord0\")\n", - " dataset = mds.MindDataset(path, columns_list=[\"feature\", \"label\"], num_parallel_workers=4)\n", - " dataset = dataset.shuffle(buffer_size=dataset.get_dataset_size())\n", - " dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)\n", - " dataset = dataset.repeat(count=num_epochs)\n", - " return dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "7782b64e", - "metadata": {}, - "outputs": [], - "source": [ - "dataset_train = create_dataset(\"./mindrecord\", batch_size=32, num_epochs=10, is_train=True)" - ] - }, - { - "cell_type": "markdown", - "id": "452b266e", - "metadata": {}, - "source": [ - "### 8. 定义模型并训练" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "a6f692fd", - "metadata": {}, - "outputs": [], - "source": [ - "from mindspore import Tensor, nn, Model, context, Parameter\n", - "from mindspore.common.initializer import initializer\n", - "from mindspore.ops import operations as P\n", - "from mindspore.nn import Accuracy,LSTM\n", - "from mindspore.train.callback import LossMonitor, CheckpointConfig, ModelCheckpoint, TimeMonitor\n", - "from lstm import SentimentNet" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "50bcb227", - "metadata": {}, - "outputs": [], - "source": [ - "embedding_tabel = np.loadtxt(os.path.join(\"./cache\", \"weight.txt\")).astype(np.float32)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "ad0cec2f", - "metadata": {}, - "outputs": [], - "source": [ - "network = SentimentNet(vocab_size=embedding_tabel.shape[0],\n", - " embed_size=100,\n", - " num_hiddens=100,\n", - " num_layers=2,\n", - " bidirectional=False,\n", - " num_classes=2,\n", - " weight=Tensor(embedding_tabel),\n", - " batch_size=32)" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "18e4ad24", - "metadata": {}, - "outputs": [], - "source": [ - "loss = nn.SoftmaxCrossEntropyWithLogits(reduction='mean', sparse=True)\n", - "opt = nn.Momentum(network.trainable_params(), 0.1, 0.9)\n", - "loss_callback = LossMonitor(per_print_times=3000)\n", - "model = Model(network, loss, opt, {'acc': Accuracy()})\n", - "config_ck = CheckpointConfig(save_checkpoint_steps=390, keep_checkpoint_max=10)\n", - "checkpoint_cb = ModelCheckpoint(prefix=\"lstm\", directory=\"./model\", config=config_ck)\n", - "time_cb = TimeMonitor(data_size=dataset_train.get_dataset_size())" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "9fec8d6b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch: 1 step: 3000, loss is 0.6625426\n", - "epoch: 1 step: 6000, loss is 0.6949122\n", - "epoch time: 300260.071 ms, per step time: 38.446 ms\n", - "epoch: 2 step: 1190, loss is 0.7159466\n", - "epoch: 2 step: 4190, loss is 0.66605085\n", - "epoch: 2 step: 7190, loss is 0.59723616\n", - "epoch time: 227865.086 ms, per step time: 29.176 ms\n", - 
"epoch: 3 step: 2380, loss is 0.63499504\n", - "epoch: 3 step: 5380, loss is 0.6277052\n", - "epoch time: 228694.479 ms, per step time: 29.282 ms\n", - "epoch: 4 step: 570, loss is 0.5825846\n", - "epoch: 4 step: 3570, loss is 0.6407242\n", - "epoch: 4 step: 6570, loss is 0.6360452\n", - "epoch time: 228434.188 ms, per step time: 29.249 ms\n", - "epoch: 5 step: 1760, loss is 0.618327\n", - "epoch: 5 step: 4760, loss is 0.34937367\n", - "epoch: 5 step: 7760, loss is 0.3124014\n", - "epoch time: 228633.055 ms, per step time: 29.274 ms\n", - "epoch: 6 step: 2950, loss is 0.6037954\n", - "epoch: 6 step: 5950, loss is 0.49445567\n", - "epoch time: 227237.265 ms, per step time: 29.096 ms\n", - "epoch: 7 step: 1140, loss is 0.19343969\n", - "epoch: 7 step: 4140, loss is 0.2573592\n", - "epoch: 7 step: 7140, loss is 0.11479706\n", - "epoch time: 228353.065 ms, per step time: 29.239 ms\n", - "epoch: 8 step: 2330, loss is 0.32086658\n", - "epoch: 8 step: 5330, loss is 0.20143133\n", - "epoch time: 228068.243 ms, per step time: 29.202 ms\n", - "epoch: 9 step: 520, loss is 0.081819385\n", - "epoch: 9 step: 3520, loss is 0.3052325\n", - "epoch: 9 step: 6520, loss is 0.40288585\n", - "epoch time: 227941.615 ms, per step time: 29.186 ms\n", - "epoch: 10 step: 1710, loss is 0.012856578\n", - "epoch: 10 step: 4710, loss is 0.06560053\n", - "epoch: 10 step: 7710, loss is 0.07241162\n", - "epoch time: 228342.392 ms, per step time: 29.237 ms\n" - ] - } - ], - "source": [ - "from mindspore import context\n", - "context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target=\"GPU\")\n", - "model.train(10, dataset_train, callbacks=[time_cb, checkpoint_cb, loss_callback], dataset_sink_mode=False)" - ] - }, - { - "cell_type": "markdown", - "id": "0089756b", - "metadata": {}, - "source": [ - "### 9. 
评估模型" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "eb6d15da", - "metadata": {}, - "outputs": [], - "source": [ - "dataset_test = create_dataset(\"./mindrecord\", batch_size=32, num_epochs=10, is_train=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "d89f2d0b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "accuracy:{'acc': 0.813644366197183}\n" - ] - } - ], - "source": [ - "acc = model.eval(dataset_test)\n", - "print(\"accuracy:{}\".format(acc))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a5c1d85a", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "MindSpore", - "language": "python", - "name": "mindspore" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} -- Gitee From e7055004cd4fae19a813698046941f5e42ebf3c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=85=B7=E7=9B=96?= <10061212+wh1051899053@user.noreply.gitee.com> Date: Wed, 1 Dec 2021 07:52:27 +0000 Subject: [PATCH 03/11] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20=E9=87=91=E4=BD=B3?= =?UTF-8?q?=E6=83=A0-=E5=9F=BA=E4=BA=8EMindspore=E7=9A=84=E4=BA=8C?= =?UTF-8?q?=E5=85=83=E6=83=85=E6=84=9F=E5=88=86=E6=9E=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../.keep" | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 "code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/.keep" diff --git "a/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/.keep" "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/.keep" new file mode 100644 index 0000000..e69de29 -- Gitee From e9cd8e678bdea266866bc8dfd775c98aeb2860ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=85=B7=E7=9B=96?= <10061212+wh1051899053@user.noreply.gitee.com> Date: Wed, 1 Dec 2021 07:52:55 +0000 Subject: [PATCH 04/11] =?UTF-8?q?=E7=AC=AC=E4=B8=80=E6=AC=A1=E6=8F=90?= =?UTF-8?q?=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../glove_bilstm.ipynb" | 597 ++++++++++++++++++ .../lstm.py" | 326 ++++++++++ .../readme.md" | 41 ++ 3 files changed, 964 insertions(+) create mode 100644 "code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/glove_bilstm.ipynb" create mode 100644 "code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/lstm.py" create mode 100644 "code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/readme.md" diff --git 
"a/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/glove_bilstm.ipynb" "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/glove_bilstm.ipynb" new file mode 100644 index 0000000..787f9d5 --- /dev/null +++ "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/glove_bilstm.ipynb" @@ -0,0 +1,597 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "fe8db7fb", + "metadata": {}, + "source": [ + "安装包" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e831646d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: http://repo.myhuaweicloud.com/repository/pypi/simple\n", + "Collecting gensim\n", + " Downloading http://repo.myhuaweicloud.com/repository/pypi/packages/9f/44/985c6291f160aca1257dae9b5bb62d91d0f61f12014297a2fa80e6464be1/gensim-4.1.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)\n", + "\u001b[K |████████████████████████████████| 24.1 MB 30.0 MB/s eta 0:00:016 MB 30.0 MB/s eta 0:00:01B 30.0 MB/s eta 0:00:01███████▉| 23.9 MB 30.0 MB/s eta 0:00:01\n", + "\u001b[?25hRequirement already satisfied: scipy>=0.18.1 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages (from gensim) (1.7.2)\n", + "Collecting smart-open>=1.8.1\n", + " Downloading http://repo.myhuaweicloud.com/repository/pypi/packages/cd/11/05f68ea934c24ade38e95ac30a38407767787c4e3db1776eae4886ad8c95/smart_open-5.2.1-py3-none-any.whl (58 kB)\n", + "\u001b[K |████████████████████████████████| 58 kB 26.0 MB/s eta 0:00:01\n", + "\u001b[?25hRequirement already satisfied: numpy>=1.17.0 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages (from gensim) (1.21.4)\n", + "Installing collected packages: smart-open, gensim\n", + "Successfully installed gensim-4.1.2 smart-open-5.2.1\n", + "\u001b[33mWARNING: You are using pip version 21.0.1; however, version 21.3 is available.\n", + "You should consider upgrading via the '/home/ma-user/anaconda3/envs/MindSpore/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" + ] + } + ], + "source": [ + "!pip install gensim" + ] + }, + { + "cell_type": "markdown", + "id": "ad7c54d6", + "metadata": {}, + "source": [ + "导入包" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "44aa6bcd", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import math\n", + "from itertools import chain\n", + "import gensim\n", + "import numpy as np\n", + "from mindspore.mindrecord import FileWriter" + ] + }, + { + "cell_type": "markdown", + "id": "74543d4c", + "metadata": {}, + "source": [ + "### 1. 
读入数据" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "5a2b601c", + "metadata": {}, + "outputs": [], + "source": [ + "# 定义读入数据的函数\n", + "def read_imdb(path, seg='train'):\n", + " labels = ['pos', 'neg']\n", + " data = []\n", + " for label in labels:\n", + " # rf:./aclImdb/seg/label.txt, eg:./aclImdb/train/pos.txt\n", + " with open(os.path.join(path, seg, label) + '.txt', 'r', encoding='utf8') as rf:\n", + " for review in rf.readlines():\n", + " review = review.replace('\\n', '')\n", + " if label == 'pos':\n", + " data.append([review, 1])\n", + " elif label == 'neg':\n", + " data.append([review, 0])\n", + " return data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "df68cfff", + "metadata": {}, + "outputs": [], + "source": [ + "imdb_data_path='./aclImdb'\n", + "raw_data_train = read_imdb(imdb_data_path, seg='train')\n", + "raw_data_test = read_imdb(imdb_data_path, seg='test')" + ] + }, + { + "cell_type": "markdown", + "id": "2204a189", + "metadata": {}, + "source": [ + "### 2. 分词" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c3619ef5", + "metadata": {}, + "outputs": [], + "source": [ + "# 定义分词函数\n", + "def tokenize_samples(raw_data):\n", + " tokenized_data = []\n", + " for review in raw_data:\n", + " tokenized_data.append([tok.lower() for tok in review.split()])\n", + " return tokenized_data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "eaa71af9", + "metadata": {}, + "outputs": [], + "source": [ + "# tokenized_data:[[word,word,word,...],...]\n", + "tokenized_data_train = tokenize_samples([review for review, _ in raw_data_train])\n", + "tokenized_data_test = tokenize_samples([review for review, _ in raw_data_test])" + ] + }, + { + "cell_type": "markdown", + "id": "4d8e86a7", + "metadata": {}, + "source": [ + "### 3. word2idx" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "bb31338d", + "metadata": {}, + "outputs": [], + "source": [ + "# 建立word->index的字典,用作后面将文字转化为数字序列\n", + "vocab = set(chain(*tokenized_data_train))\n", + "word_to_idx = {word: i+1 for i, word in enumerate(vocab)}\n", + "word_to_idx[''] = 0" + ] + }, + { + "cell_type": "markdown", + "id": "51c1ed8e", + "metadata": {}, + "source": [ + "### 4. embedding" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ce4f1a73", + "metadata": {}, + "outputs": [], + "source": [ + "# !sed -i '1i\\400000 100' ./glove.6B.100d.txt\n", + "def load_embeddings(glove_file_path, word_to_idx, embed_size=100):\n", + " word2vector = gensim.models.KeyedVectors.load_word2vec_format(\n", + " glove_file_path, binary=False, encoding='utf-8')\n", + " assert embed_size == word2vector.vector_size\n", + " embeddings = np.zeros((len(word_to_idx), embed_size)).astype(np.float32)\n", + " for word, idx in word_to_idx.items():\n", + " try:\n", + " embeddings[idx, :] = word2vector.get_vector(word)\n", + " except KeyError:\n", + " continue\n", + " return embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "899ce46a", + "metadata": {}, + "outputs": [], + "source": [ + "embeddings = load_embeddings('./glove/glove.6B.100d.txt', word_to_idx)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d3e93498", + "metadata": {}, + "outputs": [], + "source": [ + "np.savetxt(\"./cache/weight.txt\", embeddings)" + ] + }, + { + "cell_type": "markdown", + "id": "b0a6045f", + "metadata": {}, + "source": [ + "### 5. 
encode" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d83aaf7c", + "metadata": {}, + "outputs": [], + "source": [ + "def encode_samples(tokenized_samples, word_to_idx):\n", + " \"\"\"\n", + " tokenized_samples: [[word, word, ...]]\n", + " word_to_idx: {word:idx, word:idx, ...}\n", + " features: [[idx, idx, ...], [idx, idx, ...], ...]\n", + " \"\"\"\n", + " features = []\n", + " for sample in tokenized_samples:\n", + " feature = []\n", + " for token in sample:\n", + " feature.append(word_to_idx.get(token, 0))\n", + " features.append(feature)\n", + " return features" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "fd203f67", + "metadata": {}, + "outputs": [], + "source": [ + "def pad_samples(features, maxlen=500, pad=0):\n", + " padded_features = []\n", + " for feature in features:\n", + " if len(feature) >= maxlen:\n", + " padded_feature = feature[:maxlen]\n", + " else:\n", + " padded_feature = feature\n", + " while len(padded_feature) < maxlen:\n", + " padded_feature.append(pad)\n", + " padded_features.append(padded_feature)\n", + " return padded_features" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "5dae973a", + "metadata": {}, + "outputs": [], + "source": [ + "y_train = np.array([label for _, label in raw_data_train]).astype(np.int32)\n", + "y_test = np.array([label for _, label in raw_data_test]).astype(np.int32)\n", + "\n", + "X_train = np.array(pad_samples(encode_samples(tokenized_data_train, word_to_idx))).astype(np.int32)\n", + "X_test = np.array(pad_samples(encode_samples(tokenized_data_test, word_to_idx))).astype(np.int32)" + ] + }, + { + "cell_type": "markdown", + "id": "324b1291", + "metadata": {}, + "source": [ + "### 6. convert to mindrecord" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "80a0e3b2", + "metadata": {}, + "outputs": [], + "source": [ + "def get_json_data_list(X, y):\n", + " data_list = []\n", + " for i, (feature, label) in enumerate(zip(X, y)):\n", + " data_json = {\"id\": i, \"feature\": feature.reshape(-1), \"label\": int(label)}\n", + " data_list.append(data_json)\n", + " return data_list" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "a9440916", + "metadata": {}, + "outputs": [], + "source": [ + "def convert_np_to_mindrecord(X_train, y_train, X_test, y_test, mindrecord_save_path=\"./mindrecord\"):\n", + " schema_json = {\"id\": {\"type\": \"int32\"},\n", + " \"label\": {\"type\": \"int32\"},\n", + " \"feature\": {\"type\": \"int32\", \"shape\": [-1]}}\n", + " writer = FileWriter(os.path.join(mindrecord_save_path, \"aclImdb_train.mindrecord\"), shard_num=4)\n", + " data_train = get_json_data_list(X_train, y_train)\n", + " writer.add_schema(schema_json, \"nlp_schema\")\n", + " writer.add_index([\"id\", \"label\"])\n", + " writer.write_raw_data(data_train)\n", + " writer.commit()\n", + " \n", + " writer = FileWriter(os.path.join(mindrecord_save_path, \"aclImdb_test.mindrecord\"), shard_num=4)\n", + " data_test = get_json_data_list(X_test, y_test)\n", + " writer.add_schema(schema_json, \"nlp_schema\")\n", + " writer.add_index([\"id\", \"label\"])\n", + " writer.write_raw_data(data_test)\n", + " writer.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "8a3dad17", + "metadata": {}, + "outputs": [], + "source": [ + "convert_np_to_mindrecord(X_train, y_train, X_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "e6952e97", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "aclImdb_test.mindrecord0 aclImdb_train.mindrecord0\n", + "aclImdb_test.mindrecord0.db aclImdb_train.mindrecord0.db\n", + "aclImdb_test.mindrecord1 aclImdb_train.mindrecord1\n", + "aclImdb_test.mindrecord1.db aclImdb_train.mindrecord1.db\n", + "aclImdb_test.mindrecord2 aclImdb_train.mindrecord2\n", + "aclImdb_test.mindrecord2.db aclImdb_train.mindrecord2.db\n", + "aclImdb_test.mindrecord3 aclImdb_train.mindrecord3\n", + "aclImdb_test.mindrecord3.db aclImdb_train.mindrecord3.db\n" + ] + } + ], + "source": [ + "!ls ./mindrecord" + ] + }, + { + "cell_type": "markdown", + "id": "500a2411", + "metadata": {}, + "source": [ + "### 7. 创建数据集" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "9108f4e0", + "metadata": {}, + "outputs": [], + "source": [ + "import mindspore.dataset as mds" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "edc4efdd", + "metadata": {}, + "outputs": [], + "source": [ + "def create_dataset(base_path, batch_size, num_epochs, is_train):\n", + " columns_list = [\"feature\", \"label\"]\n", + " num_consumer = 4\n", + " if is_train:\n", + " path = os.path.join(base_path, \"aclImdb_train.mindrecord0\")\n", + " else:\n", + " path = os.path.join(base_path, \"aclImdb_test.mindrecord0\")\n", + " dataset = mds.MindDataset(path, columns_list=[\"feature\", \"label\"], num_parallel_workers=4)\n", + " dataset = dataset.shuffle(buffer_size=dataset.get_dataset_size())\n", + " dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)\n", + " dataset = dataset.repeat(count=num_epochs)\n", + " return dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "7782b64e", + "metadata": {}, + "outputs": [], + "source": [ + "dataset_train = create_dataset(\"./mindrecord\", batch_size=32, num_epochs=10, is_train=True)" + ] + }, + { + "cell_type": "markdown", + "id": "452b266e", + "metadata": {}, + "source": [ + "### 8. 
定义模型并训练" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "a6f692fd", + "metadata": {}, + "outputs": [], + "source": [ + "from mindspore import Tensor, nn, Model, context, Parameter\n", + "from mindspore.common.initializer import initializer\n", + "from mindspore.ops import operations as P\n", + "from mindspore.nn import Accuracy,LSTM\n", + "from mindspore.train.callback import LossMonitor, CheckpointConfig, ModelCheckpoint, TimeMonitor\n", + "from lstm import SentimentNet" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "50bcb227", + "metadata": {}, + "outputs": [], + "source": [ + "embedding_tabel = np.loadtxt(os.path.join(\"./cache\", \"weight.txt\")).astype(np.float32)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "ad0cec2f", + "metadata": {}, + "outputs": [], + "source": [ + "network = SentimentNet(vocab_size=embedding_tabel.shape[0],\n", + " embed_size=100,\n", + " num_hiddens=100,\n", + " num_layers=2,\n", + " bidirectional=False,\n", + " num_classes=2,\n", + " weight=Tensor(embedding_tabel),\n", + " batch_size=32)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "18e4ad24", + "metadata": {}, + "outputs": [], + "source": [ + "loss = nn.SoftmaxCrossEntropyWithLogits(reduction='mean', sparse=True)\n", + "opt = nn.Momentum(network.trainable_params(), 0.1, 0.9)\n", + "loss_callback = LossMonitor(per_print_times=3000)\n", + "model = Model(network, loss, opt, {'acc': Accuracy()})\n", + "config_ck = CheckpointConfig(save_checkpoint_steps=390, keep_checkpoint_max=10)\n", + "checkpoint_cb = ModelCheckpoint(prefix=\"lstm\", directory=\"./model\", config=config_ck)\n", + "time_cb = TimeMonitor(data_size=dataset_train.get_dataset_size())" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "9fec8d6b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch: 1 step: 3000, loss is 0.6625426\n", + "epoch: 1 step: 6000, loss is 0.6949122\n", + "epoch time: 300260.071 ms, per step time: 38.446 ms\n", + "epoch: 2 step: 1190, loss is 0.7159466\n", + "epoch: 2 step: 4190, loss is 0.66605085\n", + "epoch: 2 step: 7190, loss is 0.59723616\n", + "epoch time: 227865.086 ms, per step time: 29.176 ms\n", + "epoch: 3 step: 2380, loss is 0.63499504\n", + "epoch: 3 step: 5380, loss is 0.6277052\n", + "epoch time: 228694.479 ms, per step time: 29.282 ms\n", + "epoch: 4 step: 570, loss is 0.5825846\n", + "epoch: 4 step: 3570, loss is 0.6407242\n", + "epoch: 4 step: 6570, loss is 0.6360452\n", + "epoch time: 228434.188 ms, per step time: 29.249 ms\n", + "epoch: 5 step: 1760, loss is 0.618327\n", + "epoch: 5 step: 4760, loss is 0.34937367\n", + "epoch: 5 step: 7760, loss is 0.3124014\n", + "epoch time: 228633.055 ms, per step time: 29.274 ms\n", + "epoch: 6 step: 2950, loss is 0.6037954\n", + "epoch: 6 step: 5950, loss is 0.49445567\n", + "epoch time: 227237.265 ms, per step time: 29.096 ms\n", + "epoch: 7 step: 1140, loss is 0.19343969\n", + "epoch: 7 step: 4140, loss is 0.2573592\n", + "epoch: 7 step: 7140, loss is 0.11479706\n", + "epoch time: 228353.065 ms, per step time: 29.239 ms\n", + "epoch: 8 step: 2330, loss is 0.32086658\n", + "epoch: 8 step: 5330, loss is 0.20143133\n", + "epoch time: 228068.243 ms, per step time: 29.202 ms\n", + "epoch: 9 step: 520, loss is 0.081819385\n", + "epoch: 9 step: 3520, loss is 0.3052325\n", + "epoch: 9 step: 6520, loss is 0.40288585\n", + "epoch time: 227941.615 ms, per step time: 29.186 ms\n", + "epoch: 10 step: 1710, loss 
is 0.012856578\n", + "epoch: 10 step: 4710, loss is 0.06560053\n", + "epoch: 10 step: 7710, loss is 0.07241162\n", + "epoch time: 228342.392 ms, per step time: 29.237 ms\n" + ] + } + ], + "source": [ + "from mindspore import context\n", + "context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target=\"GPU\")\n", + "model.train(10, dataset_train, callbacks=[time_cb, checkpoint_cb, loss_callback], dataset_sink_mode=False)" + ] + }, + { + "cell_type": "markdown", + "id": "0089756b", + "metadata": {}, + "source": [ + "### 9. 评估模型" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "eb6d15da", + "metadata": {}, + "outputs": [], + "source": [ + "dataset_test = create_dataset(\"./mindrecord\", batch_size=32, num_epochs=10, is_train=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "d89f2d0b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "accuracy:{'acc': 0.813644366197183}\n" + ] + } + ], + "source": [ + "acc = model.eval(dataset_test)\n", + "print(\"accuracy:{}\".format(acc))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5c1d85a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "MindSpore", + "language": "python", + "name": "mindspore" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git "a/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/lstm.py" "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/lstm.py" new file mode 100644 index 0000000..7cbaecb --- /dev/null +++ "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/lstm.py" @@ -0,0 +1,326 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""LSTM.""" +import math + +import numpy as np + +from mindspore import Tensor, nn, context, Parameter, ParameterTuple +from mindspore.common.initializer import initializer +from mindspore.ops import operations as P +import mindspore.ops.functional as F +import mindspore.common.dtype as mstype + +STACK_LSTM_DEVICE = ["CPU"] + + +# Initialize short-term memory (h) and long-term memory (c) to 0 +def lstm_default_state(batch_size, hidden_size, num_layers, bidirectional): + """init default input.""" + num_directions = 2 if bidirectional else 1 + h = Tensor(np.zeros((num_layers * num_directions, batch_size, hidden_size)).astype(np.float32)) + c = Tensor(np.zeros((num_layers * num_directions, batch_size, hidden_size)).astype(np.float32)) + return h, c + + +def stack_lstm_default_state(batch_size, hidden_size, num_layers, bidirectional): + """init default input.""" + num_directions = 2 if bidirectional else 1 + + h_list = c_list = [] + for _ in range(num_layers): + h_list.append(Tensor(np.zeros((num_directions, batch_size, hidden_size)).astype(np.float32))) + c_list.append(Tensor(np.zeros((num_directions, batch_size, hidden_size)).astype(np.float32))) + h, c = tuple(h_list), tuple(c_list) + return h, c + +def stack_lstm_default_state_ascend(batch_size, hidden_size, num_layers, bidirectional): + """init default input.""" + + h_list = c_list = [] + for _ in range(num_layers): + h_fw = Tensor(np.zeros((1, batch_size, hidden_size)).astype(np.float16)) + c_fw = Tensor(np.zeros((1, batch_size, hidden_size)).astype(np.float16)) + h_i = [h_fw] + c_i = [c_fw] + + if bidirectional: + h_bw = Tensor(np.zeros((1, batch_size, hidden_size)).astype(np.float16)) + c_bw = Tensor(np.zeros((1, batch_size, hidden_size)).astype(np.float16)) + h_i.append(h_bw) + c_i.append(c_bw) + + h_list.append(h_i) + c_list.append(c_i) + + h, c = tuple(h_list), tuple(c_list) + return h, c + + +class StackLSTM(nn.Cell): + """ + Stack multi-layers LSTM together. 
+ """ + + def __init__(self, + input_size, + hidden_size, + num_layers=1, + has_bias=True, + batch_first=False, + dropout=0.0, + bidirectional=False): + super(StackLSTM, self).__init__() + self.num_layers = num_layers + self.batch_first = batch_first + self.transpose = P.Transpose() + + # direction number + num_directions = 2 if bidirectional else 1 + + # input_size list + input_size_list = [input_size] + for i in range(num_layers - 1): + input_size_list.append(hidden_size * num_directions) + + # layers + layers = [] + for i in range(num_layers): + layers.append(nn.LSTMCell(input_size=input_size_list[i], + hidden_size=hidden_size, + has_bias=has_bias, + batch_first=batch_first, + bidirectional=bidirectional, + dropout=dropout)) + + # weights + weights = [] + for i in range(num_layers): + # weight size + weight_size = (input_size_list[i] + hidden_size) * num_directions * hidden_size * 4 + if has_bias: + bias_size = num_directions * hidden_size * 4 + weight_size = weight_size + bias_size + + # numpy weight + stdv = 1 / math.sqrt(hidden_size) + w_np = np.random.uniform(-stdv, stdv, (weight_size, 1, 1)).astype(np.float32) + + # lstm weight + weights.append(Parameter(initializer(Tensor(w_np), w_np.shape), name="weight" + str(i))) + + # + self.lstms = layers + self.weight = ParameterTuple(tuple(weights)) + + def construct(self, x, hx): + """construct""" + if self.batch_first: + x = self.transpose(x, (1, 0, 2)) + # stack lstm + h, c = hx + hn = cn = None + for i in range(self.num_layers): + x, hn, cn, _, _ = self.lstms[i](x, h[i], c[i], self.weight[i]) + if self.batch_first: + x = self.transpose(x, (1, 0, 2)) + return x, (hn, cn) + +class LSTM_Ascend(nn.Cell): + """ LSTM in Ascend. """ + + def __init__(self, bidirectional=False): + super(LSTM_Ascend, self).__init__() + self.bidirectional = bidirectional + self.dynamic_rnn = P.DynamicRNN(forget_bias=0.0) + self.reverseV2 = P.ReverseV2(axis=[0]) + self.concat = P.Concat(2) + + def construct(self, x, h, c, w_f, b_f, w_b=None, b_b=None): + """construct""" + x = F.cast(x, mstype.float16) + if self.bidirectional: + y1, h1, c1, _, _, _, _, _ = self.dynamic_rnn(x, w_f, b_f, None, h[0], c[0]) + r_x = self.reverseV2(x) + y2, h2, c2, _, _, _, _, _ = self.dynamic_rnn(r_x, w_b, b_b, None, h[1], c[1]) + y2 = self.reverseV2(y2) + + output = self.concat((y1, y2)) + hn = self.concat((h1, h2)) + cn = self.concat((c1, c2)) + return output, (hn, cn) + + y1, h1, c1, _, _, _, _, _ = self.dynamic_rnn(x, w_f, b_f, None, h[0], c[0]) + return y1, (h1, c1) + +class StackLSTMAscend(nn.Cell): + """ Stack multi-layers LSTM together. 
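+
+    Forward/backward weights and biases are kept as float32 Parameters per layer, while
+    the inner ``LSTM_Ascend`` cell computes in float16 through ``DynamicRNN``; the output
+    is cast back to float32 in ``construct``.
+
+    The example below is an illustrative sketch only (it assumes an Ascend device and the
+    same toy shapes as the ``StackLSTM`` example above):
+
+        >>> import numpy as np
+        >>> from mindspore import Tensor
+        >>> net = StackLSTMAscend(input_size=100, hidden_size=100, num_layers=2,
+        ...                       has_bias=True, bidirectional=True)
+        >>> h, c = stack_lstm_default_state_ascend(batch_size=64, hidden_size=100,
+        ...                                        num_layers=2, bidirectional=True)
+        >>> x = Tensor(np.zeros((500, 64, 100)).astype(np.float32))
+        >>> output, _ = net(x, (h, c))
+        >>> output.shape
+        (500, 64, 200)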
""" + + def __init__(self, + input_size, + hidden_size, + num_layers=1, + has_bias=True, + batch_first=False, + dropout=0.0, + bidirectional=False): + super(StackLSTMAscend, self).__init__() + self.num_layers = num_layers + self.batch_first = batch_first + self.bidirectional = bidirectional + self.transpose = P.Transpose() + + # input_size list + input_size_list = [input_size] + for i in range(num_layers - 1): + input_size_list.append(hidden_size * 2) + + #weights, bias and layers init + weights_fw = [] + weights_bw = [] + bias_fw = [] + bias_bw = [] + + stdv = 1 / math.sqrt(hidden_size) + for i in range(num_layers): + # forward weight init + w_np_fw = np.random.uniform(-stdv, + stdv, + (input_size_list[i] + hidden_size, hidden_size * 4)).astype(np.float32) + w_fw = Parameter(initializer(Tensor(w_np_fw), w_np_fw.shape), name="w_fw_layer" + str(i)) + weights_fw.append(w_fw) + # forward bias init + if has_bias: + b_fw = np.random.uniform(-stdv, stdv, (hidden_size * 4)).astype(np.float32) + b_fw = Parameter(initializer(Tensor(b_fw), b_fw.shape), name="b_fw_layer" + str(i)) + else: + b_fw = np.zeros((hidden_size * 4)).astype(np.float32) + b_fw = Parameter(initializer(Tensor(b_fw), b_fw.shape), name="b_fw_layer" + str(i)) + bias_fw.append(b_fw) + + if bidirectional: + # backward weight init + w_np_bw = np.random.uniform(-stdv, + stdv, + (input_size_list[i] + hidden_size, hidden_size * 4)).astype(np.float32) + w_bw = Parameter(initializer(Tensor(w_np_bw), w_np_bw.shape), name="w_bw_layer" + str(i)) + weights_bw.append(w_bw) + + # backward bias init + if has_bias: + b_bw = np.random.uniform(-stdv, stdv, (hidden_size * 4)).astype(np.float32) + b_bw = Parameter(initializer(Tensor(b_bw), b_bw.shape), name="b_bw_layer" + str(i)) + else: + b_bw = np.zeros((hidden_size * 4)).astype(np.float32) + b_bw = Parameter(initializer(Tensor(b_bw), b_bw.shape), name="b_bw_layer" + str(i)) + bias_bw.append(b_bw) + + # layer init + self.lstm = LSTM_Ascend(bidirectional=bidirectional).to_float(mstype.float16) + + self.weight_fw = ParameterTuple(tuple(weights_fw)) + self.weight_bw = ParameterTuple(tuple(weights_bw)) + self.bias_fw = ParameterTuple(tuple(bias_fw)) + self.bias_bw = ParameterTuple(tuple(bias_bw)) + + def construct(self, x, hx): + """construct""" + x = F.cast(x, mstype.float16) + if self.batch_first: + x = self.transpose(x, (1, 0, 2)) + # stack lstm + h, c = hx + hn = cn = None + for i in range(self.num_layers): + if self.bidirectional: + x, (hn, cn) = self.lstm(x, + h[i], + c[i], + self.weight_fw[i], + self.bias_fw[i], + self.weight_bw[i], + self.bias_bw[i]) + else: + x, (hn, cn) = self.lstm(x, h[i], c[i], self.weight_fw[i], self.bias_fw[i]) + if self.batch_first: + x = self.transpose(x, (1, 0, 2)) + x = F.cast(x, mstype.float32) + hn = F.cast(x, mstype.float32) + cn = F.cast(x, mstype.float32) + return x, (hn, cn) + +class SentimentNet(nn.Cell): + """Sentiment network structure.""" + + def __init__(self, + vocab_size, + embed_size, + num_hiddens, + num_layers, + bidirectional, + num_classes, + weight, + batch_size): + super(SentimentNet, self).__init__() + # Mapp words to vectors + self.embedding = nn.Embedding(vocab_size, + embed_size, + embedding_table=weight) + self.embedding.embedding_table.requires_grad = False + self.trans = P.Transpose() + self.perm = (1, 0, 2) + + if context.get_context("device_target") in STACK_LSTM_DEVICE: + # stack lstm by user + self.encoder = StackLSTM(input_size=embed_size, + hidden_size=num_hiddens, + num_layers=num_layers, + has_bias=True, + bidirectional=bidirectional, 
+ dropout=0.0) + self.h, self.c = stack_lstm_default_state(batch_size, num_hiddens, num_layers, bidirectional) + elif context.get_context("device_target") == "GPU": + # standard lstm + self.encoder = nn.LSTM(input_size=embed_size, + hidden_size=num_hiddens, + num_layers=num_layers, + has_bias=True, + bidirectional=bidirectional, + dropout=0.0) + self.h, self.c = lstm_default_state(batch_size, num_hiddens, num_layers, bidirectional) + else: + self.encoder = StackLSTMAscend(input_size=embed_size, + hidden_size=num_hiddens, + num_layers=num_layers, + has_bias=True, + bidirectional=bidirectional) + self.h, self.c = stack_lstm_default_state_ascend(batch_size, num_hiddens, num_layers, bidirectional) + + self.concat = P.Concat(1) + self.squeeze = P.Squeeze(axis=0) + if bidirectional: + self.decoder = nn.Dense(num_hiddens * 4, num_classes) + else: + self.decoder = nn.Dense(num_hiddens * 2, num_classes) + + def construct(self, inputs): + # input:(64,500,300) + embeddings = self.embedding(inputs) + embeddings = self.trans(embeddings, self.perm) + output, _ = self.encoder(embeddings, (self.h, self.c)) + # states[i] size(64,200) -> encoding.size(64,400) + encoding = self.concat((self.squeeze(output[0:1:1]), self.squeeze(output[499:500:1]))) + outputs = self.decoder(encoding) + return outputs diff --git "a/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/readme.md" "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/readme.md" new file mode 100644 index 0000000..89bd541 --- /dev/null +++ "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/readme.md" @@ -0,0 +1,41 @@ +# 文件夹总览 + +- sentiment_analysis +   ├── aclImdb +   │ ├── test +   │ │   ├── neg.txt +   │ │   └── pos.txt +   │ ├── train +   │ │   ├── neg.txt +   │ │   └── pos.txt +   │ ├── data_reform.py +   │  └── readme.md +   ├── cache +  │  └── weight.txt +   ├── glove +  │  ├── glove_re_file.py # 用于分解glove文件并上传至华为云 +  │  ├── glove.6B.50d.py +  │  ├── glove.6B.100d.py +  │  ├── glove.6B.200d.py +  │  └── glove.6B.300d.py +   ├── mindrecord +   │  └── 用于存放mindrecord格式数据集文件 +   ├── model +   │  └── 用于存放训练好的模型文件 +   ├── glove_bilstm.ipynb +   ├── lstm.py +   └── readme.md + + +# 环境要求 + +- 本环境要求指,在华为云notebook-mindspore1.2.0-cuda10.1-cudnn7-ubuntu18.04下的配置环境要求 + * gensim-4.1.2 + +# 运行方法 + +- 第一次运行时,需删除mindrecord文件夹下的数据集文件 + * 运行glove_bilstm.ipynb +- 第二次及之后运行时 + * 若未更改glove维度,则可以直接从glove_bilstm.ipynb内【7.创建数据集】开始的步骤开始运行 + * 若有更改glove维度,则参考第一次运行时的步骤,并从头开始运行glove_bilstm.ipynb文件 \ No newline at end of file -- Gitee From d373bae9b5a08bdb007c5fd128ee0c86c01466bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=85=B7=E7=9B=96?= <10061212+wh1051899053@user.noreply.gitee.com> Date: Wed, 1 Dec 2021 07:53:06 +0000 Subject: [PATCH 05/11] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20co?= =?UTF-8?q?de/2021=5Fautumn/=E9=87=91=E4=BD=B3=E6=83=A0-=E5=9F=BA=E4=BA=8E?= =?UTF-8?q?Mindspore=E7=9A=84=E4=BA=8C=E5=85=83=E6=83=85=E6=84=9F=E5=88=86?= =?UTF-8?q?=E6=9E=90/.keep?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../.keep" | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 
"code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/.keep" diff --git "a/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/.keep" "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/.keep" deleted file mode 100644 index e69de29..0000000 -- Gitee From 536dd1980ef767599c9abfc9c190b960ff33a731 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=85=B7=E7=9B=96?= <10061212+wh1051899053@user.noreply.gitee.com> Date: Wed, 1 Dec 2021 07:59:19 +0000 Subject: [PATCH 06/11] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20aclImdb?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../aclImdb/.keep" | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 "code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/aclImdb/.keep" diff --git "a/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/aclImdb/.keep" "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/aclImdb/.keep" new file mode 100644 index 0000000..e69de29 -- Gitee From 8218e1a31f3946cfeae55077b75b34594eaabd71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=85=B7=E7=9B=96?= <10061212+wh1051899053@user.noreply.gitee.com> Date: Wed, 1 Dec 2021 07:59:36 +0000 Subject: [PATCH 07/11] =?UTF-8?q?=E7=AC=AC=E4=B8=80=E6=AC=A1=E6=8F=90?= =?UTF-8?q?=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../aclImdb/data_reform.py" | 23 ++++++++++ .../aclImdb/readme.md" | 44 +++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 "code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/aclImdb/data_reform.py" create mode 100644 "code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/aclImdb/readme.md" diff --git "a/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/aclImdb/data_reform.py" "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/aclImdb/data_reform.py" new file mode 100644 index 0000000..9a06206 --- /dev/null +++ "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/aclImdb/data_reform.py" @@ -0,0 +1,23 @@ +import os + +dir_path = os.path.dirname(os.path.realpath(__file__)) + +def data_reform(path, seg='train'): + + labels = ['pos', 'neg'] + + for label in labels: + + # 重新写入的文件名为dir_path/train/pos.txt + with open(os.path.join(path, seg, label)+'.txt', 
'w', encoding='utf-8') as wf: + files = os.listdir(os.path.join(path, seg, label)) + + # files: dir_path/train/pos + for file in files: + with open(os.path.join(path, seg, label, file), 'r', encoding='utf8') as rf: + wf.write(rf.read() + '\n') + +imdb_data_path = dir_path + +data_reform(imdb_data_path, seg='train') +data_reform(imdb_data_path, seg='test') diff --git "a/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/aclImdb/readme.md" "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/aclImdb/readme.md" new file mode 100644 index 0000000..c5e48ae --- /dev/null +++ "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/aclImdb/readme.md" @@ -0,0 +1,44 @@ + +- 在http://ai.stanford.edu/~amaas/data/sentiment/ 下载得到压缩包aclImdb.tar.gz + +- 解压后文件夹目录如下: + + aclImdb + ├── test + │   ├── neg + │  │  └── txt文件 * 12500 + │   ├── pos + │  │  └── txt文件 * 12500 + │   ├── labeledBow.feat + │   ├── urls_neg.txt + │   └── urls_pos.txt + ├── train + │   ├── neg + │  │  └── txt文件 * 12500 + │   ├── pos + │  │  └── txt文件 * 12500 + │   ├── unsup + │  │  └── txt文件 * 50000 + │   ├── labeledBow.feat + │   ├── unsupBow.feat + │   ├── urls_neg.txt + │   ├── urls_pos.txt + │   └── ulrs_unsup.txt + ├── imdb.vocab + ├── imdbEr.txt + └── README + +- 由于train和test文件夹下的文件数量过多,因此在上传至ModelArts时文件数量受到限制。 +- 因此,在本地执行了data_reform.py文件,将每文件目录下的数据按序整合进同一txt文件中。 + +- 删去无用文件并重整后的文件夹目录如下: + +aclImdb + ├── test + │   ├── neg.txt + │   └── pos.txt + ├── train + │   ├── neg.txt + │   └── pos.txt + ├── data_reform.py + └── readme.md -- Gitee From e25a3c5a83f6db23ad290d0f0e647a6c35d559ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=85=B7=E7=9B=96?= <10061212+wh1051899053@user.noreply.gitee.com> Date: Wed, 1 Dec 2021 08:01:33 +0000 Subject: [PATCH 08/11] =?UTF-8?q?update=20code/2021=5Fautumn/=E9=87=91?= =?UTF-8?q?=E4=BD=B3=E6=83=A0-=E5=9F=BA=E4=BA=8EMindspore=E7=9A=84?= =?UTF-8?q?=E4=BA=8C=E5=85=83=E6=83=85=E6=84=9F=E5=88=86=E6=9E=90/readme.m?= =?UTF-8?q?d.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../readme.md" | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git "a/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/readme.md" "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/readme.md" index 89bd541..974baf3 100644 --- "a/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/readme.md" +++ "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/readme.md" @@ -38,4 +38,16 @@ * 运行glove_bilstm.ipynb - 第二次及之后运行时 * 若未更改glove维度,则可以直接从glove_bilstm.ipynb内【7.创建数据集】开始的步骤开始运行 - * 若有更改glove维度,则参考第一次运行时的步骤,并从头开始运行glove_bilstm.ipynb文件 \ No newline at end of file + * 若有更改glove维度,则参考第一次运行时的步骤,并从头开始运行glove_bilstm.ipynb文件 + +# gitee中需要补足的文件 + 
+- 数据集文件aclImdb + * 下载地址:http://ai.stanford.edu/~amaas/data/sentiment/ + * 下载并解压至aclImdb文件夹中,运行data_reform.py文件,即可得到重整后的数据集文件 +- 词嵌入文件glove + * 下载地址:http://nlp.stanford.edu/data/glove.6B.zip + * 下载后解压存放至glove文件夹下即可 +- 文件夹 + * 文件夹存放位置如上述文件夹总览所示 + * 仍需不足的空文件夹有:cache、mindrecord、model \ No newline at end of file -- Gitee From 2c18b5893d922364546fc2ea244cf9b56e685eb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=85=B7=E7=9B=96?= <10061212+wh1051899053@user.noreply.gitee.com> Date: Wed, 1 Dec 2021 08:01:44 +0000 Subject: [PATCH 09/11] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20glove?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../glove/.keep" | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 "code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/glove/.keep" diff --git "a/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/glove/.keep" "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/glove/.keep" new file mode 100644 index 0000000..e69de29 -- Gitee From 6c025db6e40779c48a6eb20f3aeeff3329421f06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=85=B7=E7=9B=96?= <10061212+wh1051899053@user.noreply.gitee.com> Date: Wed, 1 Dec 2021 08:01:59 +0000 Subject: [PATCH 10/11] =?UTF-8?q?=E7=AC=AC=E4=BA=8C=E6=AC=A1=E4=B8=8A?= =?UTF-8?q?=E4=BC=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../glove/glove_re_file.py" | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 "code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/glove/glove_re_file.py" diff --git "a/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/glove/glove_re_file.py" "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/glove/glove_re_file.py" new file mode 100644 index 0000000..5e5faa9 --- /dev/null +++ "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/glove/glove_re_file.py" @@ -0,0 +1,24 @@ +linecount = 0 +filecount = 0 +with open('glove.6B.100d.txt','r',encoding='utf-8') as f: + g = open('glove.6B.100d_0.txt','a',encoding='utf-8') + for i in f.readlines(): + if linecount>= 50000: + filecount += 1 + g = open('glove.6B.100d_'+str(filecount)+'.txt','a',encoding='utf-8') + linecount = 0 + + g.write(i) + linecount += 1 + +linecount = 0 +with open('new_glove.6B.100d.txt','w',encoding='utf-8') as f: + for i in range(0,filecount+1): + g = open('glove.6B.100d_'+str(i)+'.txt','r',encoding='utf-8') + for line in g.readlines(): + f.write(line) + linecount+=1 + +print(linecount) + + -- Gitee From 50e0abc5c909bb2a7d844b5f97cbefcd50eb61f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=85=B7=E7=9B=96?= <10061212+wh1051899053@user.noreply.gitee.com> Date: Wed, 1 Dec 2021 08:02:40 +0000 Subject: 
[PATCH 11/11] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20co?= =?UTF-8?q?de/2021=5Fautumn/=E9=87=91=E4=BD=B3=E6=83=A0-=E5=9F=BA=E4=BA=8E?= =?UTF-8?q?Mindspore=E7=9A=84=E4=BA=8C=E5=85=83=E6=83=85=E6=84=9F=E5=88=86?= =?UTF-8?q?=E6=9E=90/glove/.keep?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../glove/.keep" | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 "code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/glove/.keep" diff --git "a/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/glove/.keep" "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/glove/.keep" deleted file mode 100644 index e69de29..0000000 -- Gitee
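
For reference, a minimal single-review inference sketch built on the SentimentNet defined
in lstm.py above. It is illustrative only: the hyper-parameters, the checkpoint file name
and the GPU target are assumptions chosen to mirror the training setup in
glove_bilstm.ipynb and must be replaced with the values actually used; ./cache/weight.txt
is the embedding table saved by the notebook.

    import numpy as np
    from mindspore import Tensor, context
    from mindspore.train.serialization import load_checkpoint, load_param_into_net
    from lstm import SentimentNet

    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")

    # Embedding table written by the notebook (shape: vocab_size x embed_size).
    embeddings = np.loadtxt("./cache/weight.txt").astype(np.float32)

    net = SentimentNet(vocab_size=embeddings.shape[0],
                       embed_size=embeddings.shape[1],   # 100 for glove.6B.100d
                       num_hiddens=100,                  # assumed training setting
                       num_layers=2,                     # assumed training setting
                       bidirectional=True,
                       num_classes=2,
                       weight=Tensor(embeddings),
                       batch_size=1)
    # Assumed checkpoint name; use whatever file the training callbacks wrote under ./model.
    load_param_into_net(net, load_checkpoint("./model/lstm.ckpt"))
    net.set_train(False)

    # SentimentNet expects a (batch_size, 500) int32 index sequence; a zero-padded dummy
    # sequence is enough to demonstrate the forward pass.
    review_idx = Tensor(np.zeros((1, 500), dtype=np.int32))
    logits = net(review_idx)
    print(logits.shape)   # (1, 2): one score per sentiment class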