From 3e6f6f85b4d68ffd987d794bccf1aa08fe752e85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=85=B7=E7=9B=96?= <10061212+wh1051899053@user.noreply.gitee.com> Date: Wed, 1 Dec 2021 07:51:09 +0000 Subject: [PATCH 01/11] =?UTF-8?q?=E7=AC=AC=E4=B8=80=E6=AC=A1=E6=8F=90?= =?UTF-8?q?=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- glove_bilstm.ipynb | 597 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 597 insertions(+) create mode 100644 glove_bilstm.ipynb diff --git a/glove_bilstm.ipynb b/glove_bilstm.ipynb new file mode 100644 index 0000000..787f9d5 --- /dev/null +++ b/glove_bilstm.ipynb @@ -0,0 +1,597 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "fe8db7fb", + "metadata": {}, + "source": [ + "安装包" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e831646d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: http://repo.myhuaweicloud.com/repository/pypi/simple\n", + "Collecting gensim\n", + " Downloading http://repo.myhuaweicloud.com/repository/pypi/packages/9f/44/985c6291f160aca1257dae9b5bb62d91d0f61f12014297a2fa80e6464be1/gensim-4.1.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)\n", + "\u001b[K |████████████████████████████████| 24.1 MB 30.0 MB/s eta 0:00:016 MB 30.0 MB/s eta 0:00:01B 30.0 MB/s eta 0:00:01███████▉| 23.9 MB 30.0 MB/s eta 0:00:01\n", + "\u001b[?25hRequirement already satisfied: scipy>=0.18.1 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages (from gensim) (1.7.2)\n", + "Collecting smart-open>=1.8.1\n", + " Downloading http://repo.myhuaweicloud.com/repository/pypi/packages/cd/11/05f68ea934c24ade38e95ac30a38407767787c4e3db1776eae4886ad8c95/smart_open-5.2.1-py3-none-any.whl (58 kB)\n", + "\u001b[K |████████████████████████████████| 58 kB 26.0 MB/s eta 0:00:01\n", + "\u001b[?25hRequirement already satisfied: numpy>=1.17.0 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages (from gensim) (1.21.4)\n", + "Installing collected packages: smart-open, gensim\n", + "Successfully installed gensim-4.1.2 smart-open-5.2.1\n", + "\u001b[33mWARNING: You are using pip version 21.0.1; however, version 21.3 is available.\n", + "You should consider upgrading via the '/home/ma-user/anaconda3/envs/MindSpore/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" + ] + } + ], + "source": [ + "!pip install gensim" + ] + }, + { + "cell_type": "markdown", + "id": "ad7c54d6", + "metadata": {}, + "source": [ + "导入包" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "44aa6bcd", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import math\n", + "from itertools import chain\n", + "import gensim\n", + "import numpy as np\n", + "from mindspore.mindrecord import FileWriter" + ] + }, + { + "cell_type": "markdown", + "id": "74543d4c", + "metadata": {}, + "source": [ + "### 1. 
读入数据" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "5a2b601c", + "metadata": {}, + "outputs": [], + "source": [ + "# 定义读入数据的函数\n", + "def read_imdb(path, seg='train'):\n", + " labels = ['pos', 'neg']\n", + " data = []\n", + " for label in labels:\n", + " # rf:./aclImdb/seg/label.txt, eg:./aclImdb/train/pos.txt\n", + " with open(os.path.join(path, seg, label) + '.txt', 'r', encoding='utf8') as rf:\n", + " for review in rf.readlines():\n", + " review = review.replace('\\n', '')\n", + " if label == 'pos':\n", + " data.append([review, 1])\n", + " elif label == 'neg':\n", + " data.append([review, 0])\n", + " return data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "df68cfff", + "metadata": {}, + "outputs": [], + "source": [ + "imdb_data_path='./aclImdb'\n", + "raw_data_train = read_imdb(imdb_data_path, seg='train')\n", + "raw_data_test = read_imdb(imdb_data_path, seg='test')" + ] + }, + { + "cell_type": "markdown", + "id": "2204a189", + "metadata": {}, + "source": [ + "### 2. 分词" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c3619ef5", + "metadata": {}, + "outputs": [], + "source": [ + "# 定义分词函数\n", + "def tokenize_samples(raw_data):\n", + " tokenized_data = []\n", + " for review in raw_data:\n", + " tokenized_data.append([tok.lower() for tok in review.split()])\n", + " return tokenized_data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "eaa71af9", + "metadata": {}, + "outputs": [], + "source": [ + "# tokenized_data:[[word,word,word,...],...]\n", + "tokenized_data_train = tokenize_samples([review for review, _ in raw_data_train])\n", + "tokenized_data_test = tokenize_samples([review for review, _ in raw_data_test])" + ] + }, + { + "cell_type": "markdown", + "id": "4d8e86a7", + "metadata": {}, + "source": [ + "### 3. word2idx" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "bb31338d", + "metadata": {}, + "outputs": [], + "source": [ + "# 建立word->index的字典,用作后面将文字转化为数字序列\n", + "vocab = set(chain(*tokenized_data_train))\n", + "word_to_idx = {word: i+1 for i, word in enumerate(vocab)}\n", + "word_to_idx[''] = 0" + ] + }, + { + "cell_type": "markdown", + "id": "51c1ed8e", + "metadata": {}, + "source": [ + "### 4. embedding" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ce4f1a73", + "metadata": {}, + "outputs": [], + "source": [ + "# !sed -i '1i\\400000 100' ./glove.6B.100d.txt\n", + "def load_embeddings(glove_file_path, word_to_idx, embed_size=100):\n", + " word2vector = gensim.models.KeyedVectors.load_word2vec_format(\n", + " glove_file_path, binary=False, encoding='utf-8')\n", + " assert embed_size == word2vector.vector_size\n", + " embeddings = np.zeros((len(word_to_idx), embed_size)).astype(np.float32)\n", + " for word, idx in word_to_idx.items():\n", + " try:\n", + " embeddings[idx, :] = word2vector.get_vector(word)\n", + " except KeyError:\n", + " continue\n", + " return embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "899ce46a", + "metadata": {}, + "outputs": [], + "source": [ + "embeddings = load_embeddings('./glove/glove.6B.100d.txt', word_to_idx)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d3e93498", + "metadata": {}, + "outputs": [], + "source": [ + "np.savetxt(\"./cache/weight.txt\", embeddings)" + ] + }, + { + "cell_type": "markdown", + "id": "b0a6045f", + "metadata": {}, + "source": [ + "### 5. 
encode" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d83aaf7c", + "metadata": {}, + "outputs": [], + "source": [ + "def encode_samples(tokenized_samples, word_to_idx):\n", + " \"\"\"\n", + " tokenized_samples: [[word, word, ...]]\n", + " word_to_idx: {word:idx, word:idx, ...}\n", + " features: [[idx, idx, ...], [idx, idx, ...], ...]\n", + " \"\"\"\n", + " features = []\n", + " for sample in tokenized_samples:\n", + " feature = []\n", + " for token in sample:\n", + " feature.append(word_to_idx.get(token, 0))\n", + " features.append(feature)\n", + " return features" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "fd203f67", + "metadata": {}, + "outputs": [], + "source": [ + "def pad_samples(features, maxlen=500, pad=0):\n", + " padded_features = []\n", + " for feature in features:\n", + " if len(feature) >= maxlen:\n", + " padded_feature = feature[:maxlen]\n", + " else:\n", + " padded_feature = feature\n", + " while len(padded_feature) < maxlen:\n", + " padded_feature.append(pad)\n", + " padded_features.append(padded_feature)\n", + " return padded_features" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "5dae973a", + "metadata": {}, + "outputs": [], + "source": [ + "y_train = np.array([label for _, label in raw_data_train]).astype(np.int32)\n", + "y_test = np.array([label for _, label in raw_data_test]).astype(np.int32)\n", + "\n", + "X_train = np.array(pad_samples(encode_samples(tokenized_data_train, word_to_idx))).astype(np.int32)\n", + "X_test = np.array(pad_samples(encode_samples(tokenized_data_test, word_to_idx))).astype(np.int32)" + ] + }, + { + "cell_type": "markdown", + "id": "324b1291", + "metadata": {}, + "source": [ + "### 6. convert to mindrecord" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "80a0e3b2", + "metadata": {}, + "outputs": [], + "source": [ + "def get_json_data_list(X, y):\n", + " data_list = []\n", + " for i, (feature, label) in enumerate(zip(X, y)):\n", + " data_json = {\"id\": i, \"feature\": feature.reshape(-1), \"label\": int(label)}\n", + " data_list.append(data_json)\n", + " return data_list" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "a9440916", + "metadata": {}, + "outputs": [], + "source": [ + "def convert_np_to_mindrecord(X_train, y_train, X_test, y_test, mindrecord_save_path=\"./mindrecord\"):\n", + " schema_json = {\"id\": {\"type\": \"int32\"},\n", + " \"label\": {\"type\": \"int32\"},\n", + " \"feature\": {\"type\": \"int32\", \"shape\": [-1]}}\n", + " writer = FileWriter(os.path.join(mindrecord_save_path, \"aclImdb_train.mindrecord\"), shard_num=4)\n", + " data_train = get_json_data_list(X_train, y_train)\n", + " writer.add_schema(schema_json, \"nlp_schema\")\n", + " writer.add_index([\"id\", \"label\"])\n", + " writer.write_raw_data(data_train)\n", + " writer.commit()\n", + " \n", + " writer = FileWriter(os.path.join(mindrecord_save_path, \"aclImdb_test.mindrecord\"), shard_num=4)\n", + " data_test = get_json_data_list(X_test, y_test)\n", + " writer.add_schema(schema_json, \"nlp_schema\")\n", + " writer.add_index([\"id\", \"label\"])\n", + " writer.write_raw_data(data_test)\n", + " writer.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "8a3dad17", + "metadata": {}, + "outputs": [], + "source": [ + "convert_np_to_mindrecord(X_train, y_train, X_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "e6952e97", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "aclImdb_test.mindrecord0 aclImdb_train.mindrecord0\n", + "aclImdb_test.mindrecord0.db aclImdb_train.mindrecord0.db\n", + "aclImdb_test.mindrecord1 aclImdb_train.mindrecord1\n", + "aclImdb_test.mindrecord1.db aclImdb_train.mindrecord1.db\n", + "aclImdb_test.mindrecord2 aclImdb_train.mindrecord2\n", + "aclImdb_test.mindrecord2.db aclImdb_train.mindrecord2.db\n", + "aclImdb_test.mindrecord3 aclImdb_train.mindrecord3\n", + "aclImdb_test.mindrecord3.db aclImdb_train.mindrecord3.db\n" + ] + } + ], + "source": [ + "!ls ./mindrecord" + ] + }, + { + "cell_type": "markdown", + "id": "500a2411", + "metadata": {}, + "source": [ + "### 7. 创建数据集" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "9108f4e0", + "metadata": {}, + "outputs": [], + "source": [ + "import mindspore.dataset as mds" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "edc4efdd", + "metadata": {}, + "outputs": [], + "source": [ + "def create_dataset(base_path, batch_size, num_epochs, is_train):\n", + " columns_list = [\"feature\", \"label\"]\n", + " num_consumer = 4\n", + " if is_train:\n", + " path = os.path.join(base_path, \"aclImdb_train.mindrecord0\")\n", + " else:\n", + " path = os.path.join(base_path, \"aclImdb_test.mindrecord0\")\n", + " dataset = mds.MindDataset(path, columns_list=[\"feature\", \"label\"], num_parallel_workers=4)\n", + " dataset = dataset.shuffle(buffer_size=dataset.get_dataset_size())\n", + " dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)\n", + " dataset = dataset.repeat(count=num_epochs)\n", + " return dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "7782b64e", + "metadata": {}, + "outputs": [], + "source": [ + "dataset_train = create_dataset(\"./mindrecord\", batch_size=32, num_epochs=10, is_train=True)" + ] + }, + { + "cell_type": "markdown", + "id": "452b266e", + "metadata": {}, + "source": [ + "### 8. 
定义模型并训练" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "a6f692fd", + "metadata": {}, + "outputs": [], + "source": [ + "from mindspore import Tensor, nn, Model, context, Parameter\n", + "from mindspore.common.initializer import initializer\n", + "from mindspore.ops import operations as P\n", + "from mindspore.nn import Accuracy,LSTM\n", + "from mindspore.train.callback import LossMonitor, CheckpointConfig, ModelCheckpoint, TimeMonitor\n", + "from lstm import SentimentNet" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "50bcb227", + "metadata": {}, + "outputs": [], + "source": [ + "embedding_tabel = np.loadtxt(os.path.join(\"./cache\", \"weight.txt\")).astype(np.float32)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "ad0cec2f", + "metadata": {}, + "outputs": [], + "source": [ + "network = SentimentNet(vocab_size=embedding_tabel.shape[0],\n", + " embed_size=100,\n", + " num_hiddens=100,\n", + " num_layers=2,\n", + " bidirectional=False,\n", + " num_classes=2,\n", + " weight=Tensor(embedding_tabel),\n", + " batch_size=32)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "18e4ad24", + "metadata": {}, + "outputs": [], + "source": [ + "loss = nn.SoftmaxCrossEntropyWithLogits(reduction='mean', sparse=True)\n", + "opt = nn.Momentum(network.trainable_params(), 0.1, 0.9)\n", + "loss_callback = LossMonitor(per_print_times=3000)\n", + "model = Model(network, loss, opt, {'acc': Accuracy()})\n", + "config_ck = CheckpointConfig(save_checkpoint_steps=390, keep_checkpoint_max=10)\n", + "checkpoint_cb = ModelCheckpoint(prefix=\"lstm\", directory=\"./model\", config=config_ck)\n", + "time_cb = TimeMonitor(data_size=dataset_train.get_dataset_size())" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "9fec8d6b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch: 1 step: 3000, loss is 0.6625426\n", + "epoch: 1 step: 6000, loss is 0.6949122\n", + "epoch time: 300260.071 ms, per step time: 38.446 ms\n", + "epoch: 2 step: 1190, loss is 0.7159466\n", + "epoch: 2 step: 4190, loss is 0.66605085\n", + "epoch: 2 step: 7190, loss is 0.59723616\n", + "epoch time: 227865.086 ms, per step time: 29.176 ms\n", + "epoch: 3 step: 2380, loss is 0.63499504\n", + "epoch: 3 step: 5380, loss is 0.6277052\n", + "epoch time: 228694.479 ms, per step time: 29.282 ms\n", + "epoch: 4 step: 570, loss is 0.5825846\n", + "epoch: 4 step: 3570, loss is 0.6407242\n", + "epoch: 4 step: 6570, loss is 0.6360452\n", + "epoch time: 228434.188 ms, per step time: 29.249 ms\n", + "epoch: 5 step: 1760, loss is 0.618327\n", + "epoch: 5 step: 4760, loss is 0.34937367\n", + "epoch: 5 step: 7760, loss is 0.3124014\n", + "epoch time: 228633.055 ms, per step time: 29.274 ms\n", + "epoch: 6 step: 2950, loss is 0.6037954\n", + "epoch: 6 step: 5950, loss is 0.49445567\n", + "epoch time: 227237.265 ms, per step time: 29.096 ms\n", + "epoch: 7 step: 1140, loss is 0.19343969\n", + "epoch: 7 step: 4140, loss is 0.2573592\n", + "epoch: 7 step: 7140, loss is 0.11479706\n", + "epoch time: 228353.065 ms, per step time: 29.239 ms\n", + "epoch: 8 step: 2330, loss is 0.32086658\n", + "epoch: 8 step: 5330, loss is 0.20143133\n", + "epoch time: 228068.243 ms, per step time: 29.202 ms\n", + "epoch: 9 step: 520, loss is 0.081819385\n", + "epoch: 9 step: 3520, loss is 0.3052325\n", + "epoch: 9 step: 6520, loss is 0.40288585\n", + "epoch time: 227941.615 ms, per step time: 29.186 ms\n", + "epoch: 10 step: 1710, loss 
is 0.012856578\n", + "epoch: 10 step: 4710, loss is 0.06560053\n", + "epoch: 10 step: 7710, loss is 0.07241162\n", + "epoch time: 228342.392 ms, per step time: 29.237 ms\n" + ] + } + ], + "source": [ + "from mindspore import context\n", + "context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target=\"GPU\")\n", + "model.train(10, dataset_train, callbacks=[time_cb, checkpoint_cb, loss_callback], dataset_sink_mode=False)" + ] + }, + { + "cell_type": "markdown", + "id": "0089756b", + "metadata": {}, + "source": [ + "### 9. 评估模型" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "eb6d15da", + "metadata": {}, + "outputs": [], + "source": [ + "dataset_test = create_dataset(\"./mindrecord\", batch_size=32, num_epochs=10, is_train=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "d89f2d0b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "accuracy:{'acc': 0.813644366197183}\n" + ] + } + ], + "source": [ + "acc = model.eval(dataset_test)\n", + "print(\"accuracy:{}\".format(acc))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5c1d85a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "MindSpore", + "language": "python", + "name": "mindspore" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} -- Gitee From e9bff42ecde06576fa51c112bce8f73936f3acb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=85=B7=E7=9B=96?= <10061212+wh1051899053@user.noreply.gitee.com> Date: Wed, 1 Dec 2021 07:51:29 +0000 Subject: [PATCH 02/11] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20gl?= =?UTF-8?q?ove=5Fbilstm.ipynb?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- glove_bilstm.ipynb | 597 --------------------------------------------- 1 file changed, 597 deletions(-) delete mode 100644 glove_bilstm.ipynb diff --git a/glove_bilstm.ipynb b/glove_bilstm.ipynb deleted file mode 100644 index 787f9d5..0000000 --- a/glove_bilstm.ipynb +++ /dev/null @@ -1,597 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "fe8db7fb", - "metadata": {}, - "source": [ - "安装包" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "e831646d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Looking in indexes: http://repo.myhuaweicloud.com/repository/pypi/simple\n", - "Collecting gensim\n", - " Downloading http://repo.myhuaweicloud.com/repository/pypi/packages/9f/44/985c6291f160aca1257dae9b5bb62d91d0f61f12014297a2fa80e6464be1/gensim-4.1.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)\n", - "\u001b[K |████████████████████████████████| 24.1 MB 30.0 MB/s eta 0:00:016 MB 30.0 MB/s eta 0:00:01B 30.0 MB/s eta 0:00:01███████▉| 23.9 MB 30.0 MB/s eta 0:00:01\n", - "\u001b[?25hRequirement already satisfied: scipy>=0.18.1 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages (from gensim) (1.7.2)\n", - "Collecting smart-open>=1.8.1\n", - " Downloading http://repo.myhuaweicloud.com/repository/pypi/packages/cd/11/05f68ea934c24ade38e95ac30a38407767787c4e3db1776eae4886ad8c95/smart_open-5.2.1-py3-none-any.whl (58 kB)\n", - "\u001b[K 
|████████████████████████████████| 58 kB 26.0 MB/s eta 0:00:01\n", - "\u001b[?25hRequirement already satisfied: numpy>=1.17.0 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages (from gensim) (1.21.4)\n", - "Installing collected packages: smart-open, gensim\n", - "Successfully installed gensim-4.1.2 smart-open-5.2.1\n", - "\u001b[33mWARNING: You are using pip version 21.0.1; however, version 21.3 is available.\n", - "You should consider upgrading via the '/home/ma-user/anaconda3/envs/MindSpore/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" - ] - } - ], - "source": [ - "!pip install gensim" - ] - }, - { - "cell_type": "markdown", - "id": "ad7c54d6", - "metadata": {}, - "source": [ - "导入包" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "44aa6bcd", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import math\n", - "from itertools import chain\n", - "import gensim\n", - "import numpy as np\n", - "from mindspore.mindrecord import FileWriter" - ] - }, - { - "cell_type": "markdown", - "id": "74543d4c", - "metadata": {}, - "source": [ - "### 1. 读入数据" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "5a2b601c", - "metadata": {}, - "outputs": [], - "source": [ - "# 定义读入数据的函数\n", - "def read_imdb(path, seg='train'):\n", - " labels = ['pos', 'neg']\n", - " data = []\n", - " for label in labels:\n", - " # rf:./aclImdb/seg/label.txt, eg:./aclImdb/train/pos.txt\n", - " with open(os.path.join(path, seg, label) + '.txt', 'r', encoding='utf8') as rf:\n", - " for review in rf.readlines():\n", - " review = review.replace('\\n', '')\n", - " if label == 'pos':\n", - " data.append([review, 1])\n", - " elif label == 'neg':\n", - " data.append([review, 0])\n", - " return data" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "df68cfff", - "metadata": {}, - "outputs": [], - "source": [ - "imdb_data_path='./aclImdb'\n", - "raw_data_train = read_imdb(imdb_data_path, seg='train')\n", - "raw_data_test = read_imdb(imdb_data_path, seg='test')" - ] - }, - { - "cell_type": "markdown", - "id": "2204a189", - "metadata": {}, - "source": [ - "### 2. 分词" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "c3619ef5", - "metadata": {}, - "outputs": [], - "source": [ - "# 定义分词函数\n", - "def tokenize_samples(raw_data):\n", - " tokenized_data = []\n", - " for review in raw_data:\n", - " tokenized_data.append([tok.lower() for tok in review.split()])\n", - " return tokenized_data" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "eaa71af9", - "metadata": {}, - "outputs": [], - "source": [ - "# tokenized_data:[[word,word,word,...],...]\n", - "tokenized_data_train = tokenize_samples([review for review, _ in raw_data_train])\n", - "tokenized_data_test = tokenize_samples([review for review, _ in raw_data_test])" - ] - }, - { - "cell_type": "markdown", - "id": "4d8e86a7", - "metadata": {}, - "source": [ - "### 3. word2idx" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "bb31338d", - "metadata": {}, - "outputs": [], - "source": [ - "# 建立word->index的字典,用作后面将文字转化为数字序列\n", - "vocab = set(chain(*tokenized_data_train))\n", - "word_to_idx = {word: i+1 for i, word in enumerate(vocab)}\n", - "word_to_idx[''] = 0" - ] - }, - { - "cell_type": "markdown", - "id": "51c1ed8e", - "metadata": {}, - "source": [ - "### 4. 
embedding" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "ce4f1a73", - "metadata": {}, - "outputs": [], - "source": [ - "# !sed -i '1i\\400000 100' ./glove.6B.100d.txt\n", - "def load_embeddings(glove_file_path, word_to_idx, embed_size=100):\n", - " word2vector = gensim.models.KeyedVectors.load_word2vec_format(\n", - " glove_file_path, binary=False, encoding='utf-8')\n", - " assert embed_size == word2vector.vector_size\n", - " embeddings = np.zeros((len(word_to_idx), embed_size)).astype(np.float32)\n", - " for word, idx in word_to_idx.items():\n", - " try:\n", - " embeddings[idx, :] = word2vector.get_vector(word)\n", - " except KeyError:\n", - " continue\n", - " return embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "899ce46a", - "metadata": {}, - "outputs": [], - "source": [ - "embeddings = load_embeddings('./glove/glove.6B.100d.txt', word_to_idx)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "d3e93498", - "metadata": {}, - "outputs": [], - "source": [ - "np.savetxt(\"./cache/weight.txt\", embeddings)" - ] - }, - { - "cell_type": "markdown", - "id": "b0a6045f", - "metadata": {}, - "source": [ - "### 5. encode" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "d83aaf7c", - "metadata": {}, - "outputs": [], - "source": [ - "def encode_samples(tokenized_samples, word_to_idx):\n", - " \"\"\"\n", - " tokenized_samples: [[word, word, ...]]\n", - " word_to_idx: {word:idx, word:idx, ...}\n", - " features: [[idx, idx, ...], [idx, idx, ...], ...]\n", - " \"\"\"\n", - " features = []\n", - " for sample in tokenized_samples:\n", - " feature = []\n", - " for token in sample:\n", - " feature.append(word_to_idx.get(token, 0))\n", - " features.append(feature)\n", - " return features" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "fd203f67", - "metadata": {}, - "outputs": [], - "source": [ - "def pad_samples(features, maxlen=500, pad=0):\n", - " padded_features = []\n", - " for feature in features:\n", - " if len(feature) >= maxlen:\n", - " padded_feature = feature[:maxlen]\n", - " else:\n", - " padded_feature = feature\n", - " while len(padded_feature) < maxlen:\n", - " padded_feature.append(pad)\n", - " padded_features.append(padded_feature)\n", - " return padded_features" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "5dae973a", - "metadata": {}, - "outputs": [], - "source": [ - "y_train = np.array([label for _, label in raw_data_train]).astype(np.int32)\n", - "y_test = np.array([label for _, label in raw_data_test]).astype(np.int32)\n", - "\n", - "X_train = np.array(pad_samples(encode_samples(tokenized_data_train, word_to_idx))).astype(np.int32)\n", - "X_test = np.array(pad_samples(encode_samples(tokenized_data_test, word_to_idx))).astype(np.int32)" - ] - }, - { - "cell_type": "markdown", - "id": "324b1291", - "metadata": {}, - "source": [ - "### 6. 
convert to mindrecord" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "80a0e3b2", - "metadata": {}, - "outputs": [], - "source": [ - "def get_json_data_list(X, y):\n", - " data_list = []\n", - " for i, (feature, label) in enumerate(zip(X, y)):\n", - " data_json = {\"id\": i, \"feature\": feature.reshape(-1), \"label\": int(label)}\n", - " data_list.append(data_json)\n", - " return data_list" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "a9440916", - "metadata": {}, - "outputs": [], - "source": [ - "def convert_np_to_mindrecord(X_train, y_train, X_test, y_test, mindrecord_save_path=\"./mindrecord\"):\n", - " schema_json = {\"id\": {\"type\": \"int32\"},\n", - " \"label\": {\"type\": \"int32\"},\n", - " \"feature\": {\"type\": \"int32\", \"shape\": [-1]}}\n", - " writer = FileWriter(os.path.join(mindrecord_save_path, \"aclImdb_train.mindrecord\"), shard_num=4)\n", - " data_train = get_json_data_list(X_train, y_train)\n", - " writer.add_schema(schema_json, \"nlp_schema\")\n", - " writer.add_index([\"id\", \"label\"])\n", - " writer.write_raw_data(data_train)\n", - " writer.commit()\n", - " \n", - " writer = FileWriter(os.path.join(mindrecord_save_path, \"aclImdb_test.mindrecord\"), shard_num=4)\n", - " data_test = get_json_data_list(X_test, y_test)\n", - " writer.add_schema(schema_json, \"nlp_schema\")\n", - " writer.add_index([\"id\", \"label\"])\n", - " writer.write_raw_data(data_test)\n", - " writer.commit()" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "8a3dad17", - "metadata": {}, - "outputs": [], - "source": [ - "convert_np_to_mindrecord(X_train, y_train, X_test, y_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "e6952e97", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "aclImdb_test.mindrecord0 aclImdb_train.mindrecord0\n", - "aclImdb_test.mindrecord0.db aclImdb_train.mindrecord0.db\n", - "aclImdb_test.mindrecord1 aclImdb_train.mindrecord1\n", - "aclImdb_test.mindrecord1.db aclImdb_train.mindrecord1.db\n", - "aclImdb_test.mindrecord2 aclImdb_train.mindrecord2\n", - "aclImdb_test.mindrecord2.db aclImdb_train.mindrecord2.db\n", - "aclImdb_test.mindrecord3 aclImdb_train.mindrecord3\n", - "aclImdb_test.mindrecord3.db aclImdb_train.mindrecord3.db\n" - ] - } - ], - "source": [ - "!ls ./mindrecord" - ] - }, - { - "cell_type": "markdown", - "id": "500a2411", - "metadata": {}, - "source": [ - "### 7. 
创建数据集" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "9108f4e0", - "metadata": {}, - "outputs": [], - "source": [ - "import mindspore.dataset as mds" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "edc4efdd", - "metadata": {}, - "outputs": [], - "source": [ - "def create_dataset(base_path, batch_size, num_epochs, is_train):\n", - " columns_list = [\"feature\", \"label\"]\n", - " num_consumer = 4\n", - " if is_train:\n", - " path = os.path.join(base_path, \"aclImdb_train.mindrecord0\")\n", - " else:\n", - " path = os.path.join(base_path, \"aclImdb_test.mindrecord0\")\n", - " dataset = mds.MindDataset(path, columns_list=[\"feature\", \"label\"], num_parallel_workers=4)\n", - " dataset = dataset.shuffle(buffer_size=dataset.get_dataset_size())\n", - " dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)\n", - " dataset = dataset.repeat(count=num_epochs)\n", - " return dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "7782b64e", - "metadata": {}, - "outputs": [], - "source": [ - "dataset_train = create_dataset(\"./mindrecord\", batch_size=32, num_epochs=10, is_train=True)" - ] - }, - { - "cell_type": "markdown", - "id": "452b266e", - "metadata": {}, - "source": [ - "### 8. 定义模型并训练" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "a6f692fd", - "metadata": {}, - "outputs": [], - "source": [ - "from mindspore import Tensor, nn, Model, context, Parameter\n", - "from mindspore.common.initializer import initializer\n", - "from mindspore.ops import operations as P\n", - "from mindspore.nn import Accuracy,LSTM\n", - "from mindspore.train.callback import LossMonitor, CheckpointConfig, ModelCheckpoint, TimeMonitor\n", - "from lstm import SentimentNet" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "50bcb227", - "metadata": {}, - "outputs": [], - "source": [ - "embedding_tabel = np.loadtxt(os.path.join(\"./cache\", \"weight.txt\")).astype(np.float32)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "ad0cec2f", - "metadata": {}, - "outputs": [], - "source": [ - "network = SentimentNet(vocab_size=embedding_tabel.shape[0],\n", - " embed_size=100,\n", - " num_hiddens=100,\n", - " num_layers=2,\n", - " bidirectional=False,\n", - " num_classes=2,\n", - " weight=Tensor(embedding_tabel),\n", - " batch_size=32)" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "18e4ad24", - "metadata": {}, - "outputs": [], - "source": [ - "loss = nn.SoftmaxCrossEntropyWithLogits(reduction='mean', sparse=True)\n", - "opt = nn.Momentum(network.trainable_params(), 0.1, 0.9)\n", - "loss_callback = LossMonitor(per_print_times=3000)\n", - "model = Model(network, loss, opt, {'acc': Accuracy()})\n", - "config_ck = CheckpointConfig(save_checkpoint_steps=390, keep_checkpoint_max=10)\n", - "checkpoint_cb = ModelCheckpoint(prefix=\"lstm\", directory=\"./model\", config=config_ck)\n", - "time_cb = TimeMonitor(data_size=dataset_train.get_dataset_size())" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "9fec8d6b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch: 1 step: 3000, loss is 0.6625426\n", - "epoch: 1 step: 6000, loss is 0.6949122\n", - "epoch time: 300260.071 ms, per step time: 38.446 ms\n", - "epoch: 2 step: 1190, loss is 0.7159466\n", - "epoch: 2 step: 4190, loss is 0.66605085\n", - "epoch: 2 step: 7190, loss is 0.59723616\n", - "epoch time: 227865.086 ms, per step time: 29.176 ms\n", - 
"epoch: 3 step: 2380, loss is 0.63499504\n", - "epoch: 3 step: 5380, loss is 0.6277052\n", - "epoch time: 228694.479 ms, per step time: 29.282 ms\n", - "epoch: 4 step: 570, loss is 0.5825846\n", - "epoch: 4 step: 3570, loss is 0.6407242\n", - "epoch: 4 step: 6570, loss is 0.6360452\n", - "epoch time: 228434.188 ms, per step time: 29.249 ms\n", - "epoch: 5 step: 1760, loss is 0.618327\n", - "epoch: 5 step: 4760, loss is 0.34937367\n", - "epoch: 5 step: 7760, loss is 0.3124014\n", - "epoch time: 228633.055 ms, per step time: 29.274 ms\n", - "epoch: 6 step: 2950, loss is 0.6037954\n", - "epoch: 6 step: 5950, loss is 0.49445567\n", - "epoch time: 227237.265 ms, per step time: 29.096 ms\n", - "epoch: 7 step: 1140, loss is 0.19343969\n", - "epoch: 7 step: 4140, loss is 0.2573592\n", - "epoch: 7 step: 7140, loss is 0.11479706\n", - "epoch time: 228353.065 ms, per step time: 29.239 ms\n", - "epoch: 8 step: 2330, loss is 0.32086658\n", - "epoch: 8 step: 5330, loss is 0.20143133\n", - "epoch time: 228068.243 ms, per step time: 29.202 ms\n", - "epoch: 9 step: 520, loss is 0.081819385\n", - "epoch: 9 step: 3520, loss is 0.3052325\n", - "epoch: 9 step: 6520, loss is 0.40288585\n", - "epoch time: 227941.615 ms, per step time: 29.186 ms\n", - "epoch: 10 step: 1710, loss is 0.012856578\n", - "epoch: 10 step: 4710, loss is 0.06560053\n", - "epoch: 10 step: 7710, loss is 0.07241162\n", - "epoch time: 228342.392 ms, per step time: 29.237 ms\n" - ] - } - ], - "source": [ - "from mindspore import context\n", - "context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target=\"GPU\")\n", - "model.train(10, dataset_train, callbacks=[time_cb, checkpoint_cb, loss_callback], dataset_sink_mode=False)" - ] - }, - { - "cell_type": "markdown", - "id": "0089756b", - "metadata": {}, - "source": [ - "### 9. 
评估模型" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "eb6d15da", - "metadata": {}, - "outputs": [], - "source": [ - "dataset_test = create_dataset(\"./mindrecord\", batch_size=32, num_epochs=10, is_train=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "d89f2d0b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "accuracy:{'acc': 0.813644366197183}\n" - ] - } - ], - "source": [ - "acc = model.eval(dataset_test)\n", - "print(\"accuracy:{}\".format(acc))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a5c1d85a", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "MindSpore", - "language": "python", - "name": "mindspore" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} -- Gitee From e7055004cd4fae19a813698046941f5e42ebf3c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=85=B7=E7=9B=96?= <10061212+wh1051899053@user.noreply.gitee.com> Date: Wed, 1 Dec 2021 07:52:27 +0000 Subject: [PATCH 03/11] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20=E9=87=91=E4=BD=B3?= =?UTF-8?q?=E6=83=A0-=E5=9F=BA=E4=BA=8EMindspore=E7=9A=84=E4=BA=8C?= =?UTF-8?q?=E5=85=83=E6=83=85=E6=84=9F=E5=88=86=E6=9E=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../.keep" | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 "code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/.keep" diff --git "a/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/.keep" "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/.keep" new file mode 100644 index 0000000..e69de29 -- Gitee From e9cd8e678bdea266866bc8dfd775c98aeb2860ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=85=B7=E7=9B=96?= <10061212+wh1051899053@user.noreply.gitee.com> Date: Wed, 1 Dec 2021 07:52:55 +0000 Subject: [PATCH 04/11] =?UTF-8?q?=E7=AC=AC=E4=B8=80=E6=AC=A1=E6=8F=90?= =?UTF-8?q?=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../glove_bilstm.ipynb" | 597 ++++++++++++++++++ .../lstm.py" | 326 ++++++++++ .../readme.md" | 41 ++ 3 files changed, 964 insertions(+) create mode 100644 "code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/glove_bilstm.ipynb" create mode 100644 "code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/lstm.py" create mode 100644 "code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/readme.md" diff --git 
"a/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/glove_bilstm.ipynb" "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/glove_bilstm.ipynb" new file mode 100644 index 0000000..787f9d5 --- /dev/null +++ "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/glove_bilstm.ipynb" @@ -0,0 +1,597 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "fe8db7fb", + "metadata": {}, + "source": [ + "安装包" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e831646d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: http://repo.myhuaweicloud.com/repository/pypi/simple\n", + "Collecting gensim\n", + " Downloading http://repo.myhuaweicloud.com/repository/pypi/packages/9f/44/985c6291f160aca1257dae9b5bb62d91d0f61f12014297a2fa80e6464be1/gensim-4.1.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)\n", + "\u001b[K |████████████████████████████████| 24.1 MB 30.0 MB/s eta 0:00:016 MB 30.0 MB/s eta 0:00:01B 30.0 MB/s eta 0:00:01███████▉| 23.9 MB 30.0 MB/s eta 0:00:01\n", + "\u001b[?25hRequirement already satisfied: scipy>=0.18.1 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages (from gensim) (1.7.2)\n", + "Collecting smart-open>=1.8.1\n", + " Downloading http://repo.myhuaweicloud.com/repository/pypi/packages/cd/11/05f68ea934c24ade38e95ac30a38407767787c4e3db1776eae4886ad8c95/smart_open-5.2.1-py3-none-any.whl (58 kB)\n", + "\u001b[K |████████████████████████████████| 58 kB 26.0 MB/s eta 0:00:01\n", + "\u001b[?25hRequirement already satisfied: numpy>=1.17.0 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages (from gensim) (1.21.4)\n", + "Installing collected packages: smart-open, gensim\n", + "Successfully installed gensim-4.1.2 smart-open-5.2.1\n", + "\u001b[33mWARNING: You are using pip version 21.0.1; however, version 21.3 is available.\n", + "You should consider upgrading via the '/home/ma-user/anaconda3/envs/MindSpore/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" + ] + } + ], + "source": [ + "!pip install gensim" + ] + }, + { + "cell_type": "markdown", + "id": "ad7c54d6", + "metadata": {}, + "source": [ + "导入包" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "44aa6bcd", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import math\n", + "from itertools import chain\n", + "import gensim\n", + "import numpy as np\n", + "from mindspore.mindrecord import FileWriter" + ] + }, + { + "cell_type": "markdown", + "id": "74543d4c", + "metadata": {}, + "source": [ + "### 1. 
读入数据" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "5a2b601c", + "metadata": {}, + "outputs": [], + "source": [ + "# 定义读入数据的函数\n", + "def read_imdb(path, seg='train'):\n", + " labels = ['pos', 'neg']\n", + " data = []\n", + " for label in labels:\n", + " # rf:./aclImdb/seg/label.txt, eg:./aclImdb/train/pos.txt\n", + " with open(os.path.join(path, seg, label) + '.txt', 'r', encoding='utf8') as rf:\n", + " for review in rf.readlines():\n", + " review = review.replace('\\n', '')\n", + " if label == 'pos':\n", + " data.append([review, 1])\n", + " elif label == 'neg':\n", + " data.append([review, 0])\n", + " return data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "df68cfff", + "metadata": {}, + "outputs": [], + "source": [ + "imdb_data_path='./aclImdb'\n", + "raw_data_train = read_imdb(imdb_data_path, seg='train')\n", + "raw_data_test = read_imdb(imdb_data_path, seg='test')" + ] + }, + { + "cell_type": "markdown", + "id": "2204a189", + "metadata": {}, + "source": [ + "### 2. 分词" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c3619ef5", + "metadata": {}, + "outputs": [], + "source": [ + "# 定义分词函数\n", + "def tokenize_samples(raw_data):\n", + " tokenized_data = []\n", + " for review in raw_data:\n", + " tokenized_data.append([tok.lower() for tok in review.split()])\n", + " return tokenized_data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "eaa71af9", + "metadata": {}, + "outputs": [], + "source": [ + "# tokenized_data:[[word,word,word,...],...]\n", + "tokenized_data_train = tokenize_samples([review for review, _ in raw_data_train])\n", + "tokenized_data_test = tokenize_samples([review for review, _ in raw_data_test])" + ] + }, + { + "cell_type": "markdown", + "id": "4d8e86a7", + "metadata": {}, + "source": [ + "### 3. word2idx" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "bb31338d", + "metadata": {}, + "outputs": [], + "source": [ + "# 建立word->index的字典,用作后面将文字转化为数字序列\n", + "vocab = set(chain(*tokenized_data_train))\n", + "word_to_idx = {word: i+1 for i, word in enumerate(vocab)}\n", + "word_to_idx[''] = 0" + ] + }, + { + "cell_type": "markdown", + "id": "51c1ed8e", + "metadata": {}, + "source": [ + "### 4. embedding" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ce4f1a73", + "metadata": {}, + "outputs": [], + "source": [ + "# !sed -i '1i\\400000 100' ./glove.6B.100d.txt\n", + "def load_embeddings(glove_file_path, word_to_idx, embed_size=100):\n", + " word2vector = gensim.models.KeyedVectors.load_word2vec_format(\n", + " glove_file_path, binary=False, encoding='utf-8')\n", + " assert embed_size == word2vector.vector_size\n", + " embeddings = np.zeros((len(word_to_idx), embed_size)).astype(np.float32)\n", + " for word, idx in word_to_idx.items():\n", + " try:\n", + " embeddings[idx, :] = word2vector.get_vector(word)\n", + " except KeyError:\n", + " continue\n", + " return embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "899ce46a", + "metadata": {}, + "outputs": [], + "source": [ + "embeddings = load_embeddings('./glove/glove.6B.100d.txt', word_to_idx)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d3e93498", + "metadata": {}, + "outputs": [], + "source": [ + "np.savetxt(\"./cache/weight.txt\", embeddings)" + ] + }, + { + "cell_type": "markdown", + "id": "b0a6045f", + "metadata": {}, + "source": [ + "### 5. 
encode" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d83aaf7c", + "metadata": {}, + "outputs": [], + "source": [ + "def encode_samples(tokenized_samples, word_to_idx):\n", + " \"\"\"\n", + " tokenized_samples: [[word, word, ...]]\n", + " word_to_idx: {word:idx, word:idx, ...}\n", + " features: [[idx, idx, ...], [idx, idx, ...], ...]\n", + " \"\"\"\n", + " features = []\n", + " for sample in tokenized_samples:\n", + " feature = []\n", + " for token in sample:\n", + " feature.append(word_to_idx.get(token, 0))\n", + " features.append(feature)\n", + " return features" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "fd203f67", + "metadata": {}, + "outputs": [], + "source": [ + "def pad_samples(features, maxlen=500, pad=0):\n", + " padded_features = []\n", + " for feature in features:\n", + " if len(feature) >= maxlen:\n", + " padded_feature = feature[:maxlen]\n", + " else:\n", + " padded_feature = feature\n", + " while len(padded_feature) < maxlen:\n", + " padded_feature.append(pad)\n", + " padded_features.append(padded_feature)\n", + " return padded_features" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "5dae973a", + "metadata": {}, + "outputs": [], + "source": [ + "y_train = np.array([label for _, label in raw_data_train]).astype(np.int32)\n", + "y_test = np.array([label for _, label in raw_data_test]).astype(np.int32)\n", + "\n", + "X_train = np.array(pad_samples(encode_samples(tokenized_data_train, word_to_idx))).astype(np.int32)\n", + "X_test = np.array(pad_samples(encode_samples(tokenized_data_test, word_to_idx))).astype(np.int32)" + ] + }, + { + "cell_type": "markdown", + "id": "324b1291", + "metadata": {}, + "source": [ + "### 6. convert to mindrecord" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "80a0e3b2", + "metadata": {}, + "outputs": [], + "source": [ + "def get_json_data_list(X, y):\n", + " data_list = []\n", + " for i, (feature, label) in enumerate(zip(X, y)):\n", + " data_json = {\"id\": i, \"feature\": feature.reshape(-1), \"label\": int(label)}\n", + " data_list.append(data_json)\n", + " return data_list" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "a9440916", + "metadata": {}, + "outputs": [], + "source": [ + "def convert_np_to_mindrecord(X_train, y_train, X_test, y_test, mindrecord_save_path=\"./mindrecord\"):\n", + " schema_json = {\"id\": {\"type\": \"int32\"},\n", + " \"label\": {\"type\": \"int32\"},\n", + " \"feature\": {\"type\": \"int32\", \"shape\": [-1]}}\n", + " writer = FileWriter(os.path.join(mindrecord_save_path, \"aclImdb_train.mindrecord\"), shard_num=4)\n", + " data_train = get_json_data_list(X_train, y_train)\n", + " writer.add_schema(schema_json, \"nlp_schema\")\n", + " writer.add_index([\"id\", \"label\"])\n", + " writer.write_raw_data(data_train)\n", + " writer.commit()\n", + " \n", + " writer = FileWriter(os.path.join(mindrecord_save_path, \"aclImdb_test.mindrecord\"), shard_num=4)\n", + " data_test = get_json_data_list(X_test, y_test)\n", + " writer.add_schema(schema_json, \"nlp_schema\")\n", + " writer.add_index([\"id\", \"label\"])\n", + " writer.write_raw_data(data_test)\n", + " writer.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "8a3dad17", + "metadata": {}, + "outputs": [], + "source": [ + "convert_np_to_mindrecord(X_train, y_train, X_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "e6952e97", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "aclImdb_test.mindrecord0 aclImdb_train.mindrecord0\n", + "aclImdb_test.mindrecord0.db aclImdb_train.mindrecord0.db\n", + "aclImdb_test.mindrecord1 aclImdb_train.mindrecord1\n", + "aclImdb_test.mindrecord1.db aclImdb_train.mindrecord1.db\n", + "aclImdb_test.mindrecord2 aclImdb_train.mindrecord2\n", + "aclImdb_test.mindrecord2.db aclImdb_train.mindrecord2.db\n", + "aclImdb_test.mindrecord3 aclImdb_train.mindrecord3\n", + "aclImdb_test.mindrecord3.db aclImdb_train.mindrecord3.db\n" + ] + } + ], + "source": [ + "!ls ./mindrecord" + ] + }, + { + "cell_type": "markdown", + "id": "500a2411", + "metadata": {}, + "source": [ + "### 7. 创建数据集" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "9108f4e0", + "metadata": {}, + "outputs": [], + "source": [ + "import mindspore.dataset as mds" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "edc4efdd", + "metadata": {}, + "outputs": [], + "source": [ + "def create_dataset(base_path, batch_size, num_epochs, is_train):\n", + " columns_list = [\"feature\", \"label\"]\n", + " num_consumer = 4\n", + " if is_train:\n", + " path = os.path.join(base_path, \"aclImdb_train.mindrecord0\")\n", + " else:\n", + " path = os.path.join(base_path, \"aclImdb_test.mindrecord0\")\n", + " dataset = mds.MindDataset(path, columns_list=[\"feature\", \"label\"], num_parallel_workers=4)\n", + " dataset = dataset.shuffle(buffer_size=dataset.get_dataset_size())\n", + " dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)\n", + " dataset = dataset.repeat(count=num_epochs)\n", + " return dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "7782b64e", + "metadata": {}, + "outputs": [], + "source": [ + "dataset_train = create_dataset(\"./mindrecord\", batch_size=32, num_epochs=10, is_train=True)" + ] + }, + { + "cell_type": "markdown", + "id": "452b266e", + "metadata": {}, + "source": [ + "### 8. 
定义模型并训练" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "a6f692fd", + "metadata": {}, + "outputs": [], + "source": [ + "from mindspore import Tensor, nn, Model, context, Parameter\n", + "from mindspore.common.initializer import initializer\n", + "from mindspore.ops import operations as P\n", + "from mindspore.nn import Accuracy,LSTM\n", + "from mindspore.train.callback import LossMonitor, CheckpointConfig, ModelCheckpoint, TimeMonitor\n", + "from lstm import SentimentNet" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "50bcb227", + "metadata": {}, + "outputs": [], + "source": [ + "embedding_tabel = np.loadtxt(os.path.join(\"./cache\", \"weight.txt\")).astype(np.float32)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "ad0cec2f", + "metadata": {}, + "outputs": [], + "source": [ + "network = SentimentNet(vocab_size=embedding_tabel.shape[0],\n", + " embed_size=100,\n", + " num_hiddens=100,\n", + " num_layers=2,\n", + " bidirectional=False,\n", + " num_classes=2,\n", + " weight=Tensor(embedding_tabel),\n", + " batch_size=32)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "18e4ad24", + "metadata": {}, + "outputs": [], + "source": [ + "loss = nn.SoftmaxCrossEntropyWithLogits(reduction='mean', sparse=True)\n", + "opt = nn.Momentum(network.trainable_params(), 0.1, 0.9)\n", + "loss_callback = LossMonitor(per_print_times=3000)\n", + "model = Model(network, loss, opt, {'acc': Accuracy()})\n", + "config_ck = CheckpointConfig(save_checkpoint_steps=390, keep_checkpoint_max=10)\n", + "checkpoint_cb = ModelCheckpoint(prefix=\"lstm\", directory=\"./model\", config=config_ck)\n", + "time_cb = TimeMonitor(data_size=dataset_train.get_dataset_size())" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "9fec8d6b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch: 1 step: 3000, loss is 0.6625426\n", + "epoch: 1 step: 6000, loss is 0.6949122\n", + "epoch time: 300260.071 ms, per step time: 38.446 ms\n", + "epoch: 2 step: 1190, loss is 0.7159466\n", + "epoch: 2 step: 4190, loss is 0.66605085\n", + "epoch: 2 step: 7190, loss is 0.59723616\n", + "epoch time: 227865.086 ms, per step time: 29.176 ms\n", + "epoch: 3 step: 2380, loss is 0.63499504\n", + "epoch: 3 step: 5380, loss is 0.6277052\n", + "epoch time: 228694.479 ms, per step time: 29.282 ms\n", + "epoch: 4 step: 570, loss is 0.5825846\n", + "epoch: 4 step: 3570, loss is 0.6407242\n", + "epoch: 4 step: 6570, loss is 0.6360452\n", + "epoch time: 228434.188 ms, per step time: 29.249 ms\n", + "epoch: 5 step: 1760, loss is 0.618327\n", + "epoch: 5 step: 4760, loss is 0.34937367\n", + "epoch: 5 step: 7760, loss is 0.3124014\n", + "epoch time: 228633.055 ms, per step time: 29.274 ms\n", + "epoch: 6 step: 2950, loss is 0.6037954\n", + "epoch: 6 step: 5950, loss is 0.49445567\n", + "epoch time: 227237.265 ms, per step time: 29.096 ms\n", + "epoch: 7 step: 1140, loss is 0.19343969\n", + "epoch: 7 step: 4140, loss is 0.2573592\n", + "epoch: 7 step: 7140, loss is 0.11479706\n", + "epoch time: 228353.065 ms, per step time: 29.239 ms\n", + "epoch: 8 step: 2330, loss is 0.32086658\n", + "epoch: 8 step: 5330, loss is 0.20143133\n", + "epoch time: 228068.243 ms, per step time: 29.202 ms\n", + "epoch: 9 step: 520, loss is 0.081819385\n", + "epoch: 9 step: 3520, loss is 0.3052325\n", + "epoch: 9 step: 6520, loss is 0.40288585\n", + "epoch time: 227941.615 ms, per step time: 29.186 ms\n", + "epoch: 10 step: 1710, loss 
is 0.012856578\n", + "epoch: 10 step: 4710, loss is 0.06560053\n", + "epoch: 10 step: 7710, loss is 0.07241162\n", + "epoch time: 228342.392 ms, per step time: 29.237 ms\n" + ] + } + ], + "source": [ + "from mindspore import context\n", + "context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target=\"GPU\")\n", + "model.train(10, dataset_train, callbacks=[time_cb, checkpoint_cb, loss_callback], dataset_sink_mode=False)" + ] + }, + { + "cell_type": "markdown", + "id": "0089756b", + "metadata": {}, + "source": [ + "### 9. 评估模型" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "eb6d15da", + "metadata": {}, + "outputs": [], + "source": [ + "dataset_test = create_dataset(\"./mindrecord\", batch_size=32, num_epochs=10, is_train=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "d89f2d0b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "accuracy:{'acc': 0.813644366197183}\n" + ] + } + ], + "source": [ + "acc = model.eval(dataset_test)\n", + "print(\"accuracy:{}\".format(acc))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5c1d85a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "MindSpore", + "language": "python", + "name": "mindspore" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git "a/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/lstm.py" "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/lstm.py" new file mode 100644 index 0000000..7cbaecb --- /dev/null +++ "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/lstm.py" @@ -0,0 +1,326 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""LSTM.""" +import math + +import numpy as np + +from mindspore import Tensor, nn, context, Parameter, ParameterTuple +from mindspore.common.initializer import initializer +from mindspore.ops import operations as P +import mindspore.ops.functional as F +import mindspore.common.dtype as mstype + +STACK_LSTM_DEVICE = ["CPU"] + + +# Initialize short-term memory (h) and long-term memory (c) to 0 +def lstm_default_state(batch_size, hidden_size, num_layers, bidirectional): + """init default input.""" + num_directions = 2 if bidirectional else 1 + h = Tensor(np.zeros((num_layers * num_directions, batch_size, hidden_size)).astype(np.float32)) + c = Tensor(np.zeros((num_layers * num_directions, batch_size, hidden_size)).astype(np.float32)) + return h, c + + +def stack_lstm_default_state(batch_size, hidden_size, num_layers, bidirectional): + """init default input.""" + num_directions = 2 if bidirectional else 1 + + h_list = c_list = [] + for _ in range(num_layers): + h_list.append(Tensor(np.zeros((num_directions, batch_size, hidden_size)).astype(np.float32))) + c_list.append(Tensor(np.zeros((num_directions, batch_size, hidden_size)).astype(np.float32))) + h, c = tuple(h_list), tuple(c_list) + return h, c + +def stack_lstm_default_state_ascend(batch_size, hidden_size, num_layers, bidirectional): + """init default input.""" + + h_list = c_list = [] + for _ in range(num_layers): + h_fw = Tensor(np.zeros((1, batch_size, hidden_size)).astype(np.float16)) + c_fw = Tensor(np.zeros((1, batch_size, hidden_size)).astype(np.float16)) + h_i = [h_fw] + c_i = [c_fw] + + if bidirectional: + h_bw = Tensor(np.zeros((1, batch_size, hidden_size)).astype(np.float16)) + c_bw = Tensor(np.zeros((1, batch_size, hidden_size)).astype(np.float16)) + h_i.append(h_bw) + c_i.append(c_bw) + + h_list.append(h_i) + c_list.append(c_i) + + h, c = tuple(h_list), tuple(c_list) + return h, c + + +class StackLSTM(nn.Cell): + """ + Stack multi-layers LSTM together. 
+ """ + + def __init__(self, + input_size, + hidden_size, + num_layers=1, + has_bias=True, + batch_first=False, + dropout=0.0, + bidirectional=False): + super(StackLSTM, self).__init__() + self.num_layers = num_layers + self.batch_first = batch_first + self.transpose = P.Transpose() + + # direction number + num_directions = 2 if bidirectional else 1 + + # input_size list + input_size_list = [input_size] + for i in range(num_layers - 1): + input_size_list.append(hidden_size * num_directions) + + # layers + layers = [] + for i in range(num_layers): + layers.append(nn.LSTMCell(input_size=input_size_list[i], + hidden_size=hidden_size, + has_bias=has_bias, + batch_first=batch_first, + bidirectional=bidirectional, + dropout=dropout)) + + # weights + weights = [] + for i in range(num_layers): + # weight size + weight_size = (input_size_list[i] + hidden_size) * num_directions * hidden_size * 4 + if has_bias: + bias_size = num_directions * hidden_size * 4 + weight_size = weight_size + bias_size + + # numpy weight + stdv = 1 / math.sqrt(hidden_size) + w_np = np.random.uniform(-stdv, stdv, (weight_size, 1, 1)).astype(np.float32) + + # lstm weight + weights.append(Parameter(initializer(Tensor(w_np), w_np.shape), name="weight" + str(i))) + + # + self.lstms = layers + self.weight = ParameterTuple(tuple(weights)) + + def construct(self, x, hx): + """construct""" + if self.batch_first: + x = self.transpose(x, (1, 0, 2)) + # stack lstm + h, c = hx + hn = cn = None + for i in range(self.num_layers): + x, hn, cn, _, _ = self.lstms[i](x, h[i], c[i], self.weight[i]) + if self.batch_first: + x = self.transpose(x, (1, 0, 2)) + return x, (hn, cn) + +class LSTM_Ascend(nn.Cell): + """ LSTM in Ascend. """ + + def __init__(self, bidirectional=False): + super(LSTM_Ascend, self).__init__() + self.bidirectional = bidirectional + self.dynamic_rnn = P.DynamicRNN(forget_bias=0.0) + self.reverseV2 = P.ReverseV2(axis=[0]) + self.concat = P.Concat(2) + + def construct(self, x, h, c, w_f, b_f, w_b=None, b_b=None): + """construct""" + x = F.cast(x, mstype.float16) + if self.bidirectional: + y1, h1, c1, _, _, _, _, _ = self.dynamic_rnn(x, w_f, b_f, None, h[0], c[0]) + r_x = self.reverseV2(x) + y2, h2, c2, _, _, _, _, _ = self.dynamic_rnn(r_x, w_b, b_b, None, h[1], c[1]) + y2 = self.reverseV2(y2) + + output = self.concat((y1, y2)) + hn = self.concat((h1, h2)) + cn = self.concat((c1, c2)) + return output, (hn, cn) + + y1, h1, c1, _, _, _, _, _ = self.dynamic_rnn(x, w_f, b_f, None, h[0], c[0]) + return y1, (h1, c1) + +class StackLSTMAscend(nn.Cell): + """ Stack multi-layers LSTM together. 
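+
+    Forward/backward weights and biases are kept as float32 Parameters per layer, while
+    the inner ``LSTM_Ascend`` cell computes in float16 through ``DynamicRNN``; the output
+    is cast back to float32 in ``construct``.
+
+    The example below is an illustrative sketch only (it assumes an Ascend device and the
+    same toy shapes as the ``StackLSTM`` example above):
+
+        >>> import numpy as np
+        >>> from mindspore import Tensor
+        >>> net = StackLSTMAscend(input_size=100, hidden_size=100, num_layers=2,
+        ...                       has_bias=True, bidirectional=True)
+        >>> h, c = stack_lstm_default_state_ascend(batch_size=64, hidden_size=100,
+        ...                                        num_layers=2, bidirectional=True)
+        >>> x = Tensor(np.zeros((500, 64, 100)).astype(np.float32))
+        >>> output, _ = net(x, (h, c))
+        >>> output.shape
+        (500, 64, 200)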
""" + + def __init__(self, + input_size, + hidden_size, + num_layers=1, + has_bias=True, + batch_first=False, + dropout=0.0, + bidirectional=False): + super(StackLSTMAscend, self).__init__() + self.num_layers = num_layers + self.batch_first = batch_first + self.bidirectional = bidirectional + self.transpose = P.Transpose() + + # input_size list + input_size_list = [input_size] + for i in range(num_layers - 1): + input_size_list.append(hidden_size * 2) + + #weights, bias and layers init + weights_fw = [] + weights_bw = [] + bias_fw = [] + bias_bw = [] + + stdv = 1 / math.sqrt(hidden_size) + for i in range(num_layers): + # forward weight init + w_np_fw = np.random.uniform(-stdv, + stdv, + (input_size_list[i] + hidden_size, hidden_size * 4)).astype(np.float32) + w_fw = Parameter(initializer(Tensor(w_np_fw), w_np_fw.shape), name="w_fw_layer" + str(i)) + weights_fw.append(w_fw) + # forward bias init + if has_bias: + b_fw = np.random.uniform(-stdv, stdv, (hidden_size * 4)).astype(np.float32) + b_fw = Parameter(initializer(Tensor(b_fw), b_fw.shape), name="b_fw_layer" + str(i)) + else: + b_fw = np.zeros((hidden_size * 4)).astype(np.float32) + b_fw = Parameter(initializer(Tensor(b_fw), b_fw.shape), name="b_fw_layer" + str(i)) + bias_fw.append(b_fw) + + if bidirectional: + # backward weight init + w_np_bw = np.random.uniform(-stdv, + stdv, + (input_size_list[i] + hidden_size, hidden_size * 4)).astype(np.float32) + w_bw = Parameter(initializer(Tensor(w_np_bw), w_np_bw.shape), name="w_bw_layer" + str(i)) + weights_bw.append(w_bw) + + # backward bias init + if has_bias: + b_bw = np.random.uniform(-stdv, stdv, (hidden_size * 4)).astype(np.float32) + b_bw = Parameter(initializer(Tensor(b_bw), b_bw.shape), name="b_bw_layer" + str(i)) + else: + b_bw = np.zeros((hidden_size * 4)).astype(np.float32) + b_bw = Parameter(initializer(Tensor(b_bw), b_bw.shape), name="b_bw_layer" + str(i)) + bias_bw.append(b_bw) + + # layer init + self.lstm = LSTM_Ascend(bidirectional=bidirectional).to_float(mstype.float16) + + self.weight_fw = ParameterTuple(tuple(weights_fw)) + self.weight_bw = ParameterTuple(tuple(weights_bw)) + self.bias_fw = ParameterTuple(tuple(bias_fw)) + self.bias_bw = ParameterTuple(tuple(bias_bw)) + + def construct(self, x, hx): + """construct""" + x = F.cast(x, mstype.float16) + if self.batch_first: + x = self.transpose(x, (1, 0, 2)) + # stack lstm + h, c = hx + hn = cn = None + for i in range(self.num_layers): + if self.bidirectional: + x, (hn, cn) = self.lstm(x, + h[i], + c[i], + self.weight_fw[i], + self.bias_fw[i], + self.weight_bw[i], + self.bias_bw[i]) + else: + x, (hn, cn) = self.lstm(x, h[i], c[i], self.weight_fw[i], self.bias_fw[i]) + if self.batch_first: + x = self.transpose(x, (1, 0, 2)) + x = F.cast(x, mstype.float32) + hn = F.cast(x, mstype.float32) + cn = F.cast(x, mstype.float32) + return x, (hn, cn) + +class SentimentNet(nn.Cell): + """Sentiment network structure.""" + + def __init__(self, + vocab_size, + embed_size, + num_hiddens, + num_layers, + bidirectional, + num_classes, + weight, + batch_size): + super(SentimentNet, self).__init__() + # Mapp words to vectors + self.embedding = nn.Embedding(vocab_size, + embed_size, + embedding_table=weight) + self.embedding.embedding_table.requires_grad = False + self.trans = P.Transpose() + self.perm = (1, 0, 2) + + if context.get_context("device_target") in STACK_LSTM_DEVICE: + # stack lstm by user + self.encoder = StackLSTM(input_size=embed_size, + hidden_size=num_hiddens, + num_layers=num_layers, + has_bias=True, + bidirectional=bidirectional, 
+ dropout=0.0) + self.h, self.c = stack_lstm_default_state(batch_size, num_hiddens, num_layers, bidirectional) + elif context.get_context("device_target") == "GPU": + # standard lstm + self.encoder = nn.LSTM(input_size=embed_size, + hidden_size=num_hiddens, + num_layers=num_layers, + has_bias=True, + bidirectional=bidirectional, + dropout=0.0) + self.h, self.c = lstm_default_state(batch_size, num_hiddens, num_layers, bidirectional) + else: + self.encoder = StackLSTMAscend(input_size=embed_size, + hidden_size=num_hiddens, + num_layers=num_layers, + has_bias=True, + bidirectional=bidirectional) + self.h, self.c = stack_lstm_default_state_ascend(batch_size, num_hiddens, num_layers, bidirectional) + + self.concat = P.Concat(1) + self.squeeze = P.Squeeze(axis=0) + if bidirectional: + self.decoder = nn.Dense(num_hiddens * 4, num_classes) + else: + self.decoder = nn.Dense(num_hiddens * 2, num_classes) + + def construct(self, inputs): + # input:(64,500,300) + embeddings = self.embedding(inputs) + embeddings = self.trans(embeddings, self.perm) + output, _ = self.encoder(embeddings, (self.h, self.c)) + # states[i] size(64,200) -> encoding.size(64,400) + encoding = self.concat((self.squeeze(output[0:1:1]), self.squeeze(output[499:500:1]))) + outputs = self.decoder(encoding) + return outputs diff --git "a/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/readme.md" "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/readme.md" new file mode 100644 index 0000000..89bd541 --- /dev/null +++ "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/readme.md" @@ -0,0 +1,41 @@ +# 文件夹总览 + +- sentiment_analysis +   ├── aclImdb +   │ ├── test +   │ │   ├── neg.txt +   │ │   └── pos.txt +   │ ├── train +   │ │   ├── neg.txt +   │ │   └── pos.txt +   │ ├── data_reform.py +   │  └── readme.md +   ├── cache +  │  └── weight.txt +   ├── glove +  │  ├── glove_re_file.py # 用于分解glove文件并上传至华为云 +  │  ├── glove.6B.50d.py +  │  ├── glove.6B.100d.py +  │  ├── glove.6B.200d.py +  │  └── glove.6B.300d.py +   ├── mindrecord +   │  └── 用于存放mindrecord格式数据集文件 +   ├── model +   │  └── 用于存放训练好的模型文件 +   ├── glove_bilstm.ipynb +   ├── lstm.py +   └── readme.md + + +# 环境要求 + +- 本环境要求指,在华为云notebook-mindspore1.2.0-cuda10.1-cudnn7-ubuntu18.04下的配置环境要求 + * gensim-4.1.2 + +# 运行方法 + +- 第一次运行时,需删除mindrecord文件夹下的数据集文件 + * 运行glove_bilstm.ipynb +- 第二次及之后运行时 + * 若未更改glove维度,则可以直接从glove_bilstm.ipynb内【7.创建数据集】开始的步骤开始运行 + * 若有更改glove维度,则参考第一次运行时的步骤,并从头开始运行glove_bilstm.ipynb文件 \ No newline at end of file -- Gitee From d373bae9b5a08bdb007c5fd128ee0c86c01466bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=85=B7=E7=9B=96?= <10061212+wh1051899053@user.noreply.gitee.com> Date: Wed, 1 Dec 2021 07:53:06 +0000 Subject: [PATCH 05/11] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20co?= =?UTF-8?q?de/2021=5Fautumn/=E9=87=91=E4=BD=B3=E6=83=A0-=E5=9F=BA=E4=BA=8E?= =?UTF-8?q?Mindspore=E7=9A=84=E4=BA=8C=E5=85=83=E6=83=85=E6=84=9F=E5=88=86?= =?UTF-8?q?=E6=9E=90/.keep?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../.keep" | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 
"code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/.keep" diff --git "a/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/.keep" "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/.keep" deleted file mode 100644 index e69de29..0000000 -- Gitee From 536dd1980ef767599c9abfc9c190b960ff33a731 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=85=B7=E7=9B=96?= <10061212+wh1051899053@user.noreply.gitee.com> Date: Wed, 1 Dec 2021 07:59:19 +0000 Subject: [PATCH 06/11] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20aclImdb?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../aclImdb/.keep" | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 "code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/aclImdb/.keep" diff --git "a/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/aclImdb/.keep" "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/aclImdb/.keep" new file mode 100644 index 0000000..e69de29 -- Gitee From 8218e1a31f3946cfeae55077b75b34594eaabd71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=85=B7=E7=9B=96?= <10061212+wh1051899053@user.noreply.gitee.com> Date: Wed, 1 Dec 2021 07:59:36 +0000 Subject: [PATCH 07/11] =?UTF-8?q?=E7=AC=AC=E4=B8=80=E6=AC=A1=E6=8F=90?= =?UTF-8?q?=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../aclImdb/data_reform.py" | 23 ++++++++++ .../aclImdb/readme.md" | 44 +++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 "code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/aclImdb/data_reform.py" create mode 100644 "code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/aclImdb/readme.md" diff --git "a/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/aclImdb/data_reform.py" "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/aclImdb/data_reform.py" new file mode 100644 index 0000000..9a06206 --- /dev/null +++ "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/aclImdb/data_reform.py" @@ -0,0 +1,23 @@ +import os + +dir_path = os.path.dirname(os.path.realpath(__file__)) + +def data_reform(path, seg='train'): + + labels = ['pos', 'neg'] + + for label in labels: + + # 重新写入的文件名为dir_path/train/pos.txt + with open(os.path.join(path, seg, label)+'.txt', 
'w', encoding='utf-8') as wf: + files = os.listdir(os.path.join(path, seg, label)) + + # files: dir_path/train/pos + for file in files: + with open(os.path.join(path, seg, label, file), 'r', encoding='utf8') as rf: + wf.write(rf.read() + '\n') + +imdb_data_path = dir_path + +data_reform(imdb_data_path, seg='train') +data_reform(imdb_data_path, seg='test') diff --git "a/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/aclImdb/readme.md" "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/aclImdb/readme.md" new file mode 100644 index 0000000..c5e48ae --- /dev/null +++ "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/aclImdb/readme.md" @@ -0,0 +1,44 @@ + +- 在http://ai.stanford.edu/~amaas/data/sentiment/ 下载得到压缩包aclImdb.tar.gz + +- 解压后文件夹目录如下: + + aclImdb + ├── test + │   ├── neg + │  │  └── txt文件 * 12500 + │   ├── pos + │  │  └── txt文件 * 12500 + │   ├── labeledBow.feat + │   ├── urls_neg.txt + │   └── urls_pos.txt + ├── train + │   ├── neg + │  │  └── txt文件 * 12500 + │   ├── pos + │  │  └── txt文件 * 12500 + │   ├── unsup + │  │  └── txt文件 * 50000 + │   ├── labeledBow.feat + │   ├── unsupBow.feat + │   ├── urls_neg.txt + │   ├── urls_pos.txt + │   └── ulrs_unsup.txt + ├── imdb.vocab + ├── imdbEr.txt + └── README + +- 由于train和test文件夹下的文件数量过多,因此在上传至ModelArts时文件数量受到限制。 +- 因此,在本地执行了data_reform.py文件,将每文件目录下的数据按序整合进同一txt文件中。 + +- 删去无用文件并重整后的文件夹目录如下: + +aclImdb + ├── test + │   ├── neg.txt + │   └── pos.txt + ├── train + │   ├── neg.txt + │   └── pos.txt + ├── data_reform.py + └── readme.md -- Gitee From e25a3c5a83f6db23ad290d0f0e647a6c35d559ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=85=B7=E7=9B=96?= <10061212+wh1051899053@user.noreply.gitee.com> Date: Wed, 1 Dec 2021 08:01:33 +0000 Subject: [PATCH 08/11] =?UTF-8?q?update=20code/2021=5Fautumn/=E9=87=91?= =?UTF-8?q?=E4=BD=B3=E6=83=A0-=E5=9F=BA=E4=BA=8EMindspore=E7=9A=84?= =?UTF-8?q?=E4=BA=8C=E5=85=83=E6=83=85=E6=84=9F=E5=88=86=E6=9E=90/readme.m?= =?UTF-8?q?d.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../readme.md" | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git "a/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/readme.md" "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/readme.md" index 89bd541..974baf3 100644 --- "a/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/readme.md" +++ "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/readme.md" @@ -38,4 +38,16 @@ * 运行glove_bilstm.ipynb - 第二次及之后运行时 * 若未更改glove维度,则可以直接从glove_bilstm.ipynb内【7.创建数据集】开始的步骤开始运行 - * 若有更改glove维度,则参考第一次运行时的步骤,并从头开始运行glove_bilstm.ipynb文件 \ No newline at end of file + * 若有更改glove维度,则参考第一次运行时的步骤,并从头开始运行glove_bilstm.ipynb文件 + +# gitee中需要补足的文件 + 
+- 数据集文件aclImdb + * 下载地址:http://ai.stanford.edu/~amaas/data/sentiment/ + * 下载并解压至aclImdb文件夹中,运行data_reform.py文件,即可得到重整后的数据集文件 +- 词嵌入文件glove + * 下载地址:http://nlp.stanford.edu/data/glove.6B.zip + * 下载后解压存放至glove文件夹下即可 +- 文件夹 + * 文件夹存放位置如上述文件夹总览所示 + * 仍需不足的空文件夹有:cache、mindrecord、model \ No newline at end of file -- Gitee From 2c18b5893d922364546fc2ea244cf9b56e685eb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=85=B7=E7=9B=96?= <10061212+wh1051899053@user.noreply.gitee.com> Date: Wed, 1 Dec 2021 08:01:44 +0000 Subject: [PATCH 09/11] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20glove?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../glove/.keep" | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 "code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/glove/.keep" diff --git "a/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/glove/.keep" "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/glove/.keep" new file mode 100644 index 0000000..e69de29 -- Gitee From 6c025db6e40779c48a6eb20f3aeeff3329421f06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=85=B7=E7=9B=96?= <10061212+wh1051899053@user.noreply.gitee.com> Date: Wed, 1 Dec 2021 08:01:59 +0000 Subject: [PATCH 10/11] =?UTF-8?q?=E7=AC=AC=E4=BA=8C=E6=AC=A1=E4=B8=8A?= =?UTF-8?q?=E4=BC=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../glove/glove_re_file.py" | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 "code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/glove/glove_re_file.py" diff --git "a/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/glove/glove_re_file.py" "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/glove/glove_re_file.py" new file mode 100644 index 0000000..5e5faa9 --- /dev/null +++ "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/glove/glove_re_file.py" @@ -0,0 +1,24 @@ +linecount = 0 +filecount = 0 +with open('glove.6B.100d.txt','r',encoding='utf-8') as f: + g = open('glove.6B.100d_0.txt','a',encoding='utf-8') + for i in f.readlines(): + if linecount>= 50000: + filecount += 1 + g = open('glove.6B.100d_'+str(filecount)+'.txt','a',encoding='utf-8') + linecount = 0 + + g.write(i) + linecount += 1 + +linecount = 0 +with open('new_glove.6B.100d.txt','w',encoding='utf-8') as f: + for i in range(0,filecount+1): + g = open('glove.6B.100d_'+str(i)+'.txt','r',encoding='utf-8') + for line in g.readlines(): + f.write(line) + linecount+=1 + +print(linecount) + + -- Gitee From 50e0abc5c909bb2a7d844b5f97cbefcd50eb61f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=85=B7=E7=9B=96?= <10061212+wh1051899053@user.noreply.gitee.com> Date: Wed, 1 Dec 2021 08:02:40 +0000 Subject: 
[PATCH 11/11] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20co?= =?UTF-8?q?de/2021=5Fautumn/=E9=87=91=E4=BD=B3=E6=83=A0-=E5=9F=BA=E4=BA=8E?= =?UTF-8?q?Mindspore=E7=9A=84=E4=BA=8C=E5=85=83=E6=83=85=E6=84=9F=E5=88=86?= =?UTF-8?q?=E6=9E=90/glove/.keep?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../glove/.keep" | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 "code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/glove/.keep" diff --git "a/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/glove/.keep" "b/code/2021_autumn/\351\207\221\344\275\263\346\203\240-\345\237\272\344\272\216Mindspore\347\232\204\344\272\214\345\205\203\346\203\205\346\204\237\345\210\206\346\236\220/glove/.keep" deleted file mode 100644 index e69de29..0000000 -- Gitee
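
For reference, a minimal single-review inference sketch built on the SentimentNet defined
in lstm.py above. It is illustrative only: the hyper-parameters, the checkpoint file name
and the GPU target are assumptions chosen to mirror the training setup in
glove_bilstm.ipynb and must be replaced with the values actually used; ./cache/weight.txt
is the embedding table saved by the notebook.

    import numpy as np
    from mindspore import Tensor, context
    from mindspore.train.serialization import load_checkpoint, load_param_into_net
    from lstm import SentimentNet

    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")

    # Embedding table written by the notebook (shape: vocab_size x embed_size).
    embeddings = np.loadtxt("./cache/weight.txt").astype(np.float32)

    net = SentimentNet(vocab_size=embeddings.shape[0],
                       embed_size=embeddings.shape[1],   # 100 for glove.6B.100d
                       num_hiddens=100,                  # assumed training setting
                       num_layers=2,                     # assumed training setting
                       bidirectional=True,
                       num_classes=2,
                       weight=Tensor(embeddings),
                       batch_size=1)
    # Assumed checkpoint name; use whatever file the training callbacks wrote under ./model.
    load_param_into_net(net, load_checkpoint("./model/lstm.ckpt"))
    net.set_train(False)

    # SentimentNet expects a (batch_size, 500) int32 index sequence; a zero-padded dummy
    # sequence is enough to demonstrate the forward pass.
    review_idx = Tensor(np.zeros((1, 500), dtype=np.int32))
    logits = net(review_idx)
    print(logits.shape)   # (1, 2): one score per sentiment class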