{ "cells": [ { "cell_type": "markdown", "id": "c56fea40", "metadata": {}, "source": [ "## MODEL TRAINING" ] }, { "cell_type": "code", "execution_count": 2, "id": "49c0547d", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to /Users/xiao_deng/nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n", "[nltk_data] Downloading package wordnet to\n", "[nltk_data] /Users/xiao_deng/nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n", "[nltk_data] Downloading package omw-1.4 to\n", "[nltk_data] /Users/xiao_deng/nltk_data...\n", "[nltk_data] Package omw-1.4 is already up-to-date!\n", "[nltk_data] Downloading package stopwords to\n", "[nltk_data] /Users/xiao_deng/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n", "[nltk_data] Downloading package averaged_perceptron_tagger to\n", "[nltk_data] /Users/xiao_deng/nltk_data...\n", "[nltk_data] Package averaged_perceptron_tagger is already up-to-\n", "[nltk_data] date!\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1b378787d3434655aa2491f5bebd7faf", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Output()" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n" ], "text/plain": [] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "\n" ], "text/plain": [ "\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "fb1743fd07404972b8a0b22bedcfa527", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Output()" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n" ], "text/plain": [] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n" ], "text/plain": [ "\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "043d2ab3c31c487da32781800fabdf44", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Output()" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n" ], "text/plain": [] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n" ], "text/plain": [ "\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "168f2251156144d39c7d0bf609653bfb", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Output()" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n" ], "text/plain": [] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n" ], "text/plain": [ "\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# imdb_reviews_cls.ipynb\n", "# train a classification model to predict the reaction of reviews\n", "#\n", "# author : xiao deng\n", "# date : 20210612\n", "# platform: Macbook pro 14\n", "\n", "import os\n", "import pickle\n", "\n", "import nltk\n", "import numpy as np\n", "from rich.progress import track\n", "\n", "\n", "# 1) NLTK init\n", "nltk.download(\"punkt\")\n", "nltk.download('wordnet')\n", "nltk.download('omw-1.4')\n", "nltk.download('stopwords')\n", "nltk.download('averaged_perceptron_tagger')\n", "eng_stopwords = nltk.corpus.stopwords.words(\"english\")\n", "lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()\n", "stemmer = nltk.stem.porter.PorterStemmer()\n", "\n", "# 2) Read text files of train data\n", "train_pos_dir = './aclImdb/train/pos'\n", "train_neg_dir = './aclImdb/train/neg'\n", "pos_paths = [f'{train_pos_dir}/{file}' for file in os.listdir(train_pos_dir)]\n", "neg_paths = [f'{train_neg_dir}/{file}' for file in os.listdir(train_neg_dir)]\n", "\n", "pos_reviews = []\n", "neg_reviews = []\n", "\n", "for path in pos_paths:\n", " with open(path) as f:\n", " pos_reviews.append(f.read())\n", "\n", "for path in neg_paths:\n", " with open(path) as f:\n", " neg_reviews.append(f.read())\n", "\n", "# 3) Text Preprocessing\n", "# Step1: sentence seg\n", "pos_sentences = [nltk.sent_tokenize(review) for review in track(pos_reviews, 'Sentence tokenize pos reivews ...')]\n", "neg_sentences = [nltk.sent_tokenize(review) for review in track(neg_reviews, 'Sentence tokenize neg reivews ...')]\n", "\n", "# Step2: word seg (apply lowercase, mark removal, digit removal, stopword removal, lemma, stemming)\n", "word_code_map = {}\n", "pos_map = {'J': nltk.corpus.wordnet.ADJ,\n", " 'V': nltk.corpus.wordnet.VERB,\n", " 'R': nltk.corpus.wordnet.ADV}\n", "# positive reviews\n", "pos_words = []\n", "for sentences in track(pos_sentences, 'Word tokenize pos reivews ...'):\n", " review_words = []\n", " \n", " for sentence in sentences:\n", " words = nltk.tokenize.word_tokenize(sentence)\n", " words = [word.lower() for word in words if word.isalnum() and not word.isdigit() and word not in eng_stopwords]\n", " pos_tags = [ele[1][0] for ele in nltk.pos_tag(words)]\n", " pos_tags = [pos_map[tag] if tag in pos_map else nltk.corpus.wordnet.NOUN for tag in pos_tags]\n", " words = [lemmatizer.lemmatize(word, pos=pos_tags[i]) for i, word in enumerate(words)]\n", " words = [stemmer.stem(word) for word in words]\n", " for word in words:\n", " if word not in word_code_map:\n", " word_code_map[word] = len(word_code_map)\n", " review_words += words\n", " \n", " pos_words.append(review_words)\n", "\n", "# negative reviews\n", "neg_words = []\n", "for sentences in track(neg_sentences, 'Word tokenize neg reivews ...'):\n", " review_words = []\n", " \n", " for sentence in sentences:\n", " words = nltk.tokenize.word_tokenize(sentence)\n", " words = [word.lower() for word in words if word.isalnum() and not word.isdigit() and word not in eng_stopwords]\n", " pos_tags = [ele[1][0] for ele in nltk.pos_tag(words)]\n", " pos_tags = [pos_map[tag] if tag in pos_map else nltk.corpus.wordnet.NOUN for tag in pos_tags]\n", " words = [lemmatizer.lemmatize(word, pos=pos_tags[i]) for i, word in enumerate(words)]\n", " words = [stemmer.stem(word) for word in words]\n", " for word in words:\n", " if word not in word_code_map:\n", " word_code_map[word] = len(word_code_map)\n", " review_words += words\n", " \n", " 
neg_words.append(review_words)\n", "\n", "with open('word_code_map.pkl', 'wb') as f:\n", " pickle.dump(word_code_map, f)\n", "\n", "# 4) Encoding\n", "max_len = max([len(review) for review in pos_words+neg_words])\n", "pos_x = []\n", "for sentence in pos_words:\n", " sentence_encode = [word_code_map[word] for word in sentence]\n", " sentence_encode = np.pad(sentence_encode, (0, max_len-len(sentence_encode)), mode='constant')\n", " pos_x.append(sentence_encode)\n", "pos_y = [True] * len(pos_x)\n", "\n", "neg_x = []\n", "for sentence in neg_words:\n", " sentence_encode = [word_code_map[word] for word in sentence]\n", " sentence_encode = np.pad(sentence_encode, (0, max_len-len(sentence_encode)), mode='constant')\n", " neg_x.append(sentence_encode)\n", "neg_y = [False] * len(neg_x)" ] }, { "cell_type": "code", "execution_count": 6, "id": "5f3c6c05", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "train_x shape: (20000, 1486)\n", "train_y shape: (20000,)\n", "valid_x shape: (5000, 1486)\n", "valid_y shape: (5000,)\n", "Model: \"sequential_3\"\n", "_________________________________________________________________\n", " Layer (type) Output Shape Param # \n", "=================================================================\n", " embedding (Embedding) (None, 1486, 128) 6250752 \n", " \n", " dropout (Dropout) (None, 1486, 128) 0 \n", " \n", " conv1d (Conv1D) (None, 1486, 64) 41024 \n", " \n", " global_max_pooling1d (Globa (None, 64) 0 \n", " lMaxPooling1D) \n", " \n", " dense_7 (Dense) (None, 32) 2080 \n", " \n", " batch_normalization_2 (Batc (None, 32) 128 \n", " hNormalization) \n", " \n", " dropout_1 (Dropout) (None, 32) 0 \n", " \n", " dense_8 (Dense) (None, 1) 33 \n", " \n", "=================================================================\n", "Total params: 6,294,017\n", "Trainable params: 6,293,953\n", "Non-trainable params: 64\n", "_________________________________________________________________\n", "Epoch 1/60\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2022-06-13 18:17:45.783779: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "625/625 [==============================] - ETA: 0s - loss: 0.7377 - Accuracy: 0.5212" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2022-06-13 18:18:53.424627: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "625/625 [==============================] - 70s 106ms/step - loss: 0.7377 - Accuracy: 0.5212 - val_loss: 0.7963 - val_Accuracy: 0.5042 - lr: 1.0000e-04\n", "Epoch 2/60\n", "625/625 [==============================] - 63s 101ms/step - loss: 0.6486 - Accuracy: 0.6177 - val_loss: 0.7735 - val_Accuracy: 0.5574 - lr: 1.0000e-04\n", "Epoch 3/60\n", "625/625 [==============================] - 62s 99ms/step - loss: 0.5696 - Accuracy: 0.7036 - val_loss: 0.5795 - val_Accuracy: 0.7050 - lr: 1.0000e-04\n", "Epoch 4/60\n", "625/625 [==============================] - 60s 96ms/step - loss: 0.4933 - Accuracy: 0.7629 - val_loss: 0.4854 - val_Accuracy: 0.7666 - lr: 1.0000e-04\n", "Epoch 5/60\n", "625/625 [==============================] - 63s 101ms/step - loss: 0.4167 - Accuracy: 0.8115 - val_loss: 0.4602 - val_Accuracy: 0.7908 - lr: 1.0000e-04\n", "Epoch 6/60\n", "625/625 [==============================] - 65s 103ms/step - 
loss: 0.3633 - Accuracy: 0.8471 - val_loss: 0.4089 - val_Accuracy: 0.8174 - lr: 1.0000e-04\n", "Epoch 7/60\n", "625/625 [==============================] - 66s 105ms/step - loss: 0.3154 - Accuracy: 0.8703 - val_loss: 0.3897 - val_Accuracy: 0.8308 - lr: 1.0000e-04\n", "Epoch 8/60\n", "625/625 [==============================] - 61s 98ms/step - loss: 0.2759 - Accuracy: 0.8866 - val_loss: 0.3817 - val_Accuracy: 0.8354 - lr: 1.0000e-04\n", "Epoch 9/60\n", "625/625 [==============================] - 64s 102ms/step - loss: 0.2406 - Accuracy: 0.9042 - val_loss: 0.3684 - val_Accuracy: 0.8470 - lr: 1.0000e-04\n", "Epoch 10/60\n", "625/625 [==============================] - 65s 104ms/step - loss: 0.2168 - Accuracy: 0.9165 - val_loss: 0.3664 - val_Accuracy: 0.8488 - lr: 1.0000e-04\n", "Epoch 11/60\n", "625/625 [==============================] - 64s 102ms/step - loss: 0.1838 - Accuracy: 0.9299 - val_loss: 0.3825 - val_Accuracy: 0.8484 - lr: 1.0000e-04\n", "Epoch 12/60\n", "625/625 [==============================] - 67s 106ms/step - loss: 0.1657 - Accuracy: 0.9376 - val_loss: 0.3853 - val_Accuracy: 0.8546 - lr: 1.0000e-04\n", "Epoch 13/60\n", "625/625 [==============================] - 66s 106ms/step - loss: 0.1514 - Accuracy: 0.9450 - val_loss: 0.3751 - val_Accuracy: 0.8590 - lr: 1.0000e-04\n", "Epoch 14/60\n", "625/625 [==============================] - 66s 106ms/step - loss: 0.1306 - Accuracy: 0.9509 - val_loss: 0.3860 - val_Accuracy: 0.8600 - lr: 1.0000e-04\n", "Epoch 15/60\n", "625/625 [==============================] - ETA: 0s - loss: 0.1237 - Accuracy: 0.9556\n", "Epoch 15: ReduceLROnPlateau reducing learning rate to 1.9999999494757503e-05.\n", "625/625 [==============================] - 68s 108ms/step - loss: 0.1237 - Accuracy: 0.9556 - val_loss: 0.4041 - val_Accuracy: 0.8604 - lr: 1.0000e-04\n", "Epoch 16/60\n", "625/625 [==============================] - 69s 111ms/step - loss: 0.1052 - Accuracy: 0.9626 - val_loss: 0.4071 - val_Accuracy: 0.8608 - lr: 2.0000e-05\n", "Epoch 17/60\n", "625/625 [==============================] - 70s 112ms/step - loss: 0.1012 - Accuracy: 0.9648 - val_loss: 0.4050 - val_Accuracy: 0.8608 - lr: 2.0000e-05\n" ] } ], "source": [ "import random\n", "\n", "from tensorflow.keras import Sequential\n", "from tensorflow.keras.losses import BinaryCrossentropy\n", "from tensorflow.keras.layers import Dense, Embedding, GlobalMaxPooling1D, BatchNormalization\n", "from tensorflow.keras.layers import Dropout, Conv1D\n", "from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, ReduceLROnPlateau\n", "from tensorflow.keras.optimizers import Adam\n", "\n", "\n", "# 5) Split training & validation set\n", "seed = 0\n", "random.Random(seed).shuffle(pos_x)\n", "random.Random(seed).shuffle(neg_x)\n", "\n", "train_ratio = 0.8\n", "idx_p = int(len(pos_x) * train_ratio)\n", "idx_n = int(len(neg_x) * train_ratio)\n", "train_x = pos_x[:idx_p] + neg_x[:idx_n]\n", "train_y = pos_y[:idx_p] + neg_y[:idx_n]\n", "valid_x = pos_x[idx_p:] + neg_x[idx_n:]\n", "valid_y = pos_y[idx_p:] + neg_y[idx_n:]\n", "\n", "seed = 1\n", "random.Random(seed).shuffle(train_x)\n", "random.Random(seed).shuffle(train_y)\n", "random.Random(seed).shuffle(valid_x)\n", "random.Random(seed).shuffle(valid_y)\n", "\n", "train_x = np.array(train_x)\n", "train_y = np.array(train_y)\n", "valid_x = np.array(valid_x)\n", "valid_y = np.array(valid_y)\n", "\n", "print(f'train_x shape: {train_x.shape}')\n", "print(f'train_y shape: {train_y.shape}')\n", "print(f'valid_x shape: {valid_x.shape}')\n", "print(f'valid_y shape: 
{valid_y.shape}')\n", "\n", "# 6) Build a CNN model with a trainable word-embedding layer in TF (I used a similar model to classify EMRs in my master's thesis)\n", "# ref: https://arxiv.org/pdf/1408.5882.pdf (Kim 2014, Convolutional Neural Networks for Sentence Classification)\n", "# NOTE: index 0 currently serves both as the padding value and as the code of the first word in word_code_map;\n", "# reserving 0 for padding (input_dim=len(word_code_map)+1, mask_zero=True) would avoid that overlap.\n", "embedding_dim = 128\n", "model = Sequential([\n", " Embedding(len(word_code_map), embedding_dim, input_length=max_len),\n", " Dropout(0.5),\n", " Conv1D(64, 5, padding='same', activation='relu', strides=1),\n", " GlobalMaxPooling1D(),\n", " Dense(32, activation='linear'),\n", " BatchNormalization(),\n", " Dropout(0.5),\n", " Dense(1, activation='sigmoid')\n", "])\n", "\n", "init_lr = 1e-4\n", "model.compile(optimizer=Adam(learning_rate=init_lr),\n", " loss=BinaryCrossentropy(from_logits=False), # the final Dense layer applies a sigmoid, so the model outputs probabilities rather than logits\n", " metrics=['Accuracy'])\n", "\n", "model.summary()\n", "\n", "# 7) Fit the training data\n", "callbacks = [TensorBoard(log_dir=\"logs\"),\n", " EarlyStopping(patience=7), # prevent overfitting\n", " ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, verbose=1, mode='auto', min_lr=init_lr/100)]\n", "\n", "model.fit(\n", " train_x,\n", " train_y,\n", " batch_size=32,\n", " validation_data=(valid_x, valid_y),\n", " epochs=60,\n", " callbacks=callbacks\n", ")\n", "\n", "model.save('imdb_cls_model.h5')" ] }, { "cell_type": "markdown", "id": "d9174db3", "metadata": {}, "source": [ "## MODEL EVALUATION" ] }, { "cell_type": "code", "execution_count": 5, "id": "08ee9c8f", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to /Users/xiao_deng/nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n", "[nltk_data] Downloading package wordnet to\n", "[nltk_data] /Users/xiao_deng/nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n", "[nltk_data] Downloading package omw-1.4 to\n", "[nltk_data] /Users/xiao_deng/nltk_data...\n", "[nltk_data] Package omw-1.4 is already up-to-date!\n", "[nltk_data] Downloading package stopwords to\n", "[nltk_data] /Users/xiao_deng/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n", "[nltk_data] Downloading package averaged_perceptron_tagger to\n", "[nltk_data] /Users/xiao_deng/nltk_data...\n", "[nltk_data] Package averaged_perceptron_tagger is already up-to-\n", "[nltk_data] date!\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a91d0135a31f45c18d410bc7f0e7e1ef", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Output()" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n" ], "text/plain": [] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n" ], "text/plain": [ "\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c5429bdd168d413087e8e6c43e71a5a8", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Output()" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n" ], "text/plain": [] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n" ], "text/plain": [ "\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "fc0aafe9b45b45bda8b1e2bb1617b92f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Output()" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n" ], "text/plain": [] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n" ], "text/plain": [ "\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "fdafa9095d7c4b6b97f727d2f926df04", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Output()" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n" ], "text/plain": [] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n" ], "text/plain": [ "\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import os\n", "import pickle\n", "\n", "import nltk\n", "import numpy as np\n", "from rich.progress import track\n", "\n", "\n", "# 1) NLTK init\n", "nltk.download(\"punkt\")\n", "nltk.download('wordnet')\n", "nltk.download('omw-1.4')\n", "nltk.download('stopwords')\n", "nltk.download('averaged_perceptron_tagger')\n", "eng_stopwords = nltk.corpus.stopwords.words(\"english\")\n", "lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()\n", "stemmer = nltk.stem.porter.PorterStemmer()\n", "\n", "# 2) Read text files of train data\n", "test_pos_dir = './aclImdb/test/pos'\n", "test_neg_dir = './aclImdb/test/neg'\n", "pos_paths = [f'{test_pos_dir}/{file}' for file in os.listdir(test_pos_dir)]\n", "neg_paths = [f'{test_neg_dir}/{file}' for file in os.listdir(test_neg_dir)]\n", "\n", "pos_reviews = []\n", "neg_reviews = []\n", "\n", "for path in pos_paths:\n", " with open(path) as f:\n", " pos_reviews.append(f.read())\n", "\n", "for path in neg_paths:\n", " with open(path) as f:\n", " neg_reviews.append(f.read())\n", "\n", "# 3) Text Preprocessing\n", "# Step1: sentence seg\n", "pos_sentences = [nltk.sent_tokenize(review) for review in track(pos_reviews, 'Sentence tokenize pos reivews ...')]\n", "neg_sentences = [nltk.sent_tokenize(review) for review in track(neg_reviews, 'Sentence tokenize neg reivews ...')]\n", "\n", "# Step2: word seg (apply lowercase, mark removal, digit removal, stopword removal, lemma, stemming)\n", "with open('word_code_map.pkl', 'rb') as f:\n", " word_code_map = pickle.load(f)\n", "pos_map = {'J': nltk.corpus.wordnet.ADJ,\n", " 'V': nltk.corpus.wordnet.VERB,\n", " 'R': nltk.corpus.wordnet.ADV}\n", "# positive reviews\n", "pos_words = []\n", "for sentences in track(pos_sentences, 'Word tokenize pos reivews ...'):\n", " review_words = []\n", " \n", " for sentence in sentences:\n", " words = nltk.tokenize.word_tokenize(sentence)\n", " words = [word.lower() for word in words if word.isalnum() and not word.isdigit() and word not in eng_stopwords]\n", " pos_tags = [ele[1][0] for ele in nltk.pos_tag(words)]\n", " pos_tags = [pos_map[tag] if tag in pos_map else nltk.corpus.wordnet.NOUN for tag in pos_tags]\n", " words = [lemmatizer.lemmatize(word, pos=pos_tags[i]) for i, word in enumerate(words)]\n", " words = [stemmer.stem(word) for word in words]\n", " review_words += words\n", " \n", " pos_words.append(review_words)\n", "\n", "# negative reviews\n", "neg_words = []\n", "for sentences in track(neg_sentences, 'Word tokenize neg reivews ...'):\n", " review_words = []\n", " \n", " for sentence in sentences:\n", " words = nltk.tokenize.word_tokenize(sentence)\n", " words = [word.lower() for word in words if word.isalnum() and not word.isdigit() and word not in eng_stopwords]\n", " pos_tags = [ele[1][0] for ele in nltk.pos_tag(words)]\n", " pos_tags = [pos_map[tag] if tag in pos_map else nltk.corpus.wordnet.NOUN for tag in pos_tags]\n", " words = [lemmatizer.lemmatize(word, pos=pos_tags[i]) for i, word in enumerate(words)]\n", " words = [stemmer.stem(word) for word in words]\n", " review_words += words\n", " \n", " neg_words.append(review_words)\n", "\n", "# 4) Encoding\n", "max_len = 1486 # max_len of training set\n", "pos_x = []\n", "for sentence in pos_words:\n", " sentence_encode = [word_code_map[word] for word in sentence if word in word_code_map]\n", " if len(sentence_encode) <= max_len:\n", " sentence_encode = np.pad(sentence_encode, (0, max_len-len(sentence_encode)), 
mode='constant')\n", " else:\n", " sentence_encode = sentence_encode[:max_len]\n", " pos_x.append(sentence_encode)\n", "pos_y = [True] * len(pos_x)\n", "\n", "neg_x = []\n", "for sentence in neg_words:\n", " sentence_encode = [word_code_map[word] for word in sentence if word in word_code_map]\n", " if len(sentence_encode) <= max_len:\n", " sentence_encode = np.pad(sentence_encode, (0, max_len-len(sentence_encode)), mode='constant')\n", " else:\n", " sentence_encode = sentence_encode[:max_len]\n", " neg_x.append(sentence_encode)\n", "neg_y = [False] * len(neg_x)" ] }, { "cell_type": "code", "execution_count": 6, "id": "828afa33", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "test_x shape: (25000, 1486)\n", "test_y shape: (25000,)\n", "Metal device set to: Apple M1 Pro\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2022-06-13 19:45:13.666996: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.\n", "2022-06-13 19:45:13.667118: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: