{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Chat with `japanese-novel-gpt-j-6b` + the `qlora-marisa` adapter\n",
    "\n",
    "This notebook loads the base model `AIBunCho/japanese-novel-gpt-j-6b` in 8-bit, attaches the PEFT adapter `tsukemono/japanese-novel-gpt-j-6b-qlora-marisa`, and samples replies to a few example prompts.\n",
    "\n",
    "- Intended for a GPU runtime (8-bit loading goes through `bitsandbytes`).\n",
    "- Prompt format used by `generate()`: `ユーザー: <text>[SEP]魔理沙: `"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `peft` is required by the import cell below; `%pip` installs into the running kernel's environment.\n",
    "%pip install -q transformers accelerate bitsandbytes sentencepiece einops peft"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n",
    "from peft import PeftModel\n",
    "\n",
    "# Base causal LM and the adapter that is applied on top of it.\n",
    "MODEL_ID = \"AIBunCho/japanese-novel-gpt-j-6b\"\n",
    "MODEL_QLORA_ID = \"tsukemono/japanese-novel-gpt-j-6b-qlora-marisa\"\n",
    "\n",
    "# generate() samples (do_sample=True), so seed torch to start every fresh run from the same RNG state.\n",
    "SEED = 42\n",
    "torch.manual_seed(SEED)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "VOmiOziuEr6N",
    "outputId": "678b317e-f235-43f2-e443-1df05bd20253"
   },
   "outputs": [],
   "source": [
    "# Tokenizer setup\n",
    "tokenizer = AutoTokenizer.from_pretrained(\n",
    "    MODEL_ID,\n",
    "    use_fast=True,\n",
    ")\n",
    "# Last token id of \"[SEP]\": passed to generate() as an additional end-of-sequence id.\n",
    "ret_token = tokenizer(\"[SEP]\", truncation=True, add_special_tokens=False)['input_ids'][-1]\n",
    "# Last token id of \"(\": passed to generate() via bad_words_ids so it is never produced.\n",
    "bra_token = tokenizer(\"(\", truncation=True, add_special_tokens=False)['input_ids'][-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading checkpoint shards: 100%|██████████| 2/2 [00:24<00:00, 12.18s/it]\n"
     ]
    }
   ],
   "source": [
    "# Model setup: load the base model in 8-bit, then attach the adapter.\n",
    "model = AutoModelForCausalLM.from_pretrained(\n",
    "    MODEL_ID,\n",
    "    device_map=\"auto\",\n",
    "    # Explicit quantization config instead of the bare `load_in_8bit=True` keyword.\n",
    "    quantization_config=BitsAndBytesConfig(load_in_8bit=True),\n",
    "    torch_dtype=torch.float16,\n",
    ")\n",
    "model = PeftModel.from_pretrained(model, MODEL_QLORA_ID, device_map=\"auto\")\n",
    "# Switch to eval mode *after* wrapping so the adapter modules are covered as well.\n",
    "# Assigning the result keeps the cell from echoing the model repr.\n",
    "model = model.eval()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Text generation function\n",
    "def generate(text, maxTokens=512):\n",
    "    \"\"\"Sample one reply to ``text`` from the adapter-wrapped model.\n",
    "\n",
    "    The prompt is built as ``ユーザー: {text}[SEP]魔理沙: `` and generation stops at the\n",
    "    tokenizer's EOS token or at ``ret_token`` (the id looked up for \"[SEP]\").\n",
    "\n",
    "    Args:\n",
    "        text: User utterance inserted into the prompt.\n",
    "        maxTokens: Maximum number of newly generated tokens; the prompt is not counted.\n",
    "\n",
    "    Returns:\n",
    "        The decoded continuation only (prompt tokens are sliced off), with special\n",
    "        tokens removed.\n",
    "    \"\"\"\n",
    "    prompt = f\"ユーザー: {text}[SEP]魔理沙: \"\n",
    "    # Move the encoded prompt to the model's device rather than assuming `.cuda()`.\n",
    "    inputs = tokenizer(\n",
    "        prompt,\n",
    "        return_tensors=\"pt\",\n",
    "        truncation=True,\n",
    "        add_special_tokens=False,\n",
    "    ).to(model.device)\n",
    "    input_ids = inputs.input_ids\n",
    "    prompt_len = input_ids.size(1)\n",
    "    with torch.no_grad():\n",
    "        outputs = model.generate(\n",
    "            input_ids=input_ids,\n",
    "            # Forward the mask when the tokenizer returned one.\n",
    "            attention_mask=inputs.get(\"attention_mask\"),\n",
    "            # max_new_tokens bounds only the reply; max_length would also count the prompt.\n",
    "            max_new_tokens=maxTokens,\n",
    "            do_sample=True,\n",
    "            temperature=0.1,\n",
    "            top_p=0.9,\n",
    "            top_k=20,\n",
    "            no_repeat_ngram_size=2,\n",
    "            repetition_penalty=1.15,\n",
    "            pad_token_id=tokenizer.pad_token_id,\n",
    "            bad_words_ids=[[bra_token]],\n",
    "            eos_token_id=[tokenizer.eos_token_id, ret_token],\n",
    "        )\n",
    "    # Drop the echoed prompt and decode only the generated continuation.\n",
    "    reply_ids = outputs.tolist()[0][prompt_len:]\n",
    "    return tokenizer.decode(reply_ids, skip_special_tokens=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Example prompts\n",
    "\n",
    "Replies are sampled, so the outputs shown below may differ between runs."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'努力と根性だ! '"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "generate(\"強さの秘訣はなんですか?\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'良いんじゃないか。私は好きだぜ、この考え方は。ただ、ちょっと極端すぎる気もするけどな。 '"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "generate(\"ブッダの思想についてどう思う?\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'富士山だ! '"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "generate(\"日本で一番高い山は?\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'そうだな。一言で言えば「忘れられた者の楽園」だな。 '"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "generate(\"幻想郷ってどんな場所?\")"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "gpuType": "T4",
   "provenance": []
  },
  "gpuClass": "standard",
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}