# Beispiel: Text-Erzeugung

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import numpy as np

## Daten beschaffen

In [2]:
# Shakespeare sample text: five verse lines separated by newlines.
# The rest of the notebook builds its vocabulary and sequences from this string.
shakespeare_text = (
    "To be, or not to be, that is the question:\n"
    "Whether 'tis nobler in the mind to suffer\n"
    "The slings and arrows of outrageous fortune,\n"
    "Or to take arms against a sea of troubles\n"
    "And by opposing end them."
)

## Daten vorverarbeiten

In [3]:
# Build the vocabulary from the sample text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([shakespeare_text])
# One extra slot on top of the vocabulary size: the word ids shown below start
# at 1, and 0 is used as the padding value later on.
total_words = len(tokenizer.word_index) + 1

# For every text line, collect all prefixes of its token ids that contain at
# least two tokens (so each prefix has some context plus a word to predict).
input_sequences = []
for line in shakespeare_text.split('\n'):
    # token_list stays a top-level name on purpose: a later cell displays it.
    token_list = tokenizer.texts_to_sequences([line])[0]
    input_sequences.extend(
        token_list[:prefix_len]
        for prefix_len in range(2, len(token_list) + 1)
    )

In [4]:
# Inspect the token ids of the last text line processed above
# (token_list is the loop variable left over from the preprocessing loop).
token_list

[5, 27, 28, 29, 30]

In [5]:
# Show every generated prefix sequence; at this point they are still
# plain Python lists of different lengths (not yet padded).
input_sequences

[[1, 3],
 [1, 3, 4],
 [1, 3, 4, 7],
 [1, 3, 4, 7, 1],
 [1, 3, 4, 7, 1, 3],
 [1, 3, 4, 7, 1, 3, 8],
 [1, 3, 4, 7, 1, 3, 8, 9],
 [1, 3, 4, 7, 1, 3, 8, 9, 2],
 [1, 3, 4, 7, 1, 3, 8, 9, 2, 10],
 [11, 12],
 [11, 12, 13],
 [11, 12, 13, 14],
 [11, 12, 13, 14, 2],
 [11, 12, 13, 14, 2, 15],
 [11, 12, 13, 14, 2, 15, 1],
 [11, 12, 13, 14, 2, 15, 1, 16],
 [2, 17],
 [2, 17, 5],
 [2, 17, 5, 18],
 [2, 17, 5, 18, 6],
 [2, 17, 5, 18, 6, 19],
 [2, 17, 5, 18, 6, 19, 20],
 [4, 1],
 [4, 1, 21],
 [4, 1, 21, 22],
 [4, 1, 21, 22, 23],
 [4, 1, 21, 22, 23, 24],
 [4, 1, 21, 22, 23, 24, 25],
 [4, 1, 21, 22, 23, 24, 25, 6],
 [4, 1, 21, 22, 23, 24, 25, 6, 26],
 [5, 27],
 [5, 27, 28],
 [5, 27, 28, 29],
 [5, 27, 28, 29, 30]]

In [6]:
# Pad the sequences: every sequence is filled with leading zeros ('pre')
# up to the length of the longest one, giving a rectangular array.
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = np.array(
    tf.keras.preprocessing.sequence.pad_sequences(
        input_sequences,
        maxlen=max_sequence_len,
        padding='pre',
    )
)
input_sequences

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  1,  3],
       [ 0,  0,  0,  0,  0,  0,  0,  1,  3,  4],
       [ 0,  0,  0,  0,  0,  0,  1,  3,  4,  7],
       [ 0,  0,  0,  0,  0,  1,  3,  4,  7,  1],
       [ 0,  0,  0,  0,  1,  3,  4,  7,  1,  3],
       [ 0,  0,  0,  1,  3,  4,  7,  1,  3,  8],
       [ 0,  0,  1,  3,  4,  7,  1,  3,  8,  9],
       [ 0,  1,  3,  4,  7,  1,  3,  8,  9,  2],
       [ 1,  3,  4,  7,  1,  3,  8,  9,  2, 10],
       [ 0,  0,  0,  0,  0,  0,  0,  0, 11, 12],
       [ 0,  0,  0,  0,  0,  0,  0, 11, 12, 13],
       [ 0,  0,  0,  0,  0,  0, 11, 12, 13, 14],
       [ 0,  0,  0,  0,  0, 11, 12, 13, 14,  2],
       [ 0,  0,  0,  0, 11, 12, 13, 14,  2, 15],
       [ 0,  0,  0, 11, 12, 13, 14,  2, 15,  1],
       [ 0,  0, 11, 12, 13, 14,  2, 15,  1, 16],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  2, 17],
       [ 0,  0,  0,  0,  0,  0,  0,  2, 17,  5],
       [ 0,  0,  0,  0,  0,  0,  2, 17,  5, 18],
       [ 0,  0,  0,  0,  0,  2, 17,  5, 18,  6],
       [ 0,  0,  0, 

In [7]:
# Split into features and labels: the last token of each padded sequence is
# the word to predict, everything before it is the model input.
labels = input_sequences[:, -1]
xs = input_sequences[:, :-1]
# One-hot encode the labels over the whole vocabulary.
ys = to_categorical(labels, num_classes=total_words)

## Modell bauen

In [8]:
# Fix the random seeds before the model is created so that weight
# initialisation (and the data shuffling during training) should repeat on
# every "Restart Kernel -> Run All" instead of giving a different model each run.
# NOTE(review): some GPU kernels may still be non-deterministic.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Layer sizes, named so they can be tuned in one place.
EMBEDDING_DIM = 64
LSTM_UNITS = 64

# Build the model: token ids -> embeddings -> LSTM -> softmax over the vocabulary.
# The input length is max_sequence_len-1 because the last token of every
# padded sequence was split off as the label.
model = Sequential([
    Embedding(total_words, EMBEDDING_DIM, input_length=max_sequence_len-1),
    LSTM(LSTM_UNITS),
    Dense(total_words, activation='softmax')
])

# Compile the model; categorical cross-entropy matches the one-hot labels.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

## Training

In [9]:
# Train the model.
# Recommendation: 500 epochs. Only 10 are used here to keep the runtime short.
n_epochs = 10
model.fit(x=xs, y=ys, epochs=n_epochs, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2cda4927070>

## Modell anwenden

In [10]:
# Text generation function
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` word by word with the model's most likely next word.

    Uses the module-level ``tokenizer`` to convert text to token ids and back.

    Args:
        seed_text: Text to start from.
        next_words: Maximum number of words to append.
        model: Object whose ``predict`` returns one score per vocabulary
            index for a batch containing a single padded sequence.
        max_sequence_len: Length the training sequences were padded to; the
            model input is padded/truncated to ``max_sequence_len - 1``.

    Returns:
        ``seed_text`` followed by the generated words, separated by single
        spaces. Fewer than ``next_words`` words are appended if the model
        predicts an index that has no word (e.g. the padding index 0).
    """
    # Reverse lookup built once, instead of scanning word_index for every word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = tf.keras.preprocessing.sequence.pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # verbose=0: do not print a progress bar for every generated word.
        scores = model.predict(token_list, verbose=0)
        # argmax over a batch of one yields a one-element array; take the
        # scalar so the lookup does not rely on int-vs-array comparison.
        predicted = int(np.argmax(scores, axis=-1)[0])
        output_word = index_to_word.get(predicted)
        if output_word is None:
            # No word for this index. Appending nothing would leave the token
            # input unchanged, so each further step would repeat the same
            # prediction and only add blanks; stop instead.
            break
        seed_text += " " + output_word
    return seed_text

# Example: continue the seed "To be" with 20 generated words and print the result.
generated_text = generate_text("To be", 20, model, max_sequence_len)
print(generated_text)


To be to to to to be to to be to to be to to be to to be to to be
