
How do I set up a multi-variate regression problem using Trax?

The code below fails with AssertionError: Invalid shape (16, 2); expected (16,). The error comes from the L2Loss object.

The following is my attempt to adapt the sentiment analysis example into a regression problem:

import os
import trax
from trax import layers as tl
from trax.supervised import training
import numpy
import random


#train_stream = trax.data.TFDS('imdb_reviews', keys=('text', 'label'), train=True)()
#eval_stream = trax.data.TFDS('imdb_reviews', keys=('text', 'label'), train=False)()


def generate_samples():
    # (text, lat/lon)
    data = [
        ("Aberdeen MS",  numpy.array((33.824742, -88.554591)) ),
        ("Aberdeen SD", numpy.array((45.463186, -98.471033))),
        ("Aberdeen WA", numpy.array((46.976432, -123.795781))),
        ("Amite City LA", numpy.array((30.733723, -90.5208))),
        ("Amory MS", numpy.array((33.984789, -88.48001))),
        ("Amouli AS", numpy.array((-14.26556, -170.589772))),
        ("Amsterdam NY", numpy.array((42.953149, -74.19505)))
    ]
    for i in range(1024*8):
        yield random.choice(data)


train_stream = generate_samples()
eval_stream = generate_samples()

model = tl.Serial(
    tl.Embedding(vocab_size=8192, d_feature=256),
    tl.Mean(axis=1),  # Average on axis 1 (length of sentence).
    tl.Dense(2),      # Regress to lat/lon
#    tl.LogSoftmax()   # Produce log-probabilities.
)

# You can print model structure.
print(model)



print(next(train_stream))  # See one example.

data_pipeline = trax.data.Serial(
    trax.data.Tokenize(vocab_file='en_8k.subword', keys=[0]),
    trax.data.Shuffle(),
#    trax.data.FilterByLength(max_length=2048, length_keys=[0]),
    trax.data.BucketByLength(boundaries=[   8, 128,],
                             batch_sizes=[256,   64, 4],
                             length_keys=[0]),
    trax.data.AddLossWeights()
  )

train_batches_stream = data_pipeline(train_stream)
eval_batches_stream = data_pipeline(eval_stream)
example_batch = next(train_batches_stream)
print(f'shapes = {[x.shape for x in example_batch]}')  # Check the shapes.



# Training task.
train_task = training.TrainTask(
    labeled_data=train_batches_stream,
#    loss_layer=tl.CrossEntropyLoss(),
    loss_layer=tl.L2Loss(),
    optimizer=trax.optimizers.Adam(0.01),
    n_steps_per_checkpoint=500,
)

# Evaluation task.
eval_task = training.EvalTask(
    labeled_data=eval_batches_stream,
    metrics=[tl.L2Loss(),],
    n_eval_batches=20  # For less variance in eval numbers.
)
# Training loop saves checkpoints to output_dir.
output_dir = os.path.expanduser('~/output_dir/')
training_loop = training.Loop(model,
                              train_task,
                              eval_tasks=[eval_task],
                              output_dir=output_dir)
# Run 2000 steps (batches).
training_loop.run(2000)

1 Answer


The problem might be in the generate_samples() generator: it yields only 1024*8 (= 8192) samples. If I replace the line

for i in range(1024*8):

by

while True:

so that an infinite number of samples is generated, your example works on my machine.

Since generate_samples() yields only 8192 samples, train_batches_stream yields only 32 batches of 256 samples each, so you can train for at most 32 steps. However, you ask for 2000 steps.
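
For reference, here is a minimal sketch of the corrected generator (the data list is abbreviated; everything else is exactly as in the question):

def generate_samples():
    # (text, lat/lon) pairs, as in the question; list abbreviated here.
    data = [
        ("Aberdeen MS", numpy.array((33.824742, -88.554591))),
        ("Aberdeen SD", numpy.array((45.463186, -98.471033))),
        # ... remaining cities from the question ...
    ]
    while True:  # loop forever so the Trax data pipeline never runs dry
        yield random.choice(data)

With an infinite stream, the data pipeline can keep producing batches for as many training steps as the Loop requests.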
