TinyML: Data Collection
A model is only as good as the data it was trained on. Collecting sensor data from the device itself avoids the distribution shift that sinks most TinyML projects.
Why Collect on the Device
The canonical mistake: collect data from a phone accelerometer, train a model, deploy to the microcontroller, and wonder why accuracy is 60% instead of 95%.
Every sensor has its own noise profile, sample rate, axis orientation, and filtering. A model trained on one sensor will not transfer cleanly to a different one. Collect from the hardware you're deploying to.
Data Collection Firmware
The collection sketch streams sensor readings over Serial. Python reads them and writes CSV files.
This example collects accelerometer data for gesture classification. Three gestures: punch, flex, idle.
// data_collector.ino
// IMU driver for the Nano 33 BLE Sense Rev2 (BMI270 + BMM150). It exposes the
// same IMU.begin() / accelerationAvailable() / readAcceleration() calls used
// below. NOTE(review): if you are on a different Nano, include that board's
// IMU library instead (e.g. Arduino_LSM6DSOX.h on the Nano RP2040 Connect).
#include <Arduino_BMI270_BMM150.h>

// Nominal accelerometer rate. The sketch does not pace itself with a timer:
// it emits one row each time the IMU reports a new reading, so this value is
// only used to size a capture. Assumes the IMU's output data rate is close to
// 100 Hz -- confirm with IMU.accelerationSampleRate().
const int SAMPLE_RATE_HZ = 100;
const int CAPTURE_DURATION = 1; // seconds per gesture sample
// Number of rows sent for one capture before "DONE" is printed.
const int TOTAL_SAMPLES = SAMPLE_RATE_HZ * CAPTURE_DURATION;

bool capturing = false; // true while a capture is in progress
int sampleCount = 0;    // rows sent so far in the current capture
// One-time initialisation: open the serial link, start the IMU, and announce
// readiness to the host.
void setup() {
  Serial.begin(115200);
  while (!Serial) {
    // Block until the host has opened the serial port.
  }

  const bool imuStarted = IMU.begin();
  if (!imuStarted) {
    Serial.println("IMU init failed");
    for (;;) {
      // Nothing useful can be done without the IMU; halt here.
    }
  }

  Serial.println("READY"); // Python reads this to know the board is up
}
// Streams one capture per trigger: after any byte arrives over Serial, sends
// TOTAL_SAMPLES rows of "aX,aY,aZ" followed by a single "DONE" line.
void loop() {
  // Any incoming byte starts (or restarts) a capture. Drain everything that is
  // pending so that a multi-byte trigger such as "g\n" from a serial monitor
  // counts as ONE trigger; consuming a single byte per pass would restart the
  // capture on the next pass and produce an extra row.
  if (Serial.available() > 0) {
    while (Serial.available() > 0) {
      Serial.read();
    }
    capturing = true;
    sampleCount = 0;
  }

  if (!capturing) return;
  if (!IMU.accelerationAvailable()) return; // no new reading yet

  float ax, ay, az;
  // Skip the row if the read fails rather than sending unset values.
  if (!IMU.readAcceleration(ax, ay, az)) return;

  // Output as CSV: aX,aY,aZ
  Serial.print(ax, 4); Serial.print(',');
  Serial.print(ay, 4); Serial.print(',');
  Serial.println(az, 4);
  sampleCount++;

  if (sampleCount >= TOTAL_SAMPLES) {
    capturing = false;
    Serial.println("DONE"); // Python looks for this
  }
}
Flash this to the Nano before running the Python collector.
Python Collector Script
# collect.py
import serial
import csv
import os
import sys
import time
PORT = "/dev/ttyACM0"  # adjust for your system
BAUD = 115200
SAMPLES = 20  # number of 1-second captures per label
READ_TIMEOUT_S = 5  # per-readline timeout; one capture lasts about 1 second
READY_TIMEOUT_S = 10  # how long to wait for the board's READY banner


def read_line(ser):
    """Read one line from the board.

    Returns the stripped text, or None when the read timed out (pyserial's
    readline() returns empty bytes in that case).
    """
    raw = ser.readline()
    if not raw:
        return None
    return raw.decode("utf-8", errors="replace").strip()


def wait_for_ready(ser, timeout_s=READY_TIMEOUT_S):
    """Wait for the board's READY banner; return True if it was seen.

    The sketch prints READY only once, at the end of setup(). A board that
    was already running when the port was opened (for example on a second
    run of this script, if opening the port does not reset the board) stays
    silent, so give up after timeout_s seconds instead of waiting forever.

    Raises:
        RuntimeError: the board reported that its IMU failed to start.
    """
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        line = read_line(ser)
        if line == "READY":
            return True
        if line == "IMU init failed":
            raise RuntimeError("Board reported: IMU init failed")
    return False


def parse_row(line):
    """Return [aX, aY, aZ] as strings if line is a valid reading, else None.

    A valid reading has exactly three comma-separated fields that all parse
    as floats; anything else is treated as serial corruption.
    """
    fields = line.split(",")
    if len(fields) != 3:
        return None
    try:
        for field in fields:
            float(field)
    except ValueError:
        return None
    return fields


def capture_sample(ser):
    """Trigger one capture on the board and return its rows.

    Returns:
        A list of [aX, aY, aZ] string triples, in arrival order.

    Raises:
        TimeoutError: the board went silent before sending DONE.
    """
    ser.write(b"g")  # any byte starts a capture on the board
    rows = []
    dropped = 0
    while True:
        line = read_line(ser)
        if line is None:
            raise TimeoutError(
                f"No data from board for {READ_TIMEOUT_S}s while capturing "
                f"(got {len(rows)} rows, never saw DONE)"
            )
        if line == "DONE":
            break
        row = parse_row(line)
        if row is None:
            dropped += 1
        else:
            rows.append(row)
    if dropped:
        print(f" Warning: ignored {dropped} malformed line(s)")
    return rows


def main(argv=None):
    """Collect SAMPLES captures for the label named on the command line."""
    args = sys.argv[1:] if argv is None else argv
    if len(args) != 1:
        sys.exit("usage: python3 collect.py <label>   (e.g. punch, flex, idle)")
    label = args[0]
    out_dir = f"data/raw/{label}"
    os.makedirs(out_dir, exist_ok=True)

    # The context manager closes the port even if a capture fails.
    with serial.Serial(PORT, BAUD, timeout=READ_TIMEOUT_S) as ser:
        print("Waiting for board...")
        if not wait_for_ready(ser):
            print("No READY received; assuming the board is already running.")

        print(f"Collecting {SAMPLES} samples for label '{label}'")
        print("Press Enter before each gesture.")
        for i in range(SAMPLES):
            input(f" Sample {i+1}/{SAMPLES}: press Enter to start:")
            # Discard leftovers (e.g. from an aborted capture) so that a
            # stale DONE cannot end this capture early.
            ser.reset_input_buffer()
            rows = capture_sample(ser)

            fname = os.path.join(out_dir, f"{i:03d}.csv")
            with open(fname, "w", newline="") as f:
                writer = csv.writer(f)
                writer.writerow(["aX", "aY", "aZ"])
                writer.writerows(rows)
            print(f" Saved {len(rows)} rows to {fname}")
    print("Done.")


if __name__ == "__main__":
    main()
Run it once per label:
python3 collect.py punch
python3 collect.py flex
python3 collect.py idle
You should end up with data/raw/punch/000.csv through data/raw/punch/019.csv and similarly for the other labels.
Dataset Quality Guidelines
How many samples? For a 3-class gesture problem with simple, consistent gestures, 20 samples per class is a starting point. Aim for at least 50 per class for production. If accuracy plateaus and more data doesn't help, the problem is the model or the features, not the quantity.
Vary the conditions. Collect from multiple people if the model needs to generalize across users. Collect at different speeds, with the device mounted at different angles, indoors and outdoors if relevant.
Balance the classes. 20 samples of punch, 20 of flex, 20 of idle. An imbalanced dataset teaches the model to cheat by always predicting the majority class.
Check your data before training. A corrupted serial transfer or an interrupted capture creates malformed CSVs. Visual inspection saves confusion later.
Visualizing the Data
Always plot raw data before training. Patterns that are obvious on a graph but invisible in numbers will tell you whether your features are discriminative.
# visualize.py
import os
import csv
import numpy as np
import matplotlib.pyplot as plt
LABELS = ["punch", "flex", "idle"]
COLORS = ["#e63946", "#457b9d", "#2a9d8f"]
# CSV column -> line style, so all three axes can share one class colour.
SERIES = [("aX", "-"), ("aY", "--"), ("aZ", ":")]
SAMPLE_RATE_HZ = 100  # must match the collection sketch
MAX_FILES = 5  # captures drawn per class

fig, axes = plt.subplots(1, 3, figsize=(15, 4), sharey=True)
fig.suptitle("Accelerometer samples per class")
for ax, label, color in zip(axes, LABELS, COLORS):
    folder = f"data/raw/{label}"
    # Only read capture files; stray files (e.g. .DS_Store) would break parsing.
    csv_names = sorted(n for n in os.listdir(folder) if n.endswith(".csv"))
    for file_idx, fname in enumerate(csv_names[:MAX_FILES]):
        with open(os.path.join(folder, fname), newline="") as f:
            rows = list(csv.DictReader(f))
        # Real sample times: a capture that was cut short visibly ends early
        # instead of being stretched to fill the full second.
        t = np.arange(len(rows)) / SAMPLE_RATE_HZ
        for column, style in SERIES:
            values = [float(r[column]) for r in rows]
            ax.plot(
                t,
                values,
                color=color,
                linestyle=style,
                alpha=0.4,
                linewidth=0.8,
                # Label each axis once per subplot to keep the legend short.
                label=column if file_idx == 0 else "_nolegend_",
            )
    ax.set_title(label)
    ax.set_xlabel("time (s)")
    ax.set_ylabel("acceleration (g)" if label == LABELS[0] else "")
    ax.axhline(0, color="black", linewidth=0.3)
    if csv_names:
        ax.legend(loc="upper right", fontsize="small")
plt.tight_layout()
plt.savefig("data/samples_overview.png", dpi=150)
plt.show()
If the three gesture classes look visually similar when plotted, a classifier will struggle. That's useful information to have before investing in training.
Preprocessing and Saving as NumPy Arrays
The training script needs fixed-length arrays. Pad or truncate each sample to TARGET_LEN rows.
# preprocess.py
import os
import csv
import numpy as np
LABELS = ["punch", "flex", "idle"]
TARGET_LEN = 100  # rows per sample (= SAMPLE_RATE_HZ × CAPTURE_DURATION)
RAW_DIR = "data/raw"
OUT_DIR = "data/processed"


def load_rows(path):
    """Read one capture CSV and return its rows as [aX, aY, aZ] floats.

    Raises:
        KeyError, TypeError, ValueError: a column is missing, a row is
            short, or a field is not a number.
    """
    with open(path, newline="") as f:
        return [
            [float(r["aX"]), float(r["aY"]), float(r["aZ"])]
            for r in csv.DictReader(f)
        ]


def fit_length(rows, target_len=TARGET_LEN):
    """Return a copy of rows with exactly target_len rows.

    Long samples are truncated; short samples are padded by repeating the
    last row. The input list is not modified.

    Raises:
        ValueError: rows is empty, so there is no last row to repeat.
    """
    if not rows:
        raise ValueError("cannot pad an empty sample")
    kept = [list(r) for r in rows[:target_len]]
    # range() of a negative count is empty, so long samples get no padding.
    padding = [list(rows[-1]) for _ in range(target_len - len(rows))]
    return kept + padding


def main():
    """Convert every raw capture into fixed-length arrays and save them."""
    os.makedirs(OUT_DIR, exist_ok=True)
    X_all = []
    y_all = []
    for label_idx, label in enumerate(LABELS):
        folder = os.path.join(RAW_DIR, label)
        for fname in sorted(os.listdir(folder)):
            if not fname.endswith(".csv"):
                continue
            path = os.path.join(folder, fname)
            try:
                rows = load_rows(path)
            except (KeyError, TypeError, ValueError) as err:
                print(f"Skipping {path}: malformed CSV ({err!r})")
                continue
            if not rows:
                # An interrupted capture leaves a header-only file.
                print(f"Skipping {path}: no data rows")
                continue
            if len(rows) != TARGET_LEN:
                print(f"Note: {path} has {len(rows)} rows; "
                      f"adjusting to {TARGET_LEN}")
            X_all.append(fit_length(rows))
            y_all.append(label_idx)

    if not X_all:
        raise SystemExit(f"No usable samples found under {RAW_DIR}")

    X = np.array(X_all, dtype=np.float32)  # shape: (N, TARGET_LEN, 3)
    y = np.array(y_all, dtype=np.int32)
    # Flatten to (N, 300) for dense model, or keep (N, 100, 3) for Conv1D
    X_flat = X.reshape(len(X), -1)
    np.save(os.path.join(OUT_DIR, "X.npy"), X_flat)
    np.save(os.path.join(OUT_DIR, "y.npy"), y)
    print(f"Saved {len(X)} samples, shape {X_flat.shape}")
    # minlength keeps every label in the report even when it has no samples.
    counts = np.bincount(y, minlength=len(LABELS)).tolist()
    print(f"Class distribution: {dict(zip(LABELS, counts))}")


if __name__ == "__main__":
    main()
After running this, data/processed/X.npy and data/processed/y.npy are ready for the training script in chapter 5.
Collecting Audio Data
For keyword spotting, the workflow is similar but the raw signal is 16-bit PCM at 16 kHz.
// audio_collector.ino (sketch fragment)
#include <PDM.h>
// Microphone sample rate in Hz; passed to PDM.begin() in setup().
const int SAMPLE_RATE = 16000;
// Capacity of sampleBuffer, counted in samples (not bytes).
const int BUFFER_SIZE = 512;
// Filled by PDM.read() in onPDMdata() and printed by loop().
short sampleBuffer[BUFFER_SIZE];
// Number of samples waiting in sampleBuffer: set by onPDMdata(), reset to 0
// by loop() once they have been printed.
volatile int samplesRead;
// Callback registered with PDM.onReceive(): copies whatever the microphone
// has ready into sampleBuffer and records how many samples that was.
void onPDMdata() {
  const int pendingBytes = PDM.available();

  PDM.read(sampleBuffer, pendingBytes);

  // Two bytes per sample.
  samplesRead = pendingBytes / 2;
}
// One-time initialisation: open the serial link, register the PDM callback,
// and start the microphone (one channel at SAMPLE_RATE Hz).
void setup() {
  Serial.begin(115200);
  // Wait for the host to open the port so the messages below are not lost;
  // this matches the accelerometer collection sketch.
  while (!Serial);

  PDM.onReceive(onPDMdata);

  // PDM.begin() reports failure through its return value. Halt with a
  // message rather than silently streaming nothing.
  if (!PDM.begin(1, SAMPLE_RATE)) {
    Serial.println("Failed to start PDM!");
    while (true);
  }
}
// Prints every sample the callback has delivered, one integer per line, then
// marks the buffer as consumed.
void loop() {
  if (samplesRead <= 0) {
    return; // nothing new from the microphone yet
  }

  int idx = 0;
  while (idx < samplesRead) {
    Serial.println(sampleBuffer[idx]);
    ++idx;
  }

  samplesRead = 0;
}
The Python side reads the raw PCM integers and writes .wav files. Tools like librosa and soundfile handle the format conversion.
Audio preprocessing (mel spectrogram, MFCC) is substantially more complex than accelerometer preprocessing. Chapter 9 covers it in detail when we build the full inference pipeline for audio.
Common Pitfalls
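Off-by-one in sample counts. If the board sends 99 rows and your preprocessing expects 100, that sample is silently padded with a copy of its last row. A one-row shortfall is harmless, but a large one means the capture was cut short and the sample should be re-recorded. Check len(rows) before padding.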
Serial buffer overflow. At 100 Hz with 3 floats per row, you're sending ~3 KB per second. Python's readline() loop must keep pace. If the buffer fills, readings are lost silently. Throttle the sample rate or increase the baud rate if you see gaps.
Inconsistent gesture execution. If you rush or hesitate before a gesture, the 1-second window captures the tail of the preparation rather than the gesture itself. Be deliberate and consistent when recording.
Next Steps
Continue to 05-model-training.md to design a small neural network and train it on the data you just collected.