Note
Go to the end to download the full example code.
Acoustic Human Pose Estimation Tutorial¶
!pip install pysensing
In this tutorial, we implement code for acoustic human pose estimation.
import torch
import numpy as np
import matplotlib.pyplot as plt
from pysensing.acoustic.datasets.utils.hpe_vis import *
from pysensing.acoustic.models.hpe import Speech2pose, Wipose_LSTM
from pysensing.acoustic.models.get_model import load_hpe_model
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
- Listening Human Behavior: 3D Human Pose Estimation with Acoustic Signals
Implementation of “Listening Human Behavior: 3D Human Pose Estimation with Acoustic Signals”.
This dataset contains audio signals reflected by the human body, from which the 3D human pose is estimated.
Reference: https://github.com/YutoShibata07/AcousticPose_Public
Load the data¶
# Method 1: Use get_dataloader
from pysensing.acoustic.datasets.get_dataloader import *
train_loader, val_loader, test_loader = load_hpe_dataset(
    root='./data',
    dataset_name='pose_regression_timeseries_subject_1',
    download=True)
# Method 2
csv = './data/hpe_dataset/csv/pose_regression_timeseries_subject_1/test.csv' # Path to the test split CSV of the downloaded HPE dataset
data_dir = './data'
hpe_testdataset = SoundPose2DDataset(csv, sound_length=2400, input_feature='logmel',
                                     mean=np.array(get_mean()).astype("float32")[:4],
                                     std=np.array(get_std()).astype("float32")[:4],
                                     )
index = 10  # Select a sample index
sample = hpe_testdataset[index]
print(sample['targets'].shape)
print(sample['sound'].shape)
torch.Size([12, 63])
torch.Size([4, 12, 128])
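Here the targets tensor holds 12 consecutive pose frames of 63 values each (presumably 21 joints × 3 coordinates), and the sound tensor holds the corresponding log-mel features (4 audio channels × 12 time steps × 128 mel bins). A minimal sketch, assuming the 21 × 3 joint layout, that views one frame as per-joint coordinates:
# Sketch: view one pose frame as per-joint 3D coordinates.
# Assumption: the 63 target values are 21 joints x 3 (x, y, z) coordinates.
pose_frames = sample['targets'].reshape(12, 21, 3)
print(pose_frames[0])  # coordinates of every joint in the first frame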
Load Speech2pose model¶
# Method 1
hpe_model = Speech2pose(out_cha=63).to(device)
# model_path = 'path to pretrained weights'
# state_dict = torch.load(model_path,weights_only=True)
# hpe_model.load_state_dict(state_dict)
# Method 2
hpe_model = load_hpe_model('speech2pose',pretrained=True,task='subject8').to(device)
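To get a rough sense of the loaded model's size, you can count its trainable parameters; this is a generic PyTorch sketch, not part of the pysensing API:
# Sketch: number of trainable parameters in the loaded model.
num_params = sum(p.numel() for p in hpe_model.parameters() if p.requires_grad)
print(f"Trainable parameters: {num_params:,}")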
Model Inference¶
#Method 1
sample = hpe_testdataset[index]
hpe_model.eval()
predicted_result = hpe_model(sample['sound'].unsqueeze(0).float().to(device))
vis_images = make_images(sample['targets'].numpy(),predicted_result.cpu().detach().numpy().squeeze(0))
#Method 2
from pysensing.acoustic.inference.predict import *
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
predicted_result = hpe_predict(sample['sound'],'SoundPose2DDataset',hpe_model, device=device)
vis_images = make_images(sample['targets'].numpy(),predicted_result.cpu().detach().numpy().squeeze(0))
seq_num = 0
fig = plt.figure(figsize=(12, 12))
plt.imshow(vis_images[seq_num]['img'])
plt.axis('off')
plt.show()

/home/kemove/anaconda3/envs/il/lib/python3.9/site-packages/torch/nn/modules/conv.py:306: UserWarning: Using padding='same' with even kernel lengths and odd dilation may require a zero-padded copy of the input be created (Triggered internally at ../aten/src/ATen/native/Convolution.cpp:1008.)
return F.conv1d(input, weight, bias, self.stride,
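Besides visualizing the predicted skeleton, you can compare the prediction to the ground truth numerically. A minimal sketch (not part of the pysensing API) that computes the sequence-level RMSE and MAE:
# Sketch: quantify the prediction error against the ground-truth pose.
target = sample['targets'].float().to(device)   # shape (12, 63)
pred = predicted_result.detach().squeeze(0)     # shape (12, 63)
rmse = torch.sqrt(torch.mean((pred - target) ** 2)).item()
mae = torch.mean(torch.abs(pred - target)).item()
print(f"RMSE: {rmse:.3f}  MAE: {mae:.3f}")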
Model Embedding¶
from pysensing.acoustic.inference.embedding import *
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sample_embedding = hpe_embedding(sample['sound'],'SoundPose2DDataset',hpe_model, device=device)
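The embedding can serve as a feature vector for downstream tasks. For example, to check its shape (which depends on the chosen model):
# Inspect the embedding produced by the model.
print(sample_embedding.shape)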
Model Training¶
from pysensing.acoustic.inference.training.AcousticPose_utils.hpe_train import train_model,generate_configs
args = {
    "root_dir": "./data/hpe_dataset/testing_result",
    "save_name": "seq1",
    "input_feature": "logmel",
    "batchsize": 64,
    "max_epoch": 50,
    "csv_path": "./data/hpe_dataset/csv",
    "dataset_name": "pose_regression_timeseries_subject_1",
    "model": "speech2pose",
    "sound_length": 2400,
    "learning_rate": 0.01,
}
config_path = args["root_dir"]+'/'+args["save_name"]+"/config.yaml"
generate_configs(args)
resume_training = False
random_seed = 0
train_model(
    config_path=config_path,
    resume=resume_training,
    seed=random_seed,
)
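generate_configs writes the settings above to config_path, and train_model reads them from there. If you want to double-check the configuration the run will use, a small optional sketch (assuming PyYAML is installed):
# Optional: inspect the generated configuration file (requires PyYAML).
import yaml
with open(config_path) as f:
    print(yaml.safe_load(f))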
Model Testing¶
from pysensing.acoustic.inference.training.AcousticPose_utils.hpe_test import evaluate_model
args = {
    "root_dir": "./data/hpe_dataset/testing_result",
    "save_name": "seq1",
    "batchsize": 64,
    "max_epoch": 20,
    "csv_path": "./data/hpe_dataset/csv",
    "dataset_name": "pose_regression_timeseries_subject_1",
    "model": "speech2pose",
    "sound_length": 2400,
    "learning_rate": 0.01,
}
config_path = args["root_dir"]+'/'+args["save_name"]+"/config.yaml"
evaluation_mode = "test"
model_path = None
evaluate_model(
    config_path=config_path,
    mode=evaluation_mode,
    model_path=model_path)
{'csv_path': './data/hpe_dataset/csv', 'model': 'speech2pose', 'pretrained': True, 'use_class_weight': True, 'batch_size': 32, 'width': 224, 'height': 224, 'num_workers': 8, 'max_epoch': 50, 'learning_rate': 0.01, 'sound_length': 2400, 'dataset_name': 'pose_regression_timeseries_subject_1', 'input_feature': 'logmel', 'topk': (1, 3), 'smooth_loss': False, 'ratio': 0.0, 'gan': 'none', 'finetune': 0, 'aug': 'none'}
Finished making configuration files.
/home/kemove/anaconda3/envs/il/lib/python3.9/site-packages/pysensing/acoustic/inference/training/AcousticPose_utils/libs/dataset.py:195: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
self.df[self.df["sound_length"] == sound_length]
epoch: 0 epoch time[sec]: 4 train loss: 38.0937 val loss: 163.1282 val_rmse: 12.77217 val_mae: 8.43311
epoch: 1 epoch time[sec]: 1 train loss: 3.3102 val loss: 50.4458 val_rmse: 7.10252 val_mae: 4.40765
epoch: 2 epoch time[sec]: 1 train loss: 1.5292 val loss: 1.8193 val_rmse: 1.34880 val_mae: 0.82698
epoch: 3 epoch time[sec]: 1 train loss: 1.1860 val loss: 1.4522 val_rmse: 1.20507 val_mae: 0.72901
epoch: 4 epoch time[sec]: 1 train loss: 1.0476 val loss: 1.2415 val_rmse: 1.11424 val_mae: 0.64408
epoch: 5 epoch time[sec]: 1 train loss: 0.8655 val loss: 1.5326 val_rmse: 1.23800 val_mae: 0.72606
epoch: 6 epoch time[sec]: 1 train loss: 0.8991 val loss: 1.7280 val_rmse: 1.31454 val_mae: 0.75788
epoch: 7 epoch time[sec]: 1 train loss: 0.7731 val loss: 1.4055 val_rmse: 1.18552 val_mae: 0.67471
epoch: 8 epoch time[sec]: 1 train loss: 1.0249 val loss: 33.7593 val_rmse: 5.81028 val_mae: 3.50439
epoch: 9 epoch time[sec]: 1 train loss: 0.8726 val loss: 3.8771 val_rmse: 1.96904 val_mae: 1.12149
epoch: 10 epoch time[sec]: 1 train loss: 0.7102 val loss: 1.1204 val_rmse: 1.05849 val_mae: 0.56559
epoch: 11 epoch time[sec]: 1 train loss: 0.5611 val loss: 1.5317 val_rmse: 1.23764 val_mae: 0.69886
epoch: 12 epoch time[sec]: 1 train loss: 0.5923 val loss: 3.0224 val_rmse: 1.73850 val_mae: 1.06073
epoch: 13 epoch time[sec]: 1 train loss: 0.5095 val loss: 3.3477 val_rmse: 1.82968 val_mae: 1.13779
epoch: 14 epoch time[sec]: 1 train loss: 0.6626 val loss: 2.4439 val_rmse: 1.56328 val_mae: 0.91174
epoch: 15 epoch time[sec]: 1 train loss: 0.5039 val loss: 1.3499 val_rmse: 1.16187 val_mae: 0.62304
epoch: 16 epoch time[sec]: 1 train loss: 0.4427 val loss: 0.8582 val_rmse: 0.92637 val_mae: 0.49949
epoch: 17 epoch time[sec]: 1 train loss: 0.4141 val loss: 0.7087 val_rmse: 0.84182 val_mae: 0.44726
epoch: 18 epoch time[sec]: 1 train loss: 0.4481 val loss: 1.9027 val_rmse: 1.37937 val_mae: 0.81386
epoch: 19 epoch time[sec]: 1 train loss: 0.4947 val loss: 0.7230 val_rmse: 0.85029 val_mae: 0.46919
epoch: 20 epoch time[sec]: 1 train loss: 0.4163 val loss: 0.7290 val_rmse: 0.85384 val_mae: 0.46518
epoch: 21 epoch time[sec]: 1 train loss: 0.3556 val loss: 0.4851 val_rmse: 0.69652 val_mae: 0.38136
epoch: 22 epoch time[sec]: 1 train loss: 0.3800 val loss: 0.5969 val_rmse: 0.77261 val_mae: 0.41563
epoch: 23 epoch time[sec]: 1 train loss: 0.3724 val loss: 0.6957 val_rmse: 0.83409 val_mae: 0.47226
epoch: 24 epoch time[sec]: 1 train loss: 0.3783 val loss: 0.7569 val_rmse: 0.87001 val_mae: 0.49752
epoch: 25 epoch time[sec]: 1 train loss: 0.4418 val loss: 1.0745 val_rmse: 1.03658 val_mae: 0.56204
epoch: 26 epoch time[sec]: 1 train loss: 0.3735 val loss: 1.5351 val_rmse: 1.23899 val_mae: 0.71568
epoch: 27 epoch time[sec]: 1 train loss: 0.4474 val loss: 1.1726 val_rmse: 1.08285 val_mae: 0.59331
epoch: 28 epoch time[sec]: 1 train loss: 0.4119 val loss: 2.5204 val_rmse: 1.58757 val_mae: 0.93299
epoch: 29 epoch time[sec]: 1 train loss: 0.5752 val loss: 0.9867 val_rmse: 0.99332 val_mae: 0.55090
epoch: 30 epoch time[sec]: 1 train loss: 0.3881 val loss: 0.9181 val_rmse: 0.95816 val_mae: 0.57685
epoch: 31 epoch time[sec]: 1 train loss: 0.3227 val loss: 1.1632 val_rmse: 1.07851 val_mae: 0.59697
epoch: 32 epoch time[sec]: 1 train loss: 0.4059 val loss: 0.7456 val_rmse: 0.86350 val_mae: 0.51384
epoch: 33 epoch time[sec]: 1 train loss: 0.3700 val loss: 0.4349 val_rmse: 0.65946 val_mae: 0.36350
epoch: 34 epoch time[sec]: 1 train loss: 0.2653 val loss: 0.5030 val_rmse: 0.70920 val_mae: 0.42115
epoch: 35 epoch time[sec]: 1 train loss: 0.2663 val loss: 0.5556 val_rmse: 0.74541 val_mae: 0.39126
epoch: 36 epoch time[sec]: 1 train loss: 0.2956 val loss: 0.4150 val_rmse: 0.64422 val_mae: 0.37835
epoch: 37 epoch time[sec]: 1 train loss: 0.2606 val loss: 0.3849 val_rmse: 0.62038 val_mae: 0.36641
epoch: 38 epoch time[sec]: 1 train loss: 0.2569 val loss: 0.4467 val_rmse: 0.66835 val_mae: 0.35664
epoch: 39 epoch time[sec]: 1 train loss: 0.2712 val loss: 0.6237 val_rmse: 0.78972 val_mae: 0.40919
epoch: 40 epoch time[sec]: 1 train loss: 0.2585 val loss: 0.4085 val_rmse: 0.63916 val_mae: 0.37400
epoch: 41 epoch time[sec]: 1 train loss: 0.2806 val loss: 0.5592 val_rmse: 0.74777 val_mae: 0.42447
epoch: 42 epoch time[sec]: 1 train loss: 0.2215 val loss: 0.3250 val_rmse: 0.57006 val_mae: 0.30865
epoch: 43 epoch time[sec]: 1 train loss: 0.2627 val loss: 0.2415 val_rmse: 0.49140 val_mae: 0.26592
epoch: 44 epoch time[sec]: 1 train loss: 0.2536 val loss: 0.3004 val_rmse: 0.54807 val_mae: 0.30688
epoch: 45 epoch time[sec]: 1 train loss: 0.2531 val loss: 0.2808 val_rmse: 0.52989 val_mae: 0.30597
epoch: 46 epoch time[sec]: 1 train loss: 0.2785 val loss: 0.8624 val_rmse: 0.92864 val_mae: 0.49453
epoch: 47 epoch time[sec]: 1 train loss: 0.2600 val loss: 0.6742 val_rmse: 0.82110 val_mae: 0.47749
epoch: 48 epoch time[sec]: 1 train loss: 0.3157 val loss: 0.3889 val_rmse: 0.62361 val_mae: 0.34720
epoch: 49 epoch time[sec]: 1 train loss: 0.2532 val loss: 0.4986 val_rmse: 0.70615 val_mae: 0.38780
/home/kemove/anaconda3/envs/il/lib/python3.9/site-packages/pysensing/acoustic/inference/training/AcousticPose_utils/libs/dataset.py:195: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
self.df[self.df["sound_length"] == sound_length]
---------- Start evaluation for test data ----------
loss: 0.24147 RMSE: 0.49 MAE: 0.27 Acc: 0.86, 0.96, 0.98, 0.99
arm RMSE: 0.64 arm MAE: 0.38 Acc: 0.79, 0.95, 0.98, 0.99
leg RMSE: 0.40 leg MAE: 0.22 Acc: 0.93, 0.98, 0.99, 0.99
body RMSE: 0.33 body MAE: 0.16 Acc: 0.94, 0.99, 1.00, 1.00
Done.
Load the Wipose_LSTM model¶
# Method 1
hpe_model = Wipose_LSTM(in_cha=4, out_cha=63).to(device)
# model_path = 'path to trained model'
# state_dict = torch.load(model_path, weights_only=True)
# hpe_model.load_state_dict(state_dict)
# Method 2
hpe_model = load_hpe_model('wipose', pretrained=True, task='subject8').to(device)
/home/kemove/anaconda3/envs/il/lib/python3.9/site-packages/torch/nn/modules/rnn.py:82: UserWarning: dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.1 and num_layers=1
warnings.warn("dropout option adds dropout after all but last "
Load the data¶
csv = './data/hpe_dataset/csv/pose_regression_timeseries_subject_8/test.csv' # Path to the test split CSV of the downloaded HPE dataset
hpe_testdataset = SoundPoseLSTMDataset(csv, sound_length=2400, input_feature='raw',
                                       mean=np.array(get_raw_mean()).astype("float32"),
                                       std=np.array(get_raw_std()).astype("float32"))
index = 0  # Select a sample index
sample = hpe_testdataset[index]
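As a quick sanity check, you can print the tensor shapes as before; with input_feature='raw' the sound tensor holds the (normalized) raw waveform rather than log-mel features, so its shape differs from the earlier log-mel example.
# Sanity check: raw-waveform input differs in shape from the log-mel features above.
print(sample['targets'].shape)
print(sample['sound'].shape)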
Model Inference¶
# Method 1
hpe_model.eval()
predicted_result = hpe_model(sample['sound'].unsqueeze(0).float().to(device))
vis_images = make_images(sample['targets'].numpy(), predicted_result.cpu().detach().numpy().squeeze(0))
#Method 2
from pysensing.acoustic.inference.predict import *
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
predicted_result = hpe_predict(sample['sound'],'SoundPoseLSTMDataset',hpe_model, device=device)
vis_images = make_images(sample['targets'].numpy(),predicted_result.cpu().detach().numpy().squeeze(0))
seq_num = 2
fig = plt.figure(figsize=(12, 12))
plt.imshow(vis_images[seq_num]['img'])
plt.axis('off')
plt.show()

Model Embedding¶
from pysensing.acoustic.inference.embedding import *
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sample_embedding = hpe_embedding(sample['sound'],'SoundPoseLSTMDataset',hpe_model, device=device)
And that’s it. We’re done with our acoustic human pose estimation tutorial. Thanks for reading.
Total running time of the script: (3 minutes 10.114 seconds)