Source code for torchaudio.prototype.functional._rir
import math
from typing import Optional, Tuple, Union
import torch
import torchaudio
from torch import Tensor
def _compute_image_sources(
room: torch.Tensor,
source: torch.Tensor,
max_order: int,
absorption: torch.Tensor,
scatter: Optional[torch.Tensor] = None,
) -> Tuple[Tensor, Tensor]:
"""Compute image sources in a shoebox-like room.
Args:
        room (torch.Tensor): The size of the room. 1D Tensor with shape `(D,)`,
            where ``D`` is 2 if the room is 2D, or 3 if the room is 3D.
        source (torch.Tensor): The coordinate of the sound source. Tensor with dimensions
            `(D,)`.
max_order (int): The maximum number of reflections of the source.
absorption (torch.Tensor): The absorption coefficients of wall materials.
``absorption`` is a Tensor with dimensions `(num_band, num_wall)`.
The shape options are ``[(1, 4), (1, 6), (7, 4), (7, 6)]``.
            ``num_band`` is `1` if the coefficients are the same for all frequencies, or `7`
            if the coefficients differ across frequencies; `7` is the default number
            of octave bands. (See the note in the `simulate_rir_ism` method.)
``num_wall`` is `4` if the room is a 2D room, representing absorption coefficients
of ``"west"``, ``"east"``, ``"south"``, and ``"north"`` walls, respectively.
Or it is `6` if the room is a 3D room, representing absorption coefficients
of ``"west"``, ``"east"``, ``"south"``, ``"north"``, ``"floor"``, and ``"ceiling"``, respectively.
        scatter (torch.Tensor or None, optional): The scattering coefficients of wall materials.
The shape of ``scatter`` must match that of ``absorption``. If ``None``, it is not
used in image source computation. (Default: ``None``)
Returns:
(torch.Tensor): The coordinates of all image sources within ``max_order`` number of reflections.
Tensor with dimensions `(num_image_source, D)`.
(torch.Tensor): The attenuation of corresponding image sources. Tensor with dimensions
`(num_band, num_image_source)`.
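
    Example:
        A minimal sketch of the expected shapes (the room geometry and the
        ``0.1`` coefficient are illustrative values, not defaults):

        >>> room = torch.tensor([4.0, 5.0, 3.0])
        >>> source = torch.tensor([1.0, 2.0, 1.5])
        >>> absorption = torch.full((1, 6), 0.1)
        >>> img_loc, att = _compute_image_sources(room, source, max_order=2, absorption=absorption)
        >>> img_loc.shape, att.shape
        (torch.Size([25, 3]), torch.Size([1, 25]))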
"""
if scatter is None:
tr = torch.sqrt(1 - absorption)
else:
tr = torch.sqrt(1 - absorption) * torch.sqrt(1 - scatter)
ind = torch.arange(-max_order, max_order + 1, device=source.device)
if room.shape[0] == 2:
XYZ = torch.meshgrid(ind, ind, indexing="ij")
else:
XYZ = torch.meshgrid(ind, ind, ind, indexing="ij")
XYZ = torch.stack([c.reshape((-1,)) for c in XYZ], dim=-1)
XYZ = XYZ[XYZ.abs().sum(dim=-1) <= max_order]
# compute locations of image sources
d = room[None, :]
s = source[None, :]
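    # For a reflection index n along an axis: if n is even, the image keeps the
    # source offset (d * n + s); if n is odd, the source is mirrored against the
    # far wall (d * (n + 1) - s).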
img_loc = torch.where(XYZ % 2 == 1, d * (XYZ + 1) - s, d * XYZ + s)
# attenuation
    exp_lo = torch.abs(torch.floor(XYZ / 2))
    exp_hi = torch.abs(torch.floor((XYZ + 1) / 2))
    t_lo = tr[:, ::2].unsqueeze(1).repeat(1, XYZ.shape[0], 1)  # (num_band, num_image_source, D), left walls
    t_hi = tr[:, 1::2].unsqueeze(1).repeat(1, XYZ.shape[0], 1)  # (num_band, num_image_source, D), right walls
att = torch.prod((t_lo**exp_lo) * (t_hi**exp_hi), dim=-1) # (num_band, num_image_source)
return img_loc, att
def _hann(x: torch.Tensor, T: int):
"""Compute the Hann window where the values are truncated based on window length.
torch.hann_window can only sample window function at integer points, the method is to sample
continuous window function at non-integer points.
Args:
        x (torch.Tensor): The fractional component of the time delay Tensor.
        T (int): The window length of the sinc function.
Returns:
        (torch.Tensor): The Hann window Tensor where values outside
            the sinc window (`T`) are set to zero.
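
    Example:
        A small illustration of sampling the continuous window at fractional
        points (printed values are rounded):

        >>> x = torch.tensor([-3.0, -1.5, 0.0, 1.5, 3.0])
        >>> _hann(x, T=4)
        tensor([0.0000, 0.1464, 1.0000, 0.1464, 0.0000])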
"""
y = torch.where(
torch.abs(x) <= T / 2,
0.5 * (1 + torch.cos(2 * math.pi * x / T)),
x.new_zeros(1),
)
return y
def _frac_delay(delay: torch.Tensor, delay_i: torch.Tensor, delay_filter_length: int):
"""Compute fractional delay of impulse response signal.
Args:
delay (torch.Tensor): The time delay Tensor in samples.
delay_i (torch.Tensor): The integer part of delay.
        delay_filter_length (int): The window length for the sinc function.
Returns:
(torch.Tensor): The impulse response Tensor for all image sources.
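
    Example:
        A minimal sketch of the output shape: one windowed-sinc kernel of
        ``delay_filter_length`` taps per delay value:

        >>> delay = torch.tensor([12.7, 30.2])
        >>> delay_i = torch.ceil(delay)
        >>> _frac_delay(delay, delay_i, delay_filter_length=81).shape
        torch.Size([2, 81])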
"""
if delay_filter_length % 2 != 1:
raise ValueError("The filter length must be odd")
pad = delay_filter_length // 2
n = torch.arange(-pad, pad + 1, device=delay.device) + delay_i[..., None]
delay = delay[..., None]
return torch.special.sinc(n - delay) * _hann(n - delay, 2 * pad)
def _adjust_coeff(coeffs: Union[float, torch.Tensor], name: str) -> torch.Tensor:
"""Validates and converts absorption or scattering parameters to a tensor with appropriate shape
Args:
coeff (float or torch.Tensor): The absorption coefficients of wall materials.
If the dtype is ``float``, the absorption coefficient is identical for all walls and
all frequencies.
If ``absorption`` is a 1D Tensor, the shape must be `(2*dim,)`,
where the values represent absorption coefficients of ``"west"``, ``"east"``,
``"south"``, ``"north"``, ``"floor"``, and ``"ceiling"``, respectively.
If ``absorption`` is a 2D Tensor, the shape must be `(7, 2*dim)`,
where 7 represents the number of octave bands.
Returns:
(torch.Tensor): The expanded coefficient.
The shape is `(1, 6)` for single octave band case, and
`(7, 6)` for multi octave band case.
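
    Example:
        A sketch of the three accepted input forms:

        >>> _adjust_coeff(0.1, "absorption").shape  # one value for all walls and bands
        torch.Size([1, 6])
        >>> _adjust_coeff(torch.rand(6), "absorption").shape  # one value per wall
        torch.Size([1, 6])
        >>> _adjust_coeff(torch.rand(7, 6), "absorption").shape  # per band and per wall
        torch.Size([7, 6])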
"""
num_walls = 6
if isinstance(coeffs, float):
if coeffs < 0:
raise ValueError(f"`{name}` must be non-negative. Found: {coeffs}")
return torch.full((1, num_walls), coeffs)
if isinstance(coeffs, Tensor):
if torch.any(coeffs < 0):
raise ValueError(f"`{name}` must be non-negative. Found: {coeffs}")
if coeffs.ndim == 1:
if coeffs.numel() != num_walls:
raise ValueError(
f"The shape of `{name}` must be ({num_walls},) when it is a 1D Tensor. "
f"Found the shape {coeffs.shape}."
)
return coeffs.unsqueeze(0)
if coeffs.ndim == 2:
if coeffs.shape[1] != num_walls:
raise ValueError(
f"The shape of `{name}` must be (NUM_BANDS, {num_walls}) when it "
f"is a 2D Tensor. Found: {coeffs.shape}."
)
return coeffs
raise TypeError(f"`{name}` must be float or Tensor.")
def _validate_inputs(
room: torch.Tensor,
source: torch.Tensor,
mic_array: torch.Tensor,
):
"""Validate dimensions of input arguments, and normalize different kinds of absorption into the same dimension.
Args:
room (torch.Tensor): The size of the room. width, length (and height)
source (torch.Tensor): Sound source coordinates. Tensor with dimensions `(dim,)`.
mic_array (torch.Tensor): Microphone coordinates. Tensor with dimensions `(channel, dim)`.
"""
if not (room.ndim == 1 and room.numel() == 3):
raise ValueError(f"`room` must be a 1D Tensor with 3 elements. Found {room.shape}.")
if not (source.ndim == 1 and source.numel() == 3):
raise ValueError(f"`source` must be 1D Tensor with 3 elements. Found {source.shape}.")
if not (mic_array.ndim == 2 and mic_array.shape[1] == 3):
raise ValueError(f"`mic_array` must be a 2D Tensor with shape (num_channels, 3). Found {mic_array.shape}.")
def simulate_rir_ism(
room: torch.Tensor,
source: torch.Tensor,
mic_array: torch.Tensor,
max_order: int,
absorption: Union[float, torch.Tensor],
output_length: Optional[int] = None,
delay_filter_length: int = 81,
center_frequency: Optional[torch.Tensor] = None,
sound_speed: float = 343.0,
sample_rate: float = 16000.0,
) -> Tensor:
r"""Compute Room Impulse Response (RIR) based on the *image source method* :cite:`allen1979image`.
The implementation is based on *pyroomacoustics* :cite:`scheibler2018pyroomacoustics`.
.. devices:: CPU
.. properties:: TorchScript
Args:
        room (torch.Tensor): The size of the room. The shape of `room` must be `(3,)`,
            representing the width, length, and height of the room.
source (torch.Tensor): Sound source coordinates. Tensor with dimensions `(3,)`.
mic_array (torch.Tensor): Microphone coordinates. Tensor with dimensions `(channel, 3)`.
max_order (int): The maximum number of reflections of the source.
absorption (float or torch.Tensor): The *absorption* :cite:`wiki:Absorption_(acoustics)`
coefficients of wall materials for sound energy.
If the dtype is ``float``, the absorption coefficient is identical for all walls and
all frequencies.
If ``absorption`` is a 1D Tensor, the shape must be `(6,)`, where the values represent
absorption coefficients of ``"west"``, ``"east"``, ``"south"``, ``"north"``, ``"floor"``,
and ``"ceiling"``, respectively.
If ``absorption`` is a 2D Tensor, the shape must be `(7, 6)`, where 7 represents the number of octave bands.
output_length (int or None, optional): The output length of simulated RIR signal. If ``None``,
the length is defined as
.. math::
\frac{\text{max\_d} \cdot \text{sample\_rate}}{\text{sound\_speed}} + \text{delay\_filter\_length}
where ``max_d`` is the maximum distance between image sources and microphones.
        delay_filter_length (int, optional): The filter length for computing the sinc function. (Default: ``81``)
center_frequency (torch.Tensor, optional): The center frequencies of octave bands for multi-band walls.
Only used when ``absorption`` is a 2D Tensor.
sound_speed (float, optional): The speed of sound. (Default: ``343.0``)
sample_rate (float, optional): The sample rate of the generated room impulse response signal.
(Default: ``16000.0``)
Returns:
(torch.Tensor): The simulated room impulse response waveform. Tensor with dimensions
`(channel, rir_length)`.
Note:
If ``absorption`` is a 2D Tensor and ``center_frequency`` is set to ``None``, the center frequencies
of octave bands are fixed to ``[125.0, 250.0, 500.0, 1000.0, 2000.0, 4000.0, 8000.0]``.
Users need to tune the values of ``absorption`` to the corresponding frequencies.
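
    Example:
        A minimal sketch of the expected usage; the room size, coordinates,
        and absorption value are illustrative:

        >>> room = torch.tensor([9.0, 7.0, 3.0])  # width, length, height (meters)
        >>> source = torch.tensor([1.0, 3.0, 1.5])
        >>> mic_array = torch.tensor([[5.0, 3.0, 1.5], [5.5, 3.0, 1.5]])  # two channels
        >>> rir = simulate_rir_ism(room, source, mic_array, max_order=3, absorption=0.3)
        >>> rir.shape[0]  # one RIR per microphone channel
        2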
"""
_validate_inputs(room, source, mic_array)
absorption = _adjust_coeff(absorption, "absorption")
img_location, att = _compute_image_sources(room, source, max_order, absorption)
# compute distances between image sources and microphones
vec = img_location[:, None, :] - mic_array[None, :, :]
dist = torch.linalg.norm(vec, dim=-1) # (image_source, channel)
img_src_att = att[..., None] / dist[None, ...] # (band, image_source, channel)
# separate delays in integer / frac part
delay = dist * sample_rate / sound_speed # distance to delay in samples
    delay_i = torch.ceil(delay)  # integer part (rounded up)
    # compute the short IRs corresponding to each image source
irs = img_src_att[..., None] * _frac_delay(delay, delay_i, delay_filter_length)[None, ...]
rir_length = int(delay_i.max() + irs.shape[-1])
rir = torch.ops.torchaudio._simulate_rir(irs, delay_i.type(torch.int32), rir_length)
# multi-band processing
if absorption.shape[0] > 1:
if center_frequency is None:
center = torch.tensor(
[125.0, 250.0, 500.0, 1000.0, 2000.0, 4000.0, 8000.0], dtype=room.dtype, device=room.device
)
else:
center = center_frequency
# n_fft is set to 512 by default.
filters = torch.ops.torchaudio._make_rir_filter(center, sample_rate, n_fft=512)
rir = torchaudio.functional.fftconvolve(rir, filters.unsqueeze(1).repeat(1, rir.shape[1], 1), mode="same")
    # sum the per-band rir signals into one waveform.
rir = rir.sum(0)
if output_length is not None:
if output_length > rir.shape[-1]:
rir = torch.nn.functional.pad(rir, (0, output_length - rir.shape[-1]), "constant", 0.0)
else:
rir = rir[..., :output_length]
return rir
def ray_tracing(
room: torch.Tensor,
source: torch.Tensor,
mic_array: torch.Tensor,
num_rays: int,
absorption: Union[float, torch.Tensor] = 0.0,
scattering: Union[float, torch.Tensor] = 0.0,
mic_radius: float = 0.5,
sound_speed: float = 343.0,
energy_thres: float = 1e-7,
time_thres: float = 10.0,
hist_bin_size: float = 0.004,
) -> torch.Tensor:
r"""Compute energy histogram via ray tracing.
The implementation is based on *pyroomacoustics* :cite:`scheibler2018pyroomacoustics`.
    ``num_rays`` rays are cast uniformly in all directions from the source;
when a ray intersects a wall, it is reflected and part of its energy is absorbed.
It is also scattered (sent directly to the microphone(s)) according to the ``scattering``
coefficient.
    When a ray comes close to a microphone, its current energy is recorded in the output
    histogram for the corresponding time slot.
.. devices:: CPU
.. properties:: TorchScript
Args:
        room (torch.Tensor): The size of the room. The shape of `room` must be `(3,)`,
            representing the width, length, and height of the room.
source (torch.Tensor): Sound source coordinates. Tensor with dimensions `(3,)`.
        mic_array (torch.Tensor): Microphone coordinates. Tensor with dimensions `(channel, 3)`.
        num_rays (int): The number of rays cast from the source.
        absorption (float or torch.Tensor, optional): The absorption coefficients of wall materials.
            (Default: ``0.0``).
            If the dtype is ``float``, the absorption coefficient is identical for all walls and
            all frequencies.
If ``absorption`` is a 1D Tensor, the shape must be `(6,)`, representing absorption
coefficients of ``"west"``, ``"east"``, ``"south"``, ``"north"``, ``"floor"``, and
``"ceiling"``, respectively.
If ``absorption`` is a 2D Tensor, the shape must be `(num_bands, 6)`.
``num_bands`` is the number of frequency bands (usually 7).
        scattering (float or torch.Tensor, optional): The scattering coefficients of wall materials. (Default: ``0.0``)
            The shape and type of this parameter are the same as for ``absorption``.
        mic_radius (float, optional): The radius of the microphone in meters. (Default: ``0.5``)
sound_speed (float, optional): The speed of sound in meters per second. (Default: ``343.0``)
energy_thres (float, optional): The energy level below which we stop tracing a ray. (Default: ``1e-7``)
The initial energy of each ray is ``2 / num_rays``.
        time_thres (float, optional): The maximal duration for which rays are traced. (Unit: seconds) (Default: ``10.0``)
        hist_bin_size (float, optional): The size of each bin in the output histogram. (Unit: seconds) (Default: ``0.004``)
Returns:
(torch.Tensor): The 3D histogram(s) where the energy of the traced ray is recorded.
Each bin corresponds to a given time slot.
The shape is `(channel, num_bands, num_bins)`, where
``num_bins = ceil(time_thres / hist_bin_size)``.
If both ``absorption`` and ``scattering`` are floats, then ``num_bands == 1``.
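
    Example:
        A minimal sketch of the expected usage; the values are illustrative.
        With the default ``time_thres`` and ``hist_bin_size``, the histogram has
        ``ceil(10.0 / 0.004) = 2500`` bins:

        >>> room = torch.tensor([9.0, 7.0, 3.0])
        >>> source = torch.tensor([1.0, 3.0, 1.5])
        >>> mic_array = torch.tensor([[5.0, 3.0, 1.5]])
        >>> hist = ray_tracing(room, source, mic_array, num_rays=10000, absorption=0.1, scattering=0.1)
        >>> hist.shape  # (channel, num_bands, num_bins)
        torch.Size([1, 1, 2500])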
"""
if time_thres < hist_bin_size:
raise ValueError(
"`time_thres` must be greater than `hist_bin_size`. "
f"Found: hist_bin_size={hist_bin_size}, time_thres={time_thres}."
)
if room.dtype != source.dtype or source.dtype != mic_array.dtype:
raise ValueError(
"dtype of `room`, `source` and `mic_array` must match. "
f"Found: `room` ({room.dtype}), `source` ({source.dtype}) and "
f"`mic_array` ({mic_array.dtype})"
)
_validate_inputs(room, source, mic_array)
absorption = _adjust_coeff(absorption, "absorption").to(room.dtype)
scattering = _adjust_coeff(scattering, "scattering").to(room.dtype)
# Bring absorption and scattering to the same shape
if absorption.shape[0] == 1 and scattering.shape[0] > 1:
absorption = absorption.expand(scattering.shape)
if scattering.shape[0] == 1 and absorption.shape[0] > 1:
scattering = scattering.expand(absorption.shape)
if absorption.shape != scattering.shape:
raise ValueError(
"`absorption` and `scattering` must be broadcastable to the same number of bands and walls. "
f"Inferred shapes absorption={absorption.shape} and scattering={scattering.shape}"
)
histograms = torch.ops.torchaudio.ray_tracing(
room,
source,
mic_array,
num_rays,
absorption,
scattering,
mic_radius,
sound_speed,
energy_thres,
time_thres,
hist_bin_size,
)
return histograms