Shortcuts

Source code for torchaudio.prototype.functional._rir

import math
from typing import Optional, Tuple, Union

import torch
import torchaudio
from torch import Tensor


def _compute_image_sources(
    room: torch.Tensor,
    source: torch.Tensor,
    max_order: int,
    absorption: torch.Tensor,
    scatter: Optional[torch.Tensor] = None,
) -> Tuple[Tensor, Tensor]:
    """Compute image sources in a shoebox-like room.

    Args:
        room (torch.Tensor): The 1D Tensor to determine the room size. The shape is
            `(D,)`, where ``D`` is 2 if room is a 2D room, or 3 if room is a 3D room.
        source (torch.Tensor): The coordinate of the sound source. Tensor with dimensions
            `(D)`.
        max_order (int): The maximum number of reflections of the source.
        absorption (torch.Tensor): The absorption coefficients of wall materials.
            ``absorption`` is a Tensor with dimensions `(num_band, num_wall)`.
            The shape options are ``[(1, 4), (1, 6), (7, 4), (7, 6)]``.
            ``num_band`` is `1` if the coefficients is the same for all frequencies, or is `7`
            if the coefficients are different to different frequencies. `7` refers to the default number
            of octave bands. (See note in `simulate_rir_ism` method).
            ``num_wall`` is `4` if the room is a 2D room, representing absorption coefficients
            of ``"west"``, ``"east"``, ``"south"``, and ``"north"`` walls, respectively.
            Or it is `6` if the room is a 3D room, representing absorption coefficients
            of ``"west"``, ``"east"``, ``"south"``, ``"north"``, ``"floor"``, and ``"ceiling"``, respectively.
        scatter (torch.Tensor): The scattering coefficients of wall materials.
            The shape of ``scatter`` must match that of ``absorption``. If ``None``, it is not
            used in image source computation. (Default: ``None``)

    Returns:
        (torch.Tensor): The coordinates of all image sources within ``max_order`` number of reflections.
            Tensor with dimensions `(num_image_source, D)`.
        (torch.Tensor): The attenuation of corresponding image sources. Tensor with dimensions
            `(num_band, num_image_source)`.
    """
    if scatter is None:
        tr = torch.sqrt(1 - absorption)
    else:
        tr = torch.sqrt(1 - absorption) * torch.sqrt(1 - scatter)

    ind = torch.arange(-max_order, max_order + 1, device=source.device)
    if room.shape[0] == 2:
        XYZ = torch.meshgrid(ind, ind, indexing="ij")
    else:
        XYZ = torch.meshgrid(ind, ind, ind, indexing="ij")
    XYZ = torch.stack([c.reshape((-1,)) for c in XYZ], dim=-1)
    XYZ = XYZ[XYZ.abs().sum(dim=-1) <= max_order]

    # compute locations of image sources
    d = room[None, :]
    s = source[None, :]
    img_loc = torch.where(XYZ % 2 == 1, d * (XYZ + 1) - s, d * XYZ + s)

    # attenuation
    exp_lo = abs(torch.floor((XYZ / 2)))
    exp_hi = abs(torch.floor((XYZ + 1) / 2))
    t_lo = tr[:, ::2].unsqueeze(1).repeat(1, XYZ.shape[0], 1)  # (num_band, left walls)
    t_hi = tr[:, 1::2].unsqueeze(1).repeat(1, XYZ.shape[0], 1)  # (num_band, right walls)
    att = torch.prod((t_lo**exp_lo) * (t_hi**exp_hi), dim=-1)  # (num_band, num_image_source)
    return img_loc, att


def _hann(x: torch.Tensor, T: int):
    """Compute the Hann window where the values are truncated based on window length.
    torch.hann_window can only sample window function at integer points, the method is to sample
    continuous window function at non-integer points.

    Args:
        x (torch.Tensor): The fractional component of time delay Tensor.
        T (torch.Tensor): The window length of sinc function.

    Returns:
        (torch.Tensor): The hann window Tensor where values outside
            the sinc window (`T`) is set to zero.
    """
    y = torch.where(
        torch.abs(x) <= T / 2,
        0.5 * (1 + torch.cos(2 * math.pi * x / T)),
        x.new_zeros(1),
    )
    return y


def _frac_delay(delay: torch.Tensor, delay_i: torch.Tensor, delay_filter_length: int):
    """Compute fractional delay of impulse response signal.

    Args:
        delay (torch.Tensor): The time delay Tensor in samples.
        delay_i (torch.Tensor): The integer part of delay.
        delay_filter_length (int): The window length for sinc function.

    Returns:
        (torch.Tensor): The impulse response Tensor for all image sources.
    """
    if delay_filter_length % 2 != 1:
        raise ValueError("The filter length must be odd")

    pad = delay_filter_length // 2
    n = torch.arange(-pad, pad + 1, device=delay.device) + delay_i[..., None]
    delay = delay[..., None]

    return torch.special.sinc(n - delay) * _hann(n - delay, 2 * pad)


def _adjust_coeff(coeffs: Union[float, torch.Tensor], name: str) -> torch.Tensor:
    """Validates and converts absorption or scattering parameters to a tensor with appropriate shape

    Args:
        coeff (float or torch.Tensor): The absorption coefficients of wall materials.

            If the dtype is ``float``, the absorption coefficient is identical for all walls and
            all frequencies.

            If ``absorption`` is a 1D Tensor, the shape must be `(2*dim,)`,
            where the values represent absorption coefficients of ``"west"``, ``"east"``,
            ``"south"``, ``"north"``, ``"floor"``, and ``"ceiling"``, respectively.

            If ``absorption`` is a 2D Tensor, the shape must be `(7, 2*dim)`,
            where 7 represents the number of octave bands.

    Returns:
        (torch.Tensor): The expanded coefficient.
            The shape is `(1, 6)` for single octave band case, and
            `(7, 6)` for multi octave band case.
    """
    num_walls = 6
    if isinstance(coeffs, float):
        if coeffs < 0:
            raise ValueError(f"`{name}` must be non-negative. Found: {coeffs}")
        return torch.full((1, num_walls), coeffs)
    if isinstance(coeffs, Tensor):
        if torch.any(coeffs < 0):
            raise ValueError(f"`{name}` must be non-negative. Found: {coeffs}")
        if coeffs.ndim == 1:
            if coeffs.numel() != num_walls:
                raise ValueError(
                    f"The shape of `{name}` must be ({num_walls},) when it is a 1D Tensor. "
                    f"Found the shape {coeffs.shape}."
                )
            return coeffs.unsqueeze(0)
        if coeffs.ndim == 2:
            if coeffs.shape[1] != num_walls:
                raise ValueError(
                    f"The shape of `{name}` must be (NUM_BANDS, {num_walls}) when it "
                    f"is a 2D Tensor. Found: {coeffs.shape}."
                )
            return coeffs
    raise TypeError(f"`{name}` must be float or Tensor.")


def _validate_inputs(
    room: torch.Tensor,
    source: torch.Tensor,
    mic_array: torch.Tensor,
):
    """Validate dimensions of input arguments, and normalize different kinds of absorption into the same dimension.

    Args:
        room (torch.Tensor): The size of the room. width, length (and height)
        source (torch.Tensor): Sound source coordinates. Tensor with dimensions `(dim,)`.
        mic_array (torch.Tensor): Microphone coordinates. Tensor with dimensions `(channel, dim)`.
    """
    if not (room.ndim == 1 and room.numel() == 3):
        raise ValueError(f"`room` must be a 1D Tensor with 3 elements. Found {room.shape}.")
    if not (source.ndim == 1 and source.numel() == 3):
        raise ValueError(f"`source` must be 1D Tensor with 3 elements. Found {source.shape}.")
    if not (mic_array.ndim == 2 and mic_array.shape[1] == 3):
        raise ValueError(f"`mic_array` must be a 2D Tensor with shape (num_channels, 3). Found {mic_array.shape}.")


def simulate_rir_ism(
    room: torch.Tensor,
    source: torch.Tensor,
    mic_array: torch.Tensor,
    max_order: int,
    absorption: Union[float, torch.Tensor],
    output_length: Optional[int] = None,
    delay_filter_length: int = 81,
    center_frequency: Optional[torch.Tensor] = None,
    sound_speed: float = 343.0,
    sample_rate: float = 16000.0,
) -> Tensor:
    r"""Compute Room Impulse Response (RIR) based on the *image source method* :cite:`allen1979image`.
    The implementation is based on *pyroomacoustics* :cite:`scheibler2018pyroomacoustics`.

    .. devices:: CPU

    .. properties:: TorchScript

    Args:
        room (torch.Tensor): Room coordinates. The shape of `room` must be `(3,)` which represents
            three dimensions of the room.
        source (torch.Tensor): Sound source coordinates. Tensor with dimensions `(3,)`.
        mic_array (torch.Tensor): Microphone coordinates. Tensor with dimensions `(channel, 3)`.
        max_order (int): The maximum number of reflections of the source.
        absorption (float or torch.Tensor): The *absorption* :cite:`wiki:Absorption_(acoustics)`
            coefficients of wall materials for sound energy.
            If the dtype is ``float``, the absorption coefficient is identical for all walls and
            all frequencies.
            If ``absorption`` is a 1D Tensor, the shape must be `(6,)`, where the values represent
            absorption coefficients of ``"west"``, ``"east"``, ``"south"``, ``"north"``, ``"floor"``,
            and ``"ceiling"``, respectively.
            If ``absorption`` is a 2D Tensor, the shape must be `(7, 6)`, where 7 represents the number
            of octave bands.
        output_length (int or None, optional): The output length of simulated RIR signal. If ``None``,
            the length is defined as

            .. math::
                \frac{\text{max\_d} \cdot \text{sample\_rate}}{\text{sound\_speed}} + \text{delay\_filter\_length}

            where ``max_d`` is the maximum distance between image sources and microphones.
            If given, the signal is zero-padded at the end or truncated to this length.
        delay_filter_length (int, optional): The filter length for computing sinc function.
            Must be odd. (Default: ``81``)
        center_frequency (torch.Tensor, optional): The center frequencies of octave bands for
            multi-band walls. Only used when ``absorption`` is a 2D Tensor with more than one band.
        sound_speed (float, optional): The speed of sound. (Default: ``343.0``)
        sample_rate (float, optional): The sample rate of the generated room impulse response signal.
            (Default: ``16000.0``)

    Returns:
        (torch.Tensor): The simulated room impulse response waveform. Tensor with dimensions
        `(channel, rir_length)`.

    Raises:
        ValueError: If ``room``, ``source`` or ``mic_array`` has an unexpected shape, if
            ``absorption`` is negative or has an unexpected shape, or if
            ``delay_filter_length`` is not odd.
        TypeError: If ``absorption`` is neither a float nor a Tensor.

    Note:
        If ``absorption`` is a 2D Tensor and ``center_frequency`` is set to ``None``, the center
        frequencies of octave bands are fixed to ``[125.0, 250.0, 500.0, 1000.0, 2000.0, 4000.0, 8000.0]``.
        Users need to tune the values of ``absorption`` to the corresponding frequencies.
    """
    _validate_inputs(room, source, mic_array)
    absorption = _adjust_coeff(absorption, "absorption")
    img_location, att = _compute_image_sources(room, source, max_order, absorption)

    # compute distances between image sources and microphones
    vec = img_location[:, None, :] - mic_array[None, :, :]
    dist = torch.linalg.norm(vec, dim=-1)  # (image_source, channel)

    # wall attenuation combined with the 1 / distance decay
    img_src_att = att[..., None] / dist[None, ...]  # (band, image_source, channel)

    # separate delays in integer / frac part
    delay = dist * sample_rate / sound_speed  # distance to delay in samples
    delay_i = torch.ceil(delay)  # integer part

    # compute the short IRs corresponding to each image source
    irs = img_src_att[..., None] * _frac_delay(delay, delay_i, delay_filter_length)[None, ...]

    # the longest delay plus one full delay filter bounds the length of the output
    rir_length = int(delay_i.max() + irs.shape[-1])
    rir = torch.ops.torchaudio._simulate_rir(irs, delay_i.type(torch.int32), rir_length)

    # multi-band processing
    if absorption.shape[0] > 1:
        if center_frequency is None:
            center = torch.tensor(
                [125.0, 250.0, 500.0, 1000.0, 2000.0, 4000.0, 8000.0], dtype=room.dtype, device=room.device
            )
        else:
            center = center_frequency
        # n_fft is set to 512 by default.
        filters = torch.ops.torchaudio._make_rir_filter(center, sample_rate, n_fft=512)
        rir = torchaudio.functional.fftconvolve(rir, filters.unsqueeze(1).repeat(1, rir.shape[1], 1), mode="same")

    # sum up rir signals of all image sources into one waveform.
    rir = rir.sum(0)

    if output_length is not None:
        if output_length > rir.shape[-1]:
            rir = torch.nn.functional.pad(rir, (0, output_length - rir.shape[-1]), "constant", 0.0)
        else:
            rir = rir[..., :output_length]

    return rir
def ray_tracing(
    room: torch.Tensor,
    source: torch.Tensor,
    mic_array: torch.Tensor,
    num_rays: int,
    absorption: Union[float, torch.Tensor] = 0.0,
    scattering: Union[float, torch.Tensor] = 0.0,
    mic_radius: float = 0.5,
    sound_speed: float = 343.0,
    energy_thres: float = 1e-7,
    time_thres: float = 10.0,
    hist_bin_size: float = 0.004,
) -> torch.Tensor:
    r"""Compute energy histogram via ray tracing.

    The implementation is based on *pyroomacoustics* :cite:`scheibler2018pyroomacoustics`.

    ``num_rays`` rays are casted uniformly in all directions from the source;
    when a ray intersects a wall, it is reflected and part of its energy is absorbed.
    It is also scattered (sent directly to the microphone(s)) according to the ``scattering``
    coefficient.
    When a ray is close to the microphone, its current energy is recorded in the output
    histogram for that given time slot.

    .. devices:: CPU

    .. properties:: TorchScript

    Args:
        room (torch.Tensor): Room coordinates. The shape of `room` must be `(3,)` which represents
            three dimensions of the room.
        source (torch.Tensor): Sound source coordinates. Tensor with dimensions `(3,)`.
        mic_array (torch.Tensor): Microphone coordinates. Tensor with dimensions `(channel, 3)`.
        num_rays (int): The number of rays casted from the source.
        absorption (float or torch.Tensor, optional): The absorption coefficients of wall materials.
            (Default: ``0.0``).
            If the type is ``float``, the absorption coefficient is identical to all walls and
            all frequencies.
            If ``absorption`` is a 1D Tensor, the shape must be `(6,)`, representing absorption
            coefficients of ``"west"``, ``"east"``, ``"south"``, ``"north"``, ``"floor"``, and
            ``"ceiling"``, respectively.
            If ``absorption`` is a 2D Tensor, the shape must be `(num_bands, 6)`.
            ``num_bands`` is the number of frequency bands (usually 7).
        scattering(float or torch.Tensor, optional): The scattering coefficients of wall materials.
            (Default: ``0.0``)
            The shape and type of this parameter is the same as for ``absorption``.
        mic_radius(float, optional): The radius of the microphone in meters. (Default: 0.5)
        sound_speed (float, optional): The speed of sound in meters per second. (Default: ``343.0``)
        energy_thres (float, optional): The energy level below which we stop tracing a ray.
            (Default: ``1e-7``)
            The initial energy of each ray is ``2 / num_rays``.
        time_thres (float, optional): The maximal duration for which rays are traced. (Unit: seconds)
            (Default: 10.0)
        hist_bin_size (float, optional): The size of each bin in the output histogram. (Unit: seconds)
            (Default: 0.004)

    Returns:
        (torch.Tensor): The 3D histogram(s) where the energy of the traced ray is recorded.
            Each bin corresponds to a given time slot.
            The shape is `(channel, num_bands, num_bins)`, where
            ``num_bins = ceil(time_thres / hist_bin_size)``.
            If both ``absorption`` and ``scattering`` are floats, then ``num_bands == 1``.

    Raises:
        ValueError: If ``time_thres`` is smaller than ``hist_bin_size``, if the dtypes of
            ``room``, ``source`` and ``mic_array`` differ, if any of those Tensors has an
            unexpected shape, if ``absorption`` or ``scattering`` is negative or has an
            unexpected shape, or if the two cannot be brought to the same shape.
        TypeError: If ``absorption`` or ``scattering`` is neither a float nor a Tensor.
    """
    if time_thres < hist_bin_size:
        raise ValueError(
            "`time_thres` must be greater than `hist_bin_size`. "
            f"Found: hist_bin_size={hist_bin_size}, time_thres={time_thres}."
        )

    if room.dtype != source.dtype or source.dtype != mic_array.dtype:
        raise ValueError(
            "dtype of `room`, `source` and `mic_array` must match. "
            f"Found: `room` ({room.dtype}), `source` ({source.dtype}) and "
            f"`mic_array` ({mic_array.dtype})"
        )

    _validate_inputs(room, source, mic_array)
    # the coefficients are cast to the dtype shared by the geometry Tensors
    absorption = _adjust_coeff(absorption, "absorption").to(room.dtype)
    scattering = _adjust_coeff(scattering, "scattering").to(room.dtype)

    # Bring absorption and scattering to the same shape
    if absorption.shape[0] == 1 and scattering.shape[0] > 1:
        absorption = absorption.expand(scattering.shape)
    if scattering.shape[0] == 1 and absorption.shape[0] > 1:
        scattering = scattering.expand(absorption.shape)
    if absorption.shape != scattering.shape:
        raise ValueError(
            "`absorption` and `scattering` must be broadcastable to the same number of bands and walls. "
            f"Inferred shapes absorption={absorption.shape} and scattering={scattering.shape}"
        )

    histograms = torch.ops.torchaudio.ray_tracing(
        room,
        source,
        mic_array,
        num_rays,
        absorption,
        scattering,
        mic_radius,
        sound_speed,
        energy_thres,
        time_thres,
        hist_bin_size,
    )

    return histograms

Docs

Access comprehensive developer documentation for PyTorch

View Docs

Tutorials

Get in-depth tutorials for beginners and advanced developers

View Tutorials

Resources

Find development resources and get your questions answered

View Resources