Started code for DSP final project

This commit is contained in:
Aidan Sharpe
2024-04-25 18:38:09 -04:00
parent 824a46b1fd
commit 50a5e57e18
96 changed files with 2160495 additions and 28 deletions

View File

@@ -0,0 +1,26 @@
function mod_data= DC_block( data, Nsamples)
global Downsample DATAPADDING_MSECS SEARCHBUFFER
ofs= SEARCHBUFFER* Downsample;
mod_data= data;
%compute dc component, it is a little weird
facc= sum( data( ofs+ 1: Nsamples- ofs))/ Nsamples;
mod_data( ofs+ 1: Nsamples- ofs)= data( ofs+ 1: Nsamples- ofs)- facc;
mod_data( ofs+ 1: ofs+ Downsample)= mod_data( ofs+ 1: ofs+ Downsample).* ...
( 0.5+ (0: Downsample- 1))/ Downsample;
mod_data( Nsamples- ofs: -1: Nsamples- ofs-Downsample+ 1)= ...
mod_data( Nsamples- ofs: -1: Nsamples- ofs-Downsample+ 1).* ...
( 0.5+ (0: Downsample- 1))/ Downsample;

View File

@@ -0,0 +1,53 @@
function Y= FFTNXCorr( ref_VAD, startr, nr, deg_VAD, startd, nd)
% this function has other simple implementations, current implementation is
% consistent with the C version
% one way to do this (in time domain) =====
x1= ref_VAD( startr: startr+ nr- 1);
x2= deg_VAD( startd: startd+ nd- 1);
x1= fliplr( x1);
Y= conv( x2, x1);
% done =====
% % the other way to do this (in freq domain)===
% Nx= 2^ (ceil( log2( max( nr, nd))));
% x1= zeros( 1, 2* Nx);
% x2= zeros( 1, 2* Nx);
% x1( 1: nr)= fliplr( ref_VAD( startr: startr+ nr- 1));
% x2( 1: nd)= deg_VAD( startd: startd+ nd- 1);
%
% if (nr== 491)
% fid= fopen( 'mat_debug.txt', 'wt');
% fprintf( fid, '%f\n', x1);
% fclose( fid);
% end
%
% x1_fft= fft( x1, 2* Nx);
% x2_fft= fft( x2, 2* Nx);
%
% tmp1= ifft( x1_fft.* x2_fft, 2* Nx);
%
% Ny= nr+ nd- 1;
% Y= tmp1( 1: Ny);
% % done ===========

View File

@@ -0,0 +1,162 @@
function [VAD, logVAD]= apply_VAD( data, Nsamples)
global Downsample MINSPEECHLGTH JOINSPEECHLGTH
Nwindows= floor( Nsamples/ Downsample);
%number of 4ms window
VAD= zeros( 1, Nwindows);
for count= 1: Nwindows
VAD( count)= sum( data( (count-1)* Downsample+ 1: ...
count* Downsample).^ 2)/ Downsample;
end
%VAD is the power of each 4ms window
LevelThresh = sum( VAD)/ Nwindows;
%LevelThresh is set to mean value of VAD
LevelMin= max( VAD);
if( LevelMin > 0 )
LevelMin= LevelMin* 1.0e-4;
else
LevelMin = 1.0;
end
%fprintf( 1, 'LevelMin is %f\n', LevelMin);
VAD( find( VAD< LevelMin))= LevelMin;
for iteration= 1: 12
LevelNoise= 0;
len= 0;
StDNoise= 0;
VAD_lessthan_LevelThresh= VAD( find( VAD<= LevelThresh));
len= length( VAD_lessthan_LevelThresh);
LevelNoise= sum( VAD_lessthan_LevelThresh);
if (len> 0)
LevelNoise= LevelNoise/ len;
StDNoise= sqrt( sum( ...
(VAD_lessthan_LevelThresh- LevelNoise).^ 2)/ len);
end
LevelThresh= 1.001* (LevelNoise+ 2* StDNoise);
end
%fprintf( 1, 'LevelThresh is %f\n', LevelThresh);
LevelNoise= 0;
LevelSig= 0;
len= 0;
VAD_greaterthan_LevelThresh= VAD( find( VAD> LevelThresh));
len= length( VAD_greaterthan_LevelThresh);
LevelSig= sum( VAD_greaterthan_LevelThresh);
VAD_lessorequal_LevelThresh= VAD( find( VAD<= LevelThresh));
LevelNoise= sum( VAD_lessorequal_LevelThresh);
if (len> 0)
LevelSig= LevelSig/ len;
else
LevelThresh= -1;
end
%fprintf( 1, 'LevelSig is %f\n', LevelSig);
if (len< Nwindows)
LevelNoise= LevelNoise/( Nwindows- len);
else
LevelNoise= 1;
end
%fprintf( 1, 'LevelNoise is %f\n', LevelNoise);
VAD( find( VAD<= LevelThresh))= -VAD( find( VAD<= LevelThresh));
VAD(1)= -LevelMin;
VAD(Nwindows)= -LevelMin;
start= 0;
finish= 0;
for count= 2: Nwindows
if( (VAD(count) > 0.0) && (VAD(count-1) <= 0.0) )
start = count;
end
if( (VAD(count) <= 0.0) && (VAD(count-1) > 0.0) )
finish = count;
if( (finish - start)<= MINSPEECHLGTH )
VAD( start: finish- 1)= -VAD( start: finish- 1);
end
end
end
%to make sure finish- start is more than 4
if( LevelSig >= (LevelNoise* 1000) )
for count= 2: Nwindows
if( (VAD(count)> 0) && (VAD(count-1)<= 0) )
start= count;
end
if( (VAD(count)<= 0) && (VAD(count-1)> 0) )
finish = count;
g = sum( VAD( start: finish- 1));
if( g< 3.0* LevelThresh* (finish - start) )
VAD( start: finish- 1)= -VAD( start: finish- 1);
end
end
end
end
start = 0;
finish = 0;
for count= 2: Nwindows
if( (VAD(count) > 0.0) && (VAD(count-1) <= 0.0) )
start = count;
if( (finish > 0) && ((start - finish) <= JOINSPEECHLGTH) )
VAD( finish: start- 1)= LevelMin;
end
end
if( (VAD(count) <= 0.0) && (VAD(count-1) > 0.0) )
finish = count;
end
end
start= 0;
for count= 2: Nwindows
if( (VAD(count)> 0) && (VAD(count-1)<= 0) )
start= count;
end
end
if( start== 0 )
VAD= abs(VAD);
VAD(1) = -LevelMin;
VAD(Nwindows) = -LevelMin;
end
count = 4;
while( count< (Nwindows-1) )
if( (VAD(count)> 0) && (VAD(count-2) <= 0) )
VAD(count-2)= VAD(count)* 0.1;
VAD(count-1)= VAD(count)* 0.3;
count= count+ 1;
end
if( (VAD(count)<= 0) && (VAD(count-1)> 0) )
VAD(count)= VAD(count-1)* 0.3;
VAD(count+ 1)= VAD(count-1)* 0.1;
count= count+ 3;
end
count= count+ 1;
end
VAD( find( VAD< 0))= 0;
% fid= fopen( 'mat_vad.txt', 'wt');
% fprintf( fid, '%f\n', VAD);
% fclose( fid);
if( LevelThresh<= 0 )
LevelThresh= LevelMin;
end
logVAD( find( VAD<= LevelThresh))= 0;
VAD_greaterthan_LevelThresh= find( VAD> LevelThresh);
logVAD( VAD_greaterthan_LevelThresh)= log( VAD( ...
VAD_greaterthan_LevelThresh)/ LevelThresh);

View File

@@ -0,0 +1,40 @@
function align_filtered= apply_filter( data, data_Nsamples, align_filter_dB)
global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs
align_filtered= data;
n= data_Nsamples- 2* SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000);
% now find the next power of 2 which is greater or equal to n
pow_of_2= 2^ (ceil( log2( n)));
[number_of_points, trivial]= size( align_filter_dB);
overallGainFilter= interp1( align_filter_dB( :, 1), align_filter_dB( :, 2), ...
1000);
x= zeros( 1, pow_of_2);
x( 1: n)= data( SEARCHBUFFER* Downsample+ 1: SEARCHBUFFER* Downsample+ n);
x_fft= fft( x, pow_of_2);
freq_resolution= Fs/ pow_of_2;
factorDb( 1: pow_of_2/2+ 1)= interp1( align_filter_dB( :, 1), ...
align_filter_dB( :, 2), (0: pow_of_2/2)* freq_resolution)- ...
overallGainFilter;
factor= 10.^ (factorDb/ 20);
factor= [factor, fliplr( factor( 2: pow_of_2/2))];
x_fft= x_fft.* factor;
y= ifft( x_fft, pow_of_2);
align_filtered( SEARCHBUFFER* Downsample+ 1: SEARCHBUFFER* Downsample+ n)...
= y( 1: n);
% fid= fopen( 'log_mat.txt', 'wt');
% fprintf( fid, '%f\n', y( 1: n));
% fclose( fid);

View File

@@ -0,0 +1,26 @@
function mod_data= apply_filters( data, Nsamples)
%IIRFilt( InIIR_Hsos, InIIR_Nsos, data, data_Nsamples);
global InIIR_Hsos InIIR_Nsos DATAPADDING_MSECS Fs
% data_Nsamples= Nsamples+ DATAPADDING_MSECS* (Fs/ 1000);
% now we construct the second order section matrix
sosMatrix= zeros( InIIR_Nsos, 6);
sosMatrix( :, 4)= 1; %set a(1) to 1
% each row of sosMatrix holds [b(1*3) a(1*3)] for each section
sosMatrix( :, 1: 3)= InIIR_Hsos( :, 1: 3);
sosMatrix( :, 5: 6)= InIIR_Hsos( :, 4: 5);
%sosMatrix
% now we construct second order section direct form II filter
iirdf2= dfilt.df2sos( sosMatrix);
mod_data= filter( iirdf2, data);

View File

@@ -0,0 +1,200 @@
function cep_mean= comp_cep(cleanFile, enhdFile);
% ----------------------------------------------------------------------
% Cepstrum Distance Objective Speech Quality Measure
%
% This function implements the cepstrum distance measure used
% in [1]
%
% Usage: CEP=comp_cep(cleanFile.wav, enhancedFile.wav)
%
% cleanFile.wav - clean input file in .wav format
% enhancedFile - enhanced output file in .wav format
% CEP - computed cepstrum distance measure
%
% Note that the cepstrum measure is limited in the range [0, 10].
%
% Example call: CEP =comp_cep('sp04.wav','enhanced.wav')
%
%
% References:
%
% [1] Kitawaki, N., Nagabuchi, H., and Itoh, K. (1988). Objective quality
% evaluation for low bit-rate speech coding systems. IEEE J. Select.
% Areas in Comm., 6(2), 262-273.
%
% Author: Philipos C. Loizou
% (LPC routines were written by Bryan Pellom & John Hansen)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
% ----------------------------------------------------------------------
if nargin~=2
fprintf('USAGE: CEP=comp_cep(cleanFile.wav, enhancedFile.wav)\n');
fprintf('For more help, type: help comp_cep\n\n');
return;
end
alpha=0.95;
[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhdFile);
if ( Srate1~= Srate2) | ( Nbits1~= Nbits2)
error( 'The two files do not match!\n');
end
len= min( length( data1), length( data2));
data1= data1( 1: len)+eps;
data2= data2( 1: len)+eps;
IS_dist= cepstrum( data1, data2,Srate1);
IS_len= round( length( IS_dist)* alpha);
IS= sort( IS_dist);
cep_mean= mean( IS( 1: IS_len));
function distortion = cepstrum(clean_speech, processed_speech,sample_rate)
% ----------------------------------------------------------------------
% Check the length of the clean and processed speech. Must be the same.
% ----------------------------------------------------------------------
clean_length = length(clean_speech);
processed_length = length(processed_speech);
if (clean_length ~= processed_length)
disp('Error: Both Speech Files must be same length.');
return
end
% ----------------------------------------------------------------------
% Scale both clean speech and processed speech to have same dynamic
% range. Also remove DC component from each signal
% ----------------------------------------------------------------------
%clean_speech = clean_speech - mean(clean_speech);
%processed_speech = processed_speech - mean(processed_speech);
%processed_speech = processed_speech.*(max(abs(clean_speech))/ max(abs(processed_speech)));
% ----------------------------------------------------------------------
% Global Variables
% ----------------------------------------------------------------------
winlength = round(30*sample_rate/1000); %240; % window length in samples
skiprate = floor(winlength/4); % window skip in samples
if sample_rate<10000
P = 10; % LPC Analysis Order
else
P=16; % this could vary depending on sampling frequency.
end
C=10*sqrt(2)/log(10);
% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Itakura-Saito Measure
% ----------------------------------------------------------------------
num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames
start = 1; % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));
for frame_count = 1:num_frames
% ----------------------------------------------------------
% (1) Get the Frames for the test and reference speech.
% Multiply by Hanning Window.
% ----------------------------------------------------------
clean_frame = clean_speech(start:start+winlength-1);
processed_frame = processed_speech(start:start+winlength-1);
clean_frame = clean_frame.*window;
processed_frame = processed_frame.*window;
% ----------------------------------------------------------
% (2) Get the autocorrelation lags and LPC parameters used
% to compute the IS measure.
% ----------------------------------------------------------
[R_clean, Ref_clean, A_clean] = ...
lpcoeff(clean_frame, P);
[R_processed, Ref_processed, A_processed] = ...
lpcoeff(processed_frame, P);
C_clean=lpc2cep(A_clean);
C_processed=lpc2cep(A_processed);
% ----------------------------------------------------------
% (3) Compute the cepstrum-distance measure
% ----------------------------------------------------------
distortion(frame_count) = min(10,C*norm(C_clean-C_processed,2));
start = start + skiprate;
end
function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order)
% ----------------------------------------------------------
% (1) Compute Autocorrelation Lags
% ----------------------------------------------------------
winlength = max(size(speech_frame));
for k=1:model_order+1
R(k) = sum(speech_frame(1:winlength-k+1) ...
.*speech_frame(k:winlength));
end
% ----------------------------------------------------------
% (2) Levinson-Durbin
% ----------------------------------------------------------
a = ones(1,model_order);
E(1)=R(1);
for i=1:model_order
a_past(1:i-1) = a(1:i-1);
sum_term = sum(a_past(1:i-1).*R(i:-1:2));
rcoeff(i)=(R(i+1) - sum_term) / E(i);
a(i)=rcoeff(i);
a(1:i-1) = a_past(1:i-1) - rcoeff(i).*a_past(i-1:-1:1);
E(i+1)=(1-rcoeff(i)*rcoeff(i))*E(i);
end
acorr = R;
refcoeff = rcoeff;
lpparams = [1 -a];
%----------------------------------------------
function [cep]=lpc2cep(a)
%
% converts prediction to cepstrum coefficients
%
% Author: Philipos C. Loizou
M=length(a);
cep=zeros(1,M-1);
cep(1)=-a(2);
for k=2:M-1
ix=1:k-1;
vec1=cep(ix).*a(k-1+1:-1:2).*ix;
cep(k)=-(a(k+1)+sum(vec1)/k);
end

View File

@@ -0,0 +1,259 @@
function fwseg_dist= comp_fwseg(cleanFile, enhancedFile);
% ----------------------------------------------------------------------
% Frequency weighted SNRseg Objective Speech Quality Measure
%
% This function implements the frequency-weighted SNRseg measure [1]
% using a different weighting function, the clean spectrum.
%
% Usage: fwSNRseg=comp_fwseg(cleanFile.wav, enhancedFile.wav)
%
% cleanFile.wav - clean input file in .wav format
% enhancedFile - enhanced output file in .wav format
% fwSNRseg - computed frequency weighted SNRseg in dB
%
% Note that large numbers of fwSNRseg are better.
%
% Example call: fwSNRseg =comp_fwseg('sp04.wav','enhanced.wav')
%
%
% References:
% [1] Tribolet, J., Noll, P., McDermott, B., and Crochiere, R. E. (1978).
% A study of complexity and quality of speech waveform coders. Proc.
% IEEE Int. Conf. Acoust. , Speech, Signal Processing, 586-590.
%
% Author: Philipos C. Loizou
% (critical-band filtering routines were written by Bryan Pellom & John Hansen)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
% ----------------------------------------------------------------------
if nargin~=2
fprintf('USAGE: fwSNRseg=comp_fwseg(cleanFile.wav, enhancedFile.wav)\n');
fprintf('For more help, type: help comp_fwseg\n\n');
return;
end
[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhancedFile);
if ( Srate1~= Srate2) | ( Nbits1~= Nbits2)
error( 'The two files do not match!\n');
end
len= min( length( data1), length( data2));
data1= data1( 1: len)+eps;
data2= data2( 1: len)+eps;
wss_dist_vec= fwseg( data1, data2,Srate1);
fwseg_dist=mean(wss_dist_vec);
% ----------------------------------------------------------------------
function distortion = fwseg(clean_speech, processed_speech,sample_rate)
% ----------------------------------------------------------------------
% Check the length of the clean and processed speech. Must be the same.
% ----------------------------------------------------------------------
clean_length = length(clean_speech);
processed_length = length(processed_speech);
if (clean_length ~= processed_length)
disp('Error: Files must have same length.');
return
end
% ----------------------------------------------------------------------
% Global Variables
% ----------------------------------------------------------------------
winlength = round(30*sample_rate/1000); % window length in samples
skiprate = floor(winlength/4); % window skip in samples
max_freq = sample_rate/2; % maximum bandwidth
num_crit = 25; % number of critical bands
USE_25=1;
n_fft = 2^nextpow2(2*winlength);
n_fftby2 = n_fft/2; % FFT size/2
gamma=0.2; % power exponent
% ----------------------------------------------------------------------
% Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz)
% ----------------------------------------------------------------------
cent_freq(1) = 50.0000; bandwidth(1) = 70.0000;
cent_freq(2) = 120.000; bandwidth(2) = 70.0000;
cent_freq(3) = 190.000; bandwidth(3) = 70.0000;
cent_freq(4) = 260.000; bandwidth(4) = 70.0000;
cent_freq(5) = 330.000; bandwidth(5) = 70.0000;
cent_freq(6) = 400.000; bandwidth(6) = 70.0000;
cent_freq(7) = 470.000; bandwidth(7) = 70.0000;
cent_freq(8) = 540.000; bandwidth(8) = 77.3724;
cent_freq(9) = 617.372; bandwidth(9) = 86.0056;
cent_freq(10) = 703.378; bandwidth(10) = 95.3398;
cent_freq(11) = 798.717; bandwidth(11) = 105.411;
cent_freq(12) = 904.128; bandwidth(12) = 116.256;
cent_freq(13) = 1020.38; bandwidth(13) = 127.914;
cent_freq(14) = 1148.30; bandwidth(14) = 140.423;
cent_freq(15) = 1288.72; bandwidth(15) = 153.823;
cent_freq(16) = 1442.54; bandwidth(16) = 168.154;
cent_freq(17) = 1610.70; bandwidth(17) = 183.457;
cent_freq(18) = 1794.16; bandwidth(18) = 199.776;
cent_freq(19) = 1993.93; bandwidth(19) = 217.153;
cent_freq(20) = 2211.08; bandwidth(20) = 235.631;
cent_freq(21) = 2446.71; bandwidth(21) = 255.255;
cent_freq(22) = 2701.97; bandwidth(22) = 276.072;
cent_freq(23) = 2978.04; bandwidth(23) = 298.126;
cent_freq(24) = 3276.17; bandwidth(24) = 321.465;
cent_freq(25) = 3597.63; bandwidth(25) = 346.136;
W=[ % articulation index weights
0.003
0.003
0.003
0.007
0.010
0.016
0.016
0.017
0.017
0.022
0.027
0.028
0.030
0.032
0.034
0.035
0.037
0.036
0.036
0.033
0.030
0.029
0.027
0.026
0.026];
W=W';
if USE_25==0 % use 13 bands
% ----- lump adjacent filters together ----------------
k=2;
cent_freq2(1)=cent_freq(1);
bandwidth2(1)=bandwidth(1)+bandwidth(2);
W2(1)=W(1);
for i=2:13
cent_freq2(i)=cent_freq2(i-1)+bandwidth2(i-1);
bandwidth2(i)=bandwidth(k)+bandwidth(k+1);
W2(i)=0.5*(W(k)+W(k+1));
k=k+2;
end
sumW=sum(W2);
bw_min = bandwidth2 (1); % minimum critical bandwidth
else
sumW=sum(W);
bw_min=bandwidth(1);
end
% ----------------------------------------------------------------------
% Set up the critical band filters. Note here that Gaussianly shaped
% filters are used. Also, the sum of the filter weights are equivalent
% for each critical band filter. Filter less than -30 dB and set to
% zero.
% ----------------------------------------------------------------------
min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter
if USE_25==0
num_crit=length(cent_freq2);
for i = 1:num_crit
f0 = (cent_freq2 (i) / max_freq) * (n_fftby2);
all_f0(i) = floor(f0);
bw = (bandwidth2 (i) / max_freq) * (n_fftby2);
norm_factor = log(bw_min) - log(bandwidth2(i));
j = 0:1:n_fftby2-1;
crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor);
crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor);
end
else
for i = 1:num_crit
f0 = (cent_freq (i) / max_freq) * (n_fftby2);
all_f0(i) = floor(f0);
bw = (bandwidth (i) / max_freq) * (n_fftby2);
norm_factor = log(bw_min) - log(bandwidth(i));
j = 0:1:n_fftby2-1;
crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor);
crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor);
end
end
num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames
start = 1; % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));
for frame_count = 1:num_frames
% ----------------------------------------------------------
% (1) Get the Frames for the test and reference speech.
% Multiply by Hanning Window.
% ----------------------------------------------------------
clean_frame = clean_speech(start:start+winlength-1);
processed_frame = processed_speech(start:start+winlength-1);
clean_frame = clean_frame.*window;
processed_frame = processed_frame.*window;
% ----------------------------------------------------------
% (2) Compute the magnitude Spectrum of Clean and Processed
% ----------------------------------------------------------
clean_spec = abs(fft(clean_frame,n_fft));
processed_spec = abs(fft(processed_frame,n_fft));
% normalize spectra to have area of one
%
clean_spec=clean_spec/sum(clean_spec(1:n_fftby2));
processed_spec=processed_spec/sum(processed_spec(1:n_fftby2));
% ----------------------------------------------------------
% (3) Compute Filterbank Output Energies
% ----------------------------------------------------------
clean_energy=zeros(1,num_crit);
processed_energy=zeros(1,num_crit);
error_energy=zeros(1,num_crit);
W_freq=zeros(1,num_crit);
for i = 1:num_crit
clean_energy(i) = sum(clean_spec(1:n_fftby2) ...
.*crit_filter(i,:)');
processed_energy(i) = sum(processed_spec(1:n_fftby2) ...
.*crit_filter(i,:)');
error_energy(i)=max((clean_energy(i)-processed_energy(i))^2,eps);
W_freq(i)=(clean_energy(i))^gamma;
end
SNRlog=10*log10((clean_energy.^2)./error_energy);
fwSNR=sum(W_freq.*SNRlog)/sum(W_freq);
distortion(frame_count)=min(max(fwSNR,-10),35);
start = start + skiprate;
end

View File

@@ -0,0 +1,493 @@
function [SIG,BAK,OVL]= comp_fwseg_mars(cleanFile, enhancedFile);
% ----------------------------------------------------------------------
% MARS Frequency-variant fwSNRseg objective speech quality measure
%
% This function implements the frequency-variant fwSNRseg measure based
% on MARS analysis (see Chap. 10, Sec. 10.5.4)
%
%
% Usage: [sig,bak,ovl]=comp_fwseg_mars(cleanFile.wav, enhancedFile.wav)
%
% cleanFile.wav - clean input file in .wav format
% enhancedFile - enhanced output file in .wav format
% sig - predicted rating [1-5] of speech distortion
% bak - predicted rating [1-5] of noise distortion
% ovl - predicted rating [1-5] of overall quality
%
%
% Example call: [s,b,o] =comp_fwseg_mars('sp04.wav','enhanced.wav')
%
%
% References:
% [1] Chapter 10, Sec 10.5.4,
% [2] Chapter 11
%
% Authors: Yi Hu and Philipos C. Loizou
% (critical-band filtering routines were written by Bryan Pellom & John Hansen)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
% ----------------------------------------------------------------------
if nargin~=2
fprintf('USAGE: [sig,bak,ovl]=comp_fwseg_mars(cleanFile.wav, enhancedFile.wav)\n');
fprintf('For more help, type: help comp_fwseg_mars\n\n');
return;
end
[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhancedFile);
if ( Srate1~= Srate2) | ( Nbits1~= Nbits2)
error( 'The two files do not match!\n');
end
len= min( length( data1), length( data2));
data1= data1( 1: len)+eps;
data2= data2( 1: len)+eps;
wss_dist_matrix= fwseg( data1, data2,Srate1);
wss_dist=mean(wss_dist_matrix);
SIG= sig_mars( wss_dist( 1), wss_dist( 2), wss_dist( 3), wss_dist( 4), ...
wss_dist( 5), wss_dist( 6), wss_dist( 7), wss_dist( 8), ...
wss_dist( 9), wss_dist( 10), wss_dist( 11), wss_dist( 12), ...
wss_dist( 13), wss_dist( 14), wss_dist( 15), wss_dist( 16), ...
wss_dist( 17), wss_dist( 18), wss_dist( 19), wss_dist( 20), ...
wss_dist( 21), wss_dist( 22), wss_dist( 23), wss_dist( 24), ...
wss_dist( 25));
SIG=max(1,SIG); SIG=min(5, SIG); % limit values to [1, 5]
BAK= bak_mars( wss_dist( 1), wss_dist( 2), wss_dist( 3), wss_dist( 4), ...
wss_dist( 5), wss_dist( 6), wss_dist( 7), wss_dist( 8), ...
wss_dist( 9), wss_dist( 10), wss_dist( 11), wss_dist( 12), ...
wss_dist( 13), wss_dist( 14), wss_dist( 15), wss_dist( 16), ...
wss_dist( 17), wss_dist( 18), wss_dist( 19), wss_dist( 20), ...
wss_dist( 21), wss_dist( 22), wss_dist( 23), wss_dist( 24), ...
wss_dist( 25));
BAK=max(1,BAK); BAK=min(5, BAK); % limit values to [1, 5]
OVL= ovl_mars( wss_dist( 1), wss_dist( 2), wss_dist( 3), wss_dist( 4), ...
wss_dist( 5), wss_dist( 6), wss_dist( 7), wss_dist( 8), ...
wss_dist( 9), wss_dist( 10), wss_dist( 11), wss_dist( 12), ...
wss_dist( 13), wss_dist( 14), wss_dist( 15), wss_dist( 16), ...
wss_dist( 17), wss_dist( 18), wss_dist( 19), wss_dist( 20), ...
wss_dist( 21), wss_dist( 22), wss_dist( 23), wss_dist( 24), ...
wss_dist( 25));
OVL=max(1,OVL); OVL=min(5, OVL); % limit values to [1, 5]
%-------------------------------------------------
function Y= bak_mars( FWSEG_VA, V5, V6, V7, V8, V9, V10, V11, V12, ...
V13, V14, V15, V16, V17, V18, V19, V20, ...
V21, V22, V23, V24, V25, V26, V27, V28)
BF1 = max(0, V21 - 0.282);
BF2 = max(0, FWSEG_VA + 9.094);
BF3 = max(0, - 9.094 - FWSEG_VA );
BF5 = max(0, 10.089 - V11 );
BF7 = max(0, 3.624 - V26 ) * BF3;
BF8 = max(0, V24 - 5.584) * BF5;
BF9 = max(0, 5.584 - V24 ) * BF5;
BF10 = max(0, V19 - 8.030) * BF1;
BF11 = max(0, 8.030 - V19 ) * BF1;
BF12 = max(0, V27 - 4.858) * BF1;
BF13 = max(0, 4.858 - V27 ) * BF1;
BF14 = max(0, FWSEG_VA + 7.282) * BF1;
BF15 = max(0, - 7.282 - FWSEG_VA ) * BF1;
BF17 = max(0, 9.458 - V16 ) * BF10;
BF18 = max(0, V27 - 10.431) * BF11;
BF19 = max(0, 10.431 - V27 ) * BF11;
BF21 = max(0, 11.059 - V22 ) * BF1;
BF22 = max(0, V26 - 8.675) * BF1;
BF23 = max(0, 8.675 - V26 ) * BF1;
BF25 = max(0, 11.195 - V6 ) * BF10;
BF26 = max(0, V8 - 7.138) * BF1;
BF27 = max(0, 7.138 - V8 ) * BF1;
BF29 = max(0, 9.006 - V10 ) * BF26;
BF30 = max(0, V14 - 8.210) * BF15;
BF35 = max(0, 7.026 - V19 ) * BF15;
BF36 = max(0, V11 - 3.424) * BF27;
BF39 = max(0, 5.418 - V17 ) * BF23;
BF40 = max(0, V28 - 6.813);
BF41 = max(0, 6.813 - V28 );
BF42 = max(0, V26 - 5.998) * BF14;
BF43 = max(0, 5.998 - V26 ) * BF14;
BF44 = max(0, V5 + 0.206) * BF41;
BF45 = max(0, - 0.206 - V5 ) * BF41;
BF46 = max(0, V22 - 7.901) * BF45;
BF49 = max(0, 7.496 - V8 ) * BF44;
BF51 = max(0, 7.904 - V11 ) * BF45;
BF52 = max(0, V26 - 10.938) * BF27;
BF54 = max(0, V9 - 4.507) * BF26;
BF56 = max(0, V28 - 0.549) * BF15;
BF57 = max(0, 0.549 - V28 ) * BF15;
BF58 = max(0, V25 - 3.252) * BF41;
BF59 = max(0, 3.252 - V25 ) * BF41;
BF60 = max(0, V23 - 7.650) * BF58;
BF61 = max(0, 7.650 - V23 ) * BF58;
BF62 = max(0, V25 - 9.931) * BF44;
BF63 = max(0, 9.931 - V25 ) * BF44;
BF64 = max(0, V25 - 4.923) * BF21;
BF65 = max(0, 4.923 - V25 ) * BF21;
BF67 = max(0, 3.746 - V28 ) * BF10;
BF68 = max(0, V11 - 5.346) * BF41;
BF69 = max(0, 5.346 - V11 ) * BF41;
BF70 = max(0, V12 - 9.026) * BF68;
BF71 = max(0, 9.026 - V12 ) * BF68;
BF73 = max(0, - 2.668 - V28 ) * BF21;
BF74 = max(0, V24 - 7.028) * BF41;
BF75 = max(0, 7.028 - V24 ) * BF41;
BF77 = max(0, - 0.224 - V6 ) * BF74;
BF78 = max(0, V5 - 3.884);
BF79 = max(0, 3.884 - V5 );
BF80 = max(0, V15 - 5.019) * BF78;
BF83 = max(0, - 1.880 - V28 ) * BF13;
BF84 = max(0, V7 - 3.067) * BF12;
BF85 = max(0, 3.067 - V7 ) * BF12;
BF87 = max(0, 5.353 - V6 );
BF88 = max(0, V13 - 3.405) * BF9;
BF89 = max(0, 3.405 - V13 ) * BF9;
BF91 = max(0, 5.599 - V13 ) * BF45;
BF92 = max(0, V15 - 9.821) * BF8;
BF94 = max(0, V14 + 2.594) * BF79;
BF97 = max(0, 8.635 - V23 ) * BF94;
BF99 = max(0, 1.332 - V24 ) * BF45;
BF100 = max(0, V7 - 0.209) * BF1;
Y = 2.751 + 0.135 * BF1 - 0.037 * BF2 + 0.328 * BF3 - 0.098 * BF5 ...
+ 0.988 * BF7 + 0.014 * BF8 - 0.034 * BF11 - 0.011 * BF12 ...
- 0.013 * BF13 - 0.002 * BF17 + 0.014 * BF18 ...
+ 0.004 * BF19 - 0.007 * BF21 - 0.017 * BF22 ...
- .895791E-03 * BF25 + 0.011 * BF26 - 0.009 * BF27 ...
- 0.007 * BF29 + 0.052 * BF30 + 0.022 * BF35 ...
- 0.002 * BF36 - 0.005 * BF39 - 0.059 * BF40 ...
- 0.050 * BF41 + 0.001 * BF42 + .743730E-03 * BF43 ...
+ 0.011 * BF44 + 0.022 * BF45 + 0.009 * BF46 ...
+ 0.004 * BF49 - 0.005 * BF51 + 0.010 * BF52 ...
- 0.001 * BF54 - 0.005 * BF56 - 0.015 * BF57 ...
- 0.032 * BF59 + 0.009 * BF60 - 0.002 * BF61 ...
- 0.009 * BF62 - 0.001 * BF63 + .819374E-03 * BF64 ...
+ 0.002 * BF65 + 0.003 * BF67 + 0.024 * BF69 ...
- 0.011 * BF70 - 0.004 * BF71 + 0.013 * BF73 ...
- 0.026 * BF74 + 0.005 * BF75 + 0.253 * BF77 ...
- 0.065 * BF78 + 0.014 * BF80 - 0.010 * BF83 ...
+ 0.001 * BF84 + 0.018 * BF85 - 0.050 * BF87 ...
- 0.002 * BF88 - 0.020 * BF89 + 0.003 * BF91 ...
- 0.043 * BF92 + .707581E-03 * BF97 - 0.015 * BF99 ...
- 0.005 * BF100;
function Y= sig_mars( FWSEG_VA, V5, V6, V7, V8, V9, V10, V11, V12, ...
V13, V14, V15, V16, V17, V18, V19, V20, ...
V21, V22, V23, V24, V25, V26, V27, V28)
BF1 = max(0, V7 - 9.535);
BF2 = max(0, 9.535 - V7 );
BF3 = max(0, V27 - 1.578);
BF5 = max(0, V6 - 5.422);
BF6 = max(0, 5.422 - V6 );
BF8 = max(0, 11.333 - V19 );
BF10 = max(0, - 6.774 - FWSEG_VA );
BF11 = max(0, V10 - 6.255) * BF8;
BF12 = max(0, 6.255 - V10 ) * BF8;
BF13 = max(0, V24 - 3.894);
BF15 = max(0, V5 - 3.884);
BF16 = max(0, 3.884 - V5 );
BF17 = max(0, V28 - 7.918);
BF18 = max(0, 7.918 - V28 );
BF19 = max(0, V13 - 6.077) * BF18;
BF20 = max(0, 6.077 - V13 ) * BF18;
BF22 = max(0, 6.614 - V20 ) * BF10;
BF23 = max(0, FWSEG_VA + 0.936) * BF8;
BF25 = max(0, V23 - 5.039);
BF26 = max(0, 5.039 - V23 );
BF28 = max(0, 9.007 - V20 ) * BF25;
BF29 = max(0, V25 - 7.582);
BF30 = max(0, 7.582 - V25 );
BF31 = max(0, V11 + 3.336) * BF16;
BF32 = max(0, V26 - 1.877);
BF35 = max(0, - 5.749 - FWSEG_VA ) * BF6;
BF36 = max(0, V7 - 4.451) * BF29;
BF37 = max(0, 4.451 - V7 ) * BF29;
BF38 = max(0, V14 - 10.158);
BF39 = max(0, 10.158 - V14 );
BF41 = max(0, 7.172 - V17 ) * BF39;
BF43 = max(0, 7.810 - V24 ) * BF26;
BF44 = max(0, V8 + 1.636) * BF3;
BF45 = max(0, FWSEG_VA - 10.068) * BF39;
BF47 = max(0, V23 - 4.721) * BF30;
BF48 = max(0, 4.721 - V23 ) * BF30;
BF50 = max(0, - 2.397 - V24 ) * BF16;
BF51 = max(0, V14 - 1.428) * BF17;
BF53 = max(0, V16 + 1.940) * BF18;
BF54 = max(0, V10 - 9.442) * BF18;
BF56 = max(0, V10 + 2.144) * BF16;
BF58 = max(0, 1.969 - V26 ) * BF2;
BF59 = max(0, V19 - 6.089) * BF16;
BF62 = max(0, 8.952 - V21 ) * BF15;
BF63 = max(0, V24 - 7.371) * BF3;
BF65 = max(0, V22 - 8.908) * BF6;
BF66 = max(0, 8.908 - V22 ) * BF6;
BF67 = max(0, V27 - 9.485) * BF30;
BF69 = max(0, V18 - 8.608) * BF10;
BF71 = max(0, V13 - 3.374) * BF25;
BF73 = max(0, V14 - 3.616) * BF13;
BF75 = max(0, V18 - 10.321) * BF32;
BF76 = max(0, 10.321 - V18 ) * BF32;
BF78 = max(0, 3.972 - V15 ) * BF26;
BF79 = max(0, V14 - 7.105) * BF26;
BF80 = max(0, 7.105 - V14 ) * BF26;
Y = 2.638 - 0.089 * BF1 + 0.083 * BF5 - 0.162 * BF6 - 0.037 * BF8 ...
- 0.241 * BF10 + 0.018 * BF11 - 0.008 * BF12 ...
+ 0.059 * BF13 - 0.144 * BF17 - 0.116 * BF18 ...
+ 0.010 * BF19 - 0.012 * BF20 + 0.085 * BF22 ...
+ 0.011 * BF23 + 0.049 * BF25 - 0.159 * BF26 ...
- 0.016 * BF28 - 0.138 * BF29 + 0.010 * BF31 ...
+ 0.016 * BF35 + 0.018 * BF36 + 0.246 * BF37 ...
- 0.417 * BF38 + 0.052 * BF39 - 0.005 * BF41 ...
+ 0.021 * BF43 + 0.006 * BF44 - 0.047 * BF45 ...
- 0.051 * BF47 - 0.014 * BF48 - 0.113 * BF50 ...
+ 0.019 * BF51 + 0.007 * BF53 + 0.017 * BF54 ...
- 0.007 * BF56 - 0.098 * BF58 + 0.011 * BF59 ...
- 0.016 * BF62 - 0.012 * BF63 + 0.113 * BF65 ...
+ 0.016 * BF66 + 0.040 * BF67 - 0.065 * BF69 ...
- 0.018 * BF71 + 0.014 * BF73 - 0.009 * BF75 ...
- 0.008 * BF76 - 0.032 * BF78 + 0.032 * BF79 ...
+ 0.011 * BF80;
function Y= ovl_mars( FWSEG_VA, V5, V6, V7, V8, V9, V10, V11, V12, ...
V13, V14, V15, V16, V17, V18, V19, V20, ...
V21, V22, V23, V24, V25, V26, V27, V28)
BF1 = max(0, V21 - 4.671);
BF3 = max(0, V6 - 5.396);
BF4 = max(0, 5.396 - V6 );
BF7 = max(0, V11 - 7.884);
BF8 = max(0, 7.884 - V11 );
BF9 = max(0, FWSEG_VA + 7.229) * BF1;
BF10 = max(0, - 7.229 - FWSEG_VA ) * BF1;
BF11 = max(0, V19 - 8.128) * BF1;
BF12 = max(0, 8.128 - V19 ) * BF1;
BF13 = max(0, V28 - 7.918);
BF14 = max(0, 7.918 - V28 );
BF15 = max(0, V5 + 2.888) * BF14;
BF16 = max(0, - 2.888 - V5 ) * BF14;
BF17 = max(0, V24 - 2.924) * BF8;
BF18 = max(0, 2.924 - V24 ) * BF8;
BF20 = max(0, 9.071 - V16 ) * BF15;
BF21 = max(0, V10 - 6.286) * BF14;
BF22 = max(0, 6.286 - V10 ) * BF14;
BF24 = max(0, V23 - 5.173);
BF25 = max(0, 5.173 - V23 );
BF26 = max(0, V26 - 8.987);
BF29 = max(0, 12.216 - V27 ) * BF3;
BF30 = max(0, V8 - 4.306) * BF16;
BF34 = max(0, V23 - 7.630) * BF21;
BF35 = max(0, 7.630 - V23 ) * BF21;
BF37 = max(0, 3.638 - V7 ) * BF1;
BF39 = max(0, 8.337 - V21 ) * BF17;
BF41 = max(0, 1.590 - V5 ) * BF11;
BF43 = max(0, 13.993 - V8 ) * BF11;
BF44 = max(0, V14 - 5.993) * BF25;
BF45 = max(0, 5.993 - V14 ) * BF25;
BF46 = max(0, V24 - 1.035);
BF47 = max(0, 1.035 - V24 );
BF49 = max(0, 8.915 - V23 ) * BF12;
BF51 = max(0, - 0.004 - FWSEG_VA );
BF52 = max(0, V27 - 6.520) * BF24;
BF53 = max(0, 6.520 - V27 ) * BF24;
BF54 = max(0, V7 - 11.484) * BF8;
BF55 = max(0, 11.484 - V7 ) * BF8;
BF57 = max(0, 5.742 - V17 ) * BF25;
BF58 = max(0, V12 - 6.949) * BF12;
BF59 = max(0, 6.949 - V12 ) * BF12;
BF60 = max(0, V25 - 9.203) * BF45;
BF63 = max(0, 1.887 - V13 ) * BF7;
BF65 = max(0, 9.498 - V26 ) * BF15;
BF66 = max(0, V5 - 6.566) * BF22;
BF71 = max(0, 13.239 - V19 ) * BF46;
BF72 = max(0, V19 - 9.925) * BF55;
BF77 = max(0, 3.430 - V22 ) * BF18;
BF78 = max(0, V27 - 6.513) * BF45;
BF79 = max(0, 6.513 - V27 ) * BF45;
BF81 = max(0, 12.511 - V18 );
BF82 = max(0, V11 - 6.777) * BF81;
BF83 = max(0, 6.777 - V11 ) * BF81;
BF85 = max(0, 3.433 - V5 ) * BF47;
BF87 = max(0, - 3.524 - FWSEG_VA ) * BF47;
BF88 = max(0, V27 - 11.604) * BF9;
BF91 = max(0, 8.845 - V26 ) * BF52;
BF92 = max(0, V14 - 5.931) * BF82;
BF93 = max(0, 5.931 - V14 ) * BF82;
BF94 = max(0, V21 - 7.245) * BF25;
BF95 = max(0, 7.245 - V21 ) * BF25;
BF96 = max(0, V14 - 5.323) * BF7;
BF98 = max(0, V10 - 6.248) * BF71;
BF100 = max(0, V18 - 0.602) * BF95;
Y = 2.936 + 0.047 * BF1 + 0.061 * BF3 - 0.084 * BF4 - 0.139 * BF8 ...
- 0.064 * BF10 - 0.030 * BF12 - 0.103 * BF13 ...
- 0.039 * BF14 + 0.020 * BF17 - 0.002 * BF20 ...
- 0.005 * BF22 - 0.114 * BF25 - 0.090 * BF26 ...
- 0.011 * BF29 + 0.010 * BF30 + 0.009 * BF34 ...
+ 0.002 * BF35 + 0.079 * BF37 - 0.006 * BF39 ...
+ 0.007 * BF41 - 0.003 * BF43 + 0.017 * BF44 ...
+ 0.076 * BF47 + 0.009 * BF49 + 0.016 * BF51 ...
- 0.042 * BF53 - 0.079 * BF54 - 0.030 * BF57 ...
- 0.018 * BF58 - 0.009 * BF59 - 0.119 * BF60 ...
- 0.210 * BF63 - .456802E-03 * BF65 + 0.028 * BF66 ...
+ 0.020 * BF72 + 0.011 * BF77 + 0.005 * BF78 ...
+ 0.003 * BF79 - 0.049 * BF81 + 0.012 * BF83 ...
- 0.030 * BF85 + 0.070 * BF87 + 0.008 * BF88 ...
- 0.008 * BF91 + 0.010 * BF92 + 0.003 * BF93 ...
+ 0.022 * BF94 - 0.038 * BF96 + .933766E-03 * BF98 ...
+ 0.002 * BF100;
function distortion = fwseg(clean_speech, processed_speech,sample_rate)
% ----------------------------------------------------------------------
% Check the length of the clean and processed speech. Must be the same.
% ----------------------------------------------------------------------
clean_length = length(clean_speech);
processed_length = length(processed_speech);
if (clean_length ~= processed_length)
disp('Error: Files must have same length.');
return
end
% ----------------------------------------------------------------------
% Global Variables
% ----------------------------------------------------------------------
winlength = round(30*sample_rate/1000); % window length in samples
skiprate = floor(winlength/4); % window skip in samples
max_freq = sample_rate/2; % maximum bandwidth
num_crit = 25; % number of critical bands
n_fft = 2^nextpow2(2*winlength);
n_fftby2 = n_fft/2; % FFT size/2
% ----------------------------------------------------------------------
% Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz)
% ----------------------------------------------------------------------
cent_freq(1) = 50.0000; bandwidth(1) = 70.0000;
cent_freq(2) = 120.000; bandwidth(2) = 70.0000;
cent_freq(3) = 190.000; bandwidth(3) = 70.0000;
cent_freq(4) = 260.000; bandwidth(4) = 70.0000;
cent_freq(5) = 330.000; bandwidth(5) = 70.0000;
cent_freq(6) = 400.000; bandwidth(6) = 70.0000;
cent_freq(7) = 470.000; bandwidth(7) = 70.0000;
cent_freq(8) = 540.000; bandwidth(8) = 77.3724;
cent_freq(9) = 617.372; bandwidth(9) = 86.0056;
cent_freq(10) = 703.378; bandwidth(10) = 95.3398;
cent_freq(11) = 798.717; bandwidth(11) = 105.411;
cent_freq(12) = 904.128; bandwidth(12) = 116.256;
cent_freq(13) = 1020.38; bandwidth(13) = 127.914;
cent_freq(14) = 1148.30; bandwidth(14) = 140.423;
cent_freq(15) = 1288.72; bandwidth(15) = 153.823;
cent_freq(16) = 1442.54; bandwidth(16) = 168.154;
cent_freq(17) = 1610.70; bandwidth(17) = 183.457;
cent_freq(18) = 1794.16; bandwidth(18) = 199.776;
cent_freq(19) = 1993.93; bandwidth(19) = 217.153;
cent_freq(20) = 2211.08; bandwidth(20) = 235.631;
cent_freq(21) = 2446.71; bandwidth(21) = 255.255;
cent_freq(22) = 2701.97; bandwidth(22) = 276.072;
cent_freq(23) = 2978.04; bandwidth(23) = 298.126;
cent_freq(24) = 3276.17; bandwidth(24) = 321.465;
cent_freq(25) = 3597.63; bandwidth(25) = 346.136;
bw_min = bandwidth (1); % minimum critical bandwidth
% ----------------------------------------------------------------------
% Set up the critical band filters. Note here that Gaussianly shaped
% filters are used. Also, the sum of the filter weights are equivalent
% for each critical band filter. Filter less than -30 dB and set to
% zero.
% ----------------------------------------------------------------------
min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter
for i = 1:num_crit
f0 = (cent_freq (i) / max_freq) * (n_fftby2);
all_f0(i) = floor(f0);
bw = (bandwidth (i) / max_freq) * (n_fftby2);
norm_factor = log(bw_min) - log(bandwidth(i));
j = 0:1:n_fftby2-1;
crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor);
crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor);
end
% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Weighted Spectral
% Slope Measure
% ----------------------------------------------------------------------
num_frames = floor(clean_length/skiprate-(winlength/skiprate)); % number of frames
start = 1; % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));
distortion=zeros(num_frames,num_crit);
for frame_count = 1:num_frames
% ----------------------------------------------------------
% (1) Get the Frames for the test and reference speech.
% Multiply by Hanning Window.
% ----------------------------------------------------------
clean_frame = clean_speech(start:start+winlength-1);
processed_frame = processed_speech(start:start+winlength-1);
clean_frame = clean_frame.*window;
processed_frame = processed_frame.*window;
% ----------------------------------------------------------
% (2) Compute the magnitude Spectrum of Clean and Processed
% ----------------------------------------------------------
clean_spec = abs(fft(clean_frame,n_fft));
processed_spec = abs(fft(processed_frame,n_fft));
% normalize so that spectra have unit area ----
clean_spec=clean_spec/sum(clean_spec(1:n_fftby2));
processed_spec=processed_spec/sum(processed_spec(1:n_fftby2));
% ----------------------------------------------------------
% (3) Compute Filterbank Output Energies
% ----------------------------------------------------------
clean_energy=zeros(1,num_crit);
processed_energy=zeros(1,num_crit);
error_energy=zeros(1,num_crit);
for i = 1:num_crit
clean_energy(i) = sum(clean_spec(1:n_fftby2) ...
.*crit_filter(i,:)');
processed_energy(i) = sum(processed_spec(1:n_fftby2) ...
.*crit_filter(i,:)');
error_energy(i)=max((clean_energy(i)-processed_energy(i))^2,eps);
end
SNRlog=10*log10((clean_energy.^2)./error_energy);
distortion(frame_count,:)=min(max(SNRlog,-10),35);
start = start + skiprate;
end

View File

@@ -0,0 +1,221 @@
function [SIG,BAK,OVL]= comp_fwseg_variant(cleanFile, enhancedFile);
% ----------------------------------------------------------------------
% Frequency-variant fwSNRseg Objective Speech Quality Measure
%
% This function implements the frequency-variant fwSNRseg measure [1]
% (see also Chap. 10, Eq. 10.24)
%
%
% Usage: [sig,bak,ovl]=comp_fwseg_variant(cleanFile.wav, enhancedFile.wav)
%
% cleanFile.wav - clean input file in .wav format
% enhancedFile - enhanced output file in .wav format
% sig - predicted rating [1-5] of speech distortion
% bak - predicted rating [1-5] of noise distortion
% ovl - predicted rating [1-5] of overall quality
%
%
% Example call: [s,b,o] =comp_fwseg_variant('sp04.wav','enhanced.wav')
%
%
% References:
% [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements,
% Objective Measures of Speech Quality. Prentice Hall
% Advanced Reference Series, Englewood Cliffs, NJ, 1988,
% ISBN: 0-13-629056-6.
%
% Author: Philipos C. Loizou
% (critical-band filtering routines were written by Bryan Pellom & John Hansen)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
% ----------------------------------------------------------------------
if nargin~=2
fprintf('USAGE: [sig,bak,ovl]=comp_fwseg_variant(cleanFile.wav, enhancedFile.wav)\n');
fprintf('For more help, type: help comp_fwseg_variant\n\n');
return;
end
[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhancedFile);
if ( Srate1~= Srate2) | ( Nbits1~= Nbits2)
error( 'The two files do not match!\n');
end
len= min( length( data1), length( data2));
data1= data1( 1: len)+eps;
data2= data2( 1: len)+eps;
wss_dist_matrix= fwseg( data1, data2,Srate1);
wss_dist=mean(wss_dist_matrix);
% initialize coefficients obtained from multiple linear
% regression analysis
%
b_sig=[0.021,-0.028,0.088,-0.031,0.048,-0.049,0.065,0.009,0.011,0.033,...
-0.040,-0.002,0.041,-0.007,0.033,0.018,-0.007,0.044,-0.001,0.021,...
-0.002,0.017,-0.03,0.073,0.043];
b_ovl=[-0.003,-0.026,0.066,-0.036,0.038,-0.023,0.037,0.022,0.014,0.009,...
-0.03,0.004,0.044,-0.005,0.017,0.018,-0.001,0.051,0.009,0.011,...
0.011,-0.002,-0.021,0.043,0.031];
b_bak=[-0.03,-0.022,0.03,-0.048,0.034,0.002,0.006,0.037,0.017,-0.016,-0.008,...
0.019,0.024,-0.002,0.01,0.03,-0.018,0.046,0.022,0.005,0.03,-0.028,...
-0.028,0.019,0.005];
SIG=0.567+sum(b_sig.*wss_dist);
SIG=max(1,SIG); SIG=min(5, SIG); % limit values to [1, 5]
BAK=1.013+sum(b_bak.*wss_dist);
BAK=max(1,BAK); BAK=min(5, BAK); % limit values to [1, 5]
OVL=0.446+sum(b_ovl.*wss_dist);
OVL=max(1,OVL); OVL=min(5, OVL); % limit values to [1, 5]
% ----------------------------------------------------------------------
function distortion = fwseg(clean_speech, processed_speech,sample_rate)
% ----------------------------------------------------------------------
% Check the length of the clean and processed speech. Must be the same.
% ----------------------------------------------------------------------
clean_length = length(clean_speech);
processed_length = length(processed_speech);
if (clean_length ~= processed_length)
disp('Error: Files must have same length.');
return
end
% ----------------------------------------------------------------------
% Global Variables
% ----------------------------------------------------------------------
winlength = round(30*sample_rate/1000); % window length in samples
skiprate = floor(winlength/4); % window skip in samples
max_freq = sample_rate/2; % maximum bandwidth
num_crit = 25; % number of critical bands
n_fft = 2^nextpow2(2*winlength);
n_fftby2 = n_fft/2; % FFT size/2
% ----------------------------------------------------------------------
% Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz)
% ----------------------------------------------------------------------
cent_freq(1) = 50.0000; bandwidth(1) = 70.0000;
cent_freq(2) = 120.000; bandwidth(2) = 70.0000;
cent_freq(3) = 190.000; bandwidth(3) = 70.0000;
cent_freq(4) = 260.000; bandwidth(4) = 70.0000;
cent_freq(5) = 330.000; bandwidth(5) = 70.0000;
cent_freq(6) = 400.000; bandwidth(6) = 70.0000;
cent_freq(7) = 470.000; bandwidth(7) = 70.0000;
cent_freq(8) = 540.000; bandwidth(8) = 77.3724;
cent_freq(9) = 617.372; bandwidth(9) = 86.0056;
cent_freq(10) = 703.378; bandwidth(10) = 95.3398;
cent_freq(11) = 798.717; bandwidth(11) = 105.411;
cent_freq(12) = 904.128; bandwidth(12) = 116.256;
cent_freq(13) = 1020.38; bandwidth(13) = 127.914;
cent_freq(14) = 1148.30; bandwidth(14) = 140.423;
cent_freq(15) = 1288.72; bandwidth(15) = 153.823;
cent_freq(16) = 1442.54; bandwidth(16) = 168.154;
cent_freq(17) = 1610.70; bandwidth(17) = 183.457;
cent_freq(18) = 1794.16; bandwidth(18) = 199.776;
cent_freq(19) = 1993.93; bandwidth(19) = 217.153;
cent_freq(20) = 2211.08; bandwidth(20) = 235.631;
cent_freq(21) = 2446.71; bandwidth(21) = 255.255;
cent_freq(22) = 2701.97; bandwidth(22) = 276.072;
cent_freq(23) = 2978.04; bandwidth(23) = 298.126;
cent_freq(24) = 3276.17; bandwidth(24) = 321.465;
cent_freq(25) = 3597.63; bandwidth(25) = 346.136;
bw_min = bandwidth (1); % minimum critical bandwidth
% ----------------------------------------------------------------------
% Set up the critical band filters. Note here that Gaussianly shaped
% filters are used. Also, the sum of the filter weights are equivalent
% for each critical band filter. Filter less than -30 dB and set to
% zero.
% ----------------------------------------------------------------------
min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter
for i = 1:num_crit
f0 = (cent_freq (i) / max_freq) * (n_fftby2);
all_f0(i) = floor(f0);
bw = (bandwidth (i) / max_freq) * (n_fftby2);
norm_factor = log(bw_min) - log(bandwidth(i));
j = 0:1:n_fftby2-1;
crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor);
crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor);
end
% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Weighted Spectral
% Slope Measure
% ----------------------------------------------------------------------
num_frames = floor(clean_length/skiprate-(winlength/skiprate)); % number of frames
start = 1; % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));
distortion=zeros(num_frames,num_crit);
for frame_count = 1:num_frames
% ----------------------------------------------------------
% (1) Get the Frames for the test and reference speech.
% Multiply by Hanning Window.
% ----------------------------------------------------------
clean_frame = clean_speech(start:start+winlength-1);
processed_frame = processed_speech(start:start+winlength-1);
clean_frame = clean_frame.*window;
processed_frame = processed_frame.*window;
% ----------------------------------------------------------
% (2) Compute the magnitude Spectrum of Clean and Processed
% ----------------------------------------------------------
clean_spec = abs(fft(clean_frame,n_fft));
processed_spec = abs(fft(processed_frame,n_fft));
% normalize so that spectra have unit area ----
clean_spec=clean_spec/sum(clean_spec(1:n_fftby2));
processed_spec=processed_spec/sum(processed_spec(1:n_fftby2));
% ----------------------------------------------------------
% (3) Compute Filterbank Output Energies (in dB scale)
% ----------------------------------------------------------
clean_energy=zeros(1,num_crit);
processed_energy=zeros(1,num_crit);
error_energy=zeros(1,num_crit);
for i = 1:num_crit
clean_energy(i) = sum(clean_spec(1:n_fftby2) ...
.*crit_filter(i,:)');
processed_energy(i) = sum(processed_spec(1:n_fftby2) ...
.*crit_filter(i,:)');
error_energy(i)=max((clean_energy(i)-processed_energy(i))^2,eps);
end
SNRlog=10*log10((clean_energy.^2)./error_energy);
distortion(frame_count,:)=min(max(SNRlog,-10),35);
start = start + skiprate;
end

View File

@@ -0,0 +1,188 @@
function is_mean= comp_is(cleanFile, enhdFile);
% ----------------------------------------------------------------------
% Itakura-Saito (IS) Objective Speech Quality Measure
%
% This function implements the Itakura-Saito distance measure
% defined on page 50 of [1] (see Equation 2.26). See also
% Equation 12 (page 1480) of [2].
%
% Usage: IS=comp_is(cleanFile.wav, enhancedFile.wav)
%
% cleanFile.wav - clean input file in .wav format
% enhancedFile - enhanced output file in .wav format
% IS - computed Itakura Saito measure
%
% Note that the IS measure is limited in the range [0, 100].
%
% Example call: IS =comp_is('sp04.wav','enhanced.wav')
%
%
% References:
%
% [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements,
% Objective Measures of Speech Quality. Prentice Hall
% Advanced Reference Series, Englewood Cliffs, NJ, 1988,
% ISBN: 0-13-629056-6.
%
% [2] B.-H. Juang, "On Using the Itakura-Saito Measures for
% Speech Coder Performance Evaluation", AT&T Bell
% Laboratories Technical Journal, Vol. 63, No. 8,
% October 1984, pp. 1477-1498.
%
% Authors: Bryan L. Pellom and John H. L. Hansen (July 1998)
% Modified by: Philipos C. Loizou (Oct 2006) - limited IS to be in [0,100]
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
% ----------------------------------------------------------------------
if nargin~=2
fprintf('USAGE: IS=comp_is(cleanFile.wav, enhancedFile.wav)\n');
fprintf('For more help, type: help comp_is\n\n');
return;
end
alpha=0.95;
[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhdFile);
if ( Srate1~= Srate2) | ( Nbits1~= Nbits2)
error( 'The two files do not match!\n');
end
len= min( length( data1), length( data2));
data1= data1( 1: len)+eps;
data2= data2( 1: len)+eps;
IS_dist= is( data1, data2,Srate1);
IS_len= round( length( IS_dist)* alpha);
IS= sort( IS_dist);
is_mean= mean( IS( 1: IS_len));
function distortion = is(clean_speech, processed_speech,sample_rate)
% ----------------------------------------------------------------------
% Check the length of the clean and processed speech. Must be the same.
% ----------------------------------------------------------------------
clean_length = length(clean_speech);
processed_length = length(processed_speech);
if (clean_length ~= processed_length)
disp('Error: Both Speech Files must be same length.');
return
end
% ----------------------------------------------------------------------
% Scale both clean speech and processed speech to have same dynamic
% range. Also remove DC component from each signal
% ----------------------------------------------------------------------
%clean_speech = clean_speech - mean(clean_speech);
%processed_speech = processed_speech - mean(processed_speech);
%processed_speech = processed_speech.*(max(abs(clean_speech))/ max(abs(processed_speech)));
% ----------------------------------------------------------------------
% Global Variables
% ----------------------------------------------------------------------
%sample_rate = 8000; % default sample rate
winlength = round(30*sample_rate/1000); %240; % window length in samples
skiprate = floor(winlength/4); % window skip in samples
if sample_rate<10000
P = 10; % LPC Analysis Order
else
P=16; % this could vary depending on sampling frequency.
end
% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Itakura-Saito Measure
% ----------------------------------------------------------------------
num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames
start = 1; % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));
for frame_count = 1:num_frames
% ----------------------------------------------------------
% (1) Get the Frames for the test and reference speech.
% Multiply by Hanning Window.
% ----------------------------------------------------------
clean_frame = clean_speech(start:start+winlength-1);
processed_frame = processed_speech(start:start+winlength-1);
clean_frame = clean_frame.*window;
processed_frame = processed_frame.*window;
% ----------------------------------------------------------
% (2) Get the autocorrelation lags and LPC parameters used
% to compute the IS measure.
% ----------------------------------------------------------
[R_clean, Ref_clean, A_clean] = ...
lpcoeff(clean_frame, P);
[R_processed, Ref_processed, A_processed] = ...
lpcoeff(processed_frame, P);
% ----------------------------------------------------------
% (3) Compute the IS measure
% ----------------------------------------------------------
numerator = A_processed*toeplitz(R_clean)*A_processed';
denominator = max(A_clean*toeplitz(R_clean)*A_clean',eps);
gain_clean = max(R_clean*A_clean',eps); % this is gain
gain_processed = max(R_processed*A_processed',eps); % squared (sigma^2)
ISvalue=(gain_clean/gain_processed)*(numerator/denominator) + ...
log(gain_processed/gain_clean)-1;
distortion(frame_count) = min(ISvalue,100);
start = start + skiprate;
end
function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order)
% ----------------------------------------------------------
% (1) Compute Autocorrelation Lags
% ----------------------------------------------------------
winlength = max(size(speech_frame));
for k=1:model_order+1
R(k) = sum(speech_frame(1:winlength-k+1) ...
.*speech_frame(k:winlength));
end
% ----------------------------------------------------------
% (2) Levinson-Durbin
% ----------------------------------------------------------
a = ones(1,model_order);
E(1)=R(1);
for i=1:model_order
a_past(1:i-1) = a(1:i-1);
sum_term = sum(a_past(1:i-1).*R(i:-1:2));
rcoeff(i)=(R(i+1) - sum_term) / E(i);
a(i)=rcoeff(i);
a(1:i-1) = a_past(1:i-1) - rcoeff(i).*a_past(i-1:-1:1);
E(i+1)=(1-rcoeff(i)*rcoeff(i))*E(i);
end
acorr = R;
refcoeff = rcoeff;
lpparams = [1 -a];

View File

@@ -0,0 +1,162 @@
function llr_mean= comp_llr(cleanFile, enhancedFile);
% ----------------------------------------------------------------------
%
% Log Likelihood Ratio (LLR) Objective Speech Quality Measure
%
%
% This function implements the Log Likelihood Ratio Measure
% defined on page 48 of [1] (see Equation 2.18).
%
% Usage: llr=comp_llr(cleanFile.wav, enhancedFile.wav)
%
% cleanFile.wav - clean input file in .wav format
% enhancedFile - enhanced output file in .wav format
% llr - computed likelihood ratio
%
% Note that the LLR measure is limited in the range [0, 2].
%
% Example call: llr =comp_llr('sp04.wav','enhanced.wav')
%
%
% References:
%
% [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements,
% Objective Measures of Speech Quality. Prentice Hall
% Advanced Reference Series, Englewood Cliffs, NJ, 1988,
% ISBN: 0-13-629056-6.
%
% Authors: Bryan L. Pellom and John H. L. Hansen (July 1998)
% Modified by: Philipos C. Loizou (Oct 2006) - limited LLR to be in [0,2]
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
% ----------------------------------------------------------------------
if nargin~=2
fprintf('USAGE: LLR=comp_llr(cleanFile.wav, enhancedFile.wav)\n');
fprintf('For more help, type: help comp_llr\n\n');
return;
end
alpha=0.95;
[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhancedFile);
if ( Srate1~= Srate2) | ( Nbits1~= Nbits2)
error( 'The two files do not match!\n');
end
len= min( length( data1), length( data2));
data1= data1( 1: len)+eps;
data2= data2( 1: len)+eps;
IS_dist= llr( data1, data2,Srate1);
IS_len= round( length( IS_dist)* alpha);
IS= sort( IS_dist);
llr_mean= mean( IS( 1: IS_len));
function distortion = llr(clean_speech, processed_speech,sample_rate)
% ----------------------------------------------------------------------
% Check the length of the clean and processed speech. Must be the same.
% ----------------------------------------------------------------------
clean_length = length(clean_speech);
processed_length = length(processed_speech);
if (clean_length ~= processed_length)
disp('Error: Both Speech Files must be same length.');
return
end
% ----------------------------------------------------------------------
% Global Variables
% ----------------------------------------------------------------------
winlength = round(30*sample_rate/1000); %240; % window length in samples
skiprate = floor(winlength/4); % window skip in samples
if sample_rate<10000
P = 10; % LPC Analysis Order
else
P=16; % this could vary depending on sampling frequency.
end
% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Log Likelihood Ratio
% ----------------------------------------------------------------------
num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames
start = 1; % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));
for frame_count = 1:num_frames
% ----------------------------------------------------------
% (1) Get the Frames for the test and reference speech.
% Multiply by Hanning Window.
% ----------------------------------------------------------
clean_frame = clean_speech(start:start+winlength-1);
processed_frame = processed_speech(start:start+winlength-1);
clean_frame = clean_frame.*window;
processed_frame = processed_frame.*window;
% ----------------------------------------------------------
% (2) Get the autocorrelation lags and LPC parameters used
% to compute the LLR measure.
% ----------------------------------------------------------
[R_clean, Ref_clean, A_clean] = ...
lpcoeff(clean_frame, P);
[R_processed, Ref_processed, A_processed] = ...
lpcoeff(processed_frame, P);
% ----------------------------------------------------------
% (3) Compute the LLR measure
% ----------------------------------------------------------
numerator = A_processed*toeplitz(R_clean)*A_processed';
denominator = A_clean*toeplitz(R_clean)*A_clean';
distortion(frame_count) = min(2,log(numerator/denominator));
start = start + skiprate;
end
function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order)
% ----------------------------------------------------------
% (1) Compute Autocorrelation Lags
% ----------------------------------------------------------
winlength = max(size(speech_frame));
for k=1:model_order+1
R(k) = sum(speech_frame(1:winlength-k+1) ...
.*speech_frame(k:winlength));
end
% ----------------------------------------------------------
% (2) Levinson-Durbin
% ----------------------------------------------------------
a = ones(1,model_order);
E(1)=R(1);
for i=1:model_order
a_past(1:i-1) = a(1:i-1);
sum_term = sum(a_past(1:i-1).*R(i:-1:2));
rcoeff(i)=(R(i+1) - sum_term) / E(i);
a(i)=rcoeff(i);
a(1:i-1) = a_past(1:i-1) - rcoeff(i).*a_past(i-1:-1:1);
E(i+1)=(1-rcoeff(i)*rcoeff(i))*E(i);
end
acorr = R;
refcoeff = rcoeff;
lpparams = [1 -a];

View File

@@ -0,0 +1,132 @@
function [snr_mean, segsnr_mean]= comp_SNR(cleanFile, enhdFile);
%
% Segmental Signal-to-Noise Ratio Objective Speech Quality Measure
%
% This function implements the segmental signal-to-noise ratio
% as defined in [1, p. 45] (see Equation 2.12).
%
% Usage: [SNRovl, SNRseg]=comp_snr(cleanFile.wav, enhancedFile.wav)
%
% cleanFile.wav - clean input file in .wav format
% enhancedFile - enhanced output file in .wav format
% SNRovl - overall SNR (dB)
% SNRseg - segmental SNR (dB)
%
% This function returns 2 parameters. The first item is the
% overall SNR for the two speech signals. The second value
% is the segmental signal-to-noise ratio (1 seg-snr per
% frame of input). The segmental SNR is clamped to range
% between 35dB and -10dB (see suggestions in [2]).
%
% Example call: [SNRovl,SNRseg]=comp_SNR('sp04.wav','enhanced.wav')
%
% References:
%
% [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements,
% Objective Measures of Speech Quality. Prentice Hall
% Advanced Reference Series, Englewood Cliffs, NJ, 1988,
% ISBN: 0-13-629056-6.
%
% [2] P. E. Papamichalis, Practical Approaches to Speech
% Coding, Prentice-Hall, Englewood Cliffs, NJ, 1987.
% ISBN: 0-13-689019-9. (see pages 179-181).
%
% Authors: Bryan L. Pellom and John H. L. Hansen (July 1998)
% Modified by: Philipos C. Loizou (Oct 2006)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
%-------------------------------------------------------------------------
if nargin ~=2
fprintf('USAGE: [snr_mean, segsnr_mean]= comp_SNR(cleanFile, enhdFile) \n');
return;
end
[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhdFile);
if (( Srate1~= Srate2) | ( Nbits1~= Nbits2))
error( 'The two files do not match!\n');
end
len= min( length( data1), length( data2));
data1= data1( 1: len);
data2= data2( 1: len);
[snr_dist, segsnr_dist]= snr( data1, data2,Srate1);
snr_mean= snr_dist;
segsnr_mean= mean( segsnr_dist);
% =========================================================================
function [overall_snr, segmental_snr] = snr(clean_speech, processed_speech,sample_rate)
% ----------------------------------------------------------------------
% Check the length of the clean and processed speech. Must be the same.
% ----------------------------------------------------------------------
clean_length = length(clean_speech);
processed_length = length(processed_speech);
if (clean_length ~= processed_length)
disp('Error: Both Speech Files must be same length.');
return
end
% ----------------------------------------------------------------------
% Scale both clean speech and processed speech to have same dynamic
% range. Also remove DC component from each signal
% ----------------------------------------------------------------------
%clean_speech = clean_speech - mean(clean_speech);
%processed_speech = processed_speech - mean(processed_speech);
%processed_speech = processed_speech.*(max(abs(clean_speech))/ max(abs(processed_speech)));
overall_snr = 10* log10( sum(clean_speech.^2)/sum((clean_speech-processed_speech).^2));
% ----------------------------------------------------------------------
% Global Variables
% ----------------------------------------------------------------------
winlength = round(30*sample_rate/1000); %240; % window length in samples for 30-msecs
skiprate = floor(winlength/4); %60; % window skip in samples
MIN_SNR = -10; % minimum SNR in dB
MAX_SNR = 35; % maximum SNR in dB
% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Segmental SNR
% ----------------------------------------------------------------------
num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames
start = 1; % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));
for frame_count = 1: num_frames
% ----------------------------------------------------------
% (1) Get the Frames for the test and reference speech.
% Multiply by Hanning Window.
% ----------------------------------------------------------
clean_frame = clean_speech(start:start+winlength-1);
processed_frame = processed_speech(start:start+winlength-1);
clean_frame = clean_frame.*window;
processed_frame = processed_frame.*window;
% ----------------------------------------------------------
% (2) Compute the Segmental SNR
% ----------------------------------------------------------
signal_energy = sum(clean_frame.^2);
noise_energy = sum((clean_frame-processed_frame).^2);
segmental_snr(frame_count) = 10*log10(signal_energy/(noise_energy+eps)+eps);
segmental_snr(frame_count) = max(segmental_snr(frame_count),MIN_SNR);
segmental_snr(frame_count) = min(segmental_snr(frame_count),MAX_SNR);
start = start + skiprate;
end

View File

@@ -0,0 +1,299 @@
function wss_dist= comp_wss(cleanFile, enhancedFile);
% ----------------------------------------------------------------------
%
% Weighted Spectral Slope (WSS) Objective Speech Quality Measure
%
% This function implements the Weighted Spectral Slope (WSS)
% distance measure originally proposed in [1]. The algorithm
% works by first decomposing the speech signal into a set of
% frequency bands (this is done for both the test and reference
% frame). The intensities within each critical band are
% measured. Then, a weighted distances between the measured
% slopes of the log-critical band spectra are computed.
% This measure is also described in Section 2.2.9 (pages 56-58)
% of [2].
%
% Whereas Klatt's original measure used 36 critical-band
% filters to estimate the smoothed short-time spectrum, this
% implementation considers a bank of 25 filters spanning
% the 4 kHz bandwidth.
%
% Usage: wss_dist=comp_wss(cleanFile.wav, enhancedFile.wav)
%
% cleanFile.wav - clean input file in .wav format
% enhancedFile - enhanced output file in .wav format
% wss_dist - computed spectral slope distance
%
% Example call: ws =comp_wss('sp04.wav','enhanced.wav')
%
% References:
%
% [1] D. H. Klatt, "Prediction of Perceived Phonetic Distance
% from Critical-Band Spectra: A First Step", Proc. IEEE
% ICASSP'82, Volume 2, pp. 1278-1281, May, 1982.
%
% [2] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements,
% Objective Measures of Speech Quality. Prentice Hall
% Advanced Reference Series, Englewood Cliffs, NJ, 1988,
% ISBN: 0-13-629056-6.
%
% Authors: Bryan L. Pellom and John H. L. Hansen (July 1998)
% Modified by: Philipos C. Loizou (Oct 2006)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
%
% ----------------------------------------------------------------------
if nargin~=2
fprintf('USAGE: WSS=comp_wss(cleanFile.wav, enhancedFile.wav)\n');
fprintf('For more help, type: help comp_wss\n\n');
return;
end
alpha= 0.95;
[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhancedFile);
if ( Srate1~= Srate2) | ( Nbits1~= Nbits2)
error( 'The two files do not match!\n');
end
len= min( length( data1), length( data2));
data1= data1( 1: len)+eps;
data2= data2( 1: len)+eps;
wss_dist_vec= wss( data1, data2,Srate1);
wss_dist_vec= sort( wss_dist_vec);
wss_dist= mean( wss_dist_vec( 1: round( length( wss_dist_vec)*alpha)));
function distortion = wss(clean_speech, processed_speech,sample_rate)
% ----------------------------------------------------------------------
% Check the length of the clean and processed speech. Must be the same.
% ----------------------------------------------------------------------
clean_length = length(clean_speech);
processed_length = length(processed_speech);
if (clean_length ~= processed_length)
disp('Error: Files musthave same length.');
return
end
% ----------------------------------------------------------------------
% Global Variables
% ----------------------------------------------------------------------
winlength = round(30*sample_rate/1000); % window length in samples
skiprate = floor(winlength/4); % window skip in samples
max_freq = sample_rate/2; % maximum bandwidth
num_crit = 25; % number of critical bands
USE_FFT_SPECTRUM = 1; % defaults to 10th order LP spectrum
n_fft = 2^nextpow2(2*winlength);
n_fftby2 = n_fft/2; % FFT size/2
Kmax = 20; % value suggested by Klatt, pg 1280
Klocmax = 1; % value suggested by Klatt, pg 1280
% ----------------------------------------------------------------------
% Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz)
% ----------------------------------------------------------------------
cent_freq(1) = 50.0000; bandwidth(1) = 70.0000;
cent_freq(2) = 120.000; bandwidth(2) = 70.0000;
cent_freq(3) = 190.000; bandwidth(3) = 70.0000;
cent_freq(4) = 260.000; bandwidth(4) = 70.0000;
cent_freq(5) = 330.000; bandwidth(5) = 70.0000;
cent_freq(6) = 400.000; bandwidth(6) = 70.0000;
cent_freq(7) = 470.000; bandwidth(7) = 70.0000;
cent_freq(8) = 540.000; bandwidth(8) = 77.3724;
cent_freq(9) = 617.372; bandwidth(9) = 86.0056;
cent_freq(10) = 703.378; bandwidth(10) = 95.3398;
cent_freq(11) = 798.717; bandwidth(11) = 105.411;
cent_freq(12) = 904.128; bandwidth(12) = 116.256;
cent_freq(13) = 1020.38; bandwidth(13) = 127.914;
cent_freq(14) = 1148.30; bandwidth(14) = 140.423;
cent_freq(15) = 1288.72; bandwidth(15) = 153.823;
cent_freq(16) = 1442.54; bandwidth(16) = 168.154;
cent_freq(17) = 1610.70; bandwidth(17) = 183.457;
cent_freq(18) = 1794.16; bandwidth(18) = 199.776;
cent_freq(19) = 1993.93; bandwidth(19) = 217.153;
cent_freq(20) = 2211.08; bandwidth(20) = 235.631;
cent_freq(21) = 2446.71; bandwidth(21) = 255.255;
cent_freq(22) = 2701.97; bandwidth(22) = 276.072;
cent_freq(23) = 2978.04; bandwidth(23) = 298.126;
cent_freq(24) = 3276.17; bandwidth(24) = 321.465;
cent_freq(25) = 3597.63; bandwidth(25) = 346.136;
bw_min = bandwidth (1); % minimum critical bandwidth
% ----------------------------------------------------------------------
% Set up the critical band filters. Note here that Gaussianly shaped
% filters are used. Also, the sum of the filter weights are equivalent
% for each critical band filter. Filter less than -30 dB and set to
% zero.
% ----------------------------------------------------------------------
min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter
for i = 1:num_crit
f0 = (cent_freq (i) / max_freq) * (n_fftby2);
all_f0(i) = floor(f0);
bw = (bandwidth (i) / max_freq) * (n_fftby2);
norm_factor = log(bw_min) - log(bandwidth(i));
j = 0:1:n_fftby2-1;
crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor);
crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor);
end
% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Weighted Spectral
% Slope Measure
% ----------------------------------------------------------------------
num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames
start = 1; % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));
for frame_count = 1:num_frames
% ----------------------------------------------------------
% (1) Get the Frames for the test and reference speech.
% Multiply by Hanning Window.
% ----------------------------------------------------------
clean_frame = clean_speech(start:start+winlength-1);
processed_frame = processed_speech(start:start+winlength-1);
clean_frame = clean_frame.*window;
processed_frame = processed_frame.*window;
% ----------------------------------------------------------
% (2) Compute the Power Spectrum of Clean and Processed
% ----------------------------------------------------------
if (USE_FFT_SPECTRUM)
clean_spec = (abs(fft(clean_frame,n_fft)).^2);
processed_spec = (abs(fft(processed_frame,n_fft)).^2);
else
a_vec = zeros(1,n_fft);
a_vec(1:11) = lpc(clean_frame,10);
clean_spec = 1.0/(abs(fft(a_vec,n_fft)).^2)';
a_vec = zeros(1,n_fft);
a_vec(1:11) = lpc(processed_frame,10);
processed_spec = 1.0/(abs(fft(a_vec,n_fft)).^2)';
end
% ----------------------------------------------------------
% (3) Compute Filterbank Output Energies (in dB scale)
% ----------------------------------------------------------
for i = 1:num_crit
clean_energy(i) = sum(clean_spec(1:n_fftby2) ...
.*crit_filter(i,:)');
processed_energy(i) = sum(processed_spec(1:n_fftby2) ...
.*crit_filter(i,:)');
end
clean_energy = 10*log10(max(clean_energy,1E-10));
processed_energy = 10*log10(max(processed_energy,1E-10));
% ----------------------------------------------------------
% (4) Compute Spectral Slope (dB[i+1]-dB[i])
% ----------------------------------------------------------
clean_slope = clean_energy(2:num_crit) - ...
clean_energy(1:num_crit-1);
processed_slope = processed_energy(2:num_crit) - ...
processed_energy(1:num_crit-1);
% ----------------------------------------------------------
% (5) Find the nearest peak locations in the spectra to
% each critical band. If the slope is negative, we
% search to the left. If positive, we search to the
% right.
% ----------------------------------------------------------
for i = 1:num_crit-1
% find the peaks in the clean speech signal
if (clean_slope(i)>0) % search to the right
n = i;
while ((n<num_crit) & (clean_slope(n) > 0))
n = n+1;
end
clean_loc_peak(i) = clean_energy(n-1);
else % search to the left
n = i;
while ((n>0) & (clean_slope(n) <= 0))
n = n-1;
end
clean_loc_peak(i) = clean_energy(n+1);
end
% find the peaks in the processed speech signal
if (processed_slope(i)>0) % search to the right
n = i;
while ((n<num_crit) & (processed_slope(n) > 0))
n = n+1;
end
processed_loc_peak(i) = processed_energy(n-1);
else % search to the left
n = i;
while ((n>0) & (processed_slope(n) <= 0))
n = n-1;
end
processed_loc_peak(i) = processed_energy(n+1);
end
end
% ----------------------------------------------------------
% (6) Compute the WSS Measure for this frame. This
% includes determination of the weighting function.
% ----------------------------------------------------------
dBMax_clean = max(clean_energy);
dBMax_processed = max(processed_energy);
% The weights are calculated by averaging individual
% weighting factors from the clean and processed frame.
% These weights W_clean and W_processed should range
% from 0 to 1 and place more emphasis on spectral
% peaks and less emphasis on slope differences in spectral
% valleys. This procedure is described on page 1280 of
% Klatt's 1982 ICASSP paper.
Wmax_clean = Kmax ./ (Kmax + dBMax_clean - ...
clean_energy(1:num_crit-1));
Wlocmax_clean = Klocmax ./ ( Klocmax + clean_loc_peak - ...
clean_energy(1:num_crit-1));
W_clean = Wmax_clean .* Wlocmax_clean;
Wmax_processed = Kmax ./ (Kmax + dBMax_processed - ...
processed_energy(1:num_crit-1));
Wlocmax_processed = Klocmax ./ ( Klocmax + processed_loc_peak - ...
processed_energy(1:num_crit-1));
W_processed = Wmax_processed .* Wlocmax_processed;
W = (W_clean + W_processed)./2.0;
distortion(frame_count) = sum(W.*(clean_slope(1:num_crit-1) - ...
processed_slope(1:num_crit-1)).^2);
% this normalization is not part of Klatt's paper, but helps
% to normalize the measure. Here we scale the measure by the
% sum of the weights.
distortion(frame_count) = distortion(frame_count)/sum(W);
start = start + skiprate;
end

View File

@@ -0,0 +1,496 @@
function [Csig,Cbak,Covl]= composite(cleanFile, enhancedFile);
% ----------------------------------------------------------------------
% Composite Objective Speech Quality Measure
%
% This function implements the composite objective measure proposed in
% [1].
%
% Usage: [sig,bak,ovl]=composite(cleanFile.wav, enhancedFile.wav)
%
% cleanFile.wav - clean input file in .wav format
% enhancedFile - enhanced output file in .wav format
% sig - predicted rating [1-5] of speech distortion
% bak - predicted rating [1-5] of noise distortion
% ovl - predicted rating [1-5] of overall quality
%
% In addition to the above ratings (sig, bak, & ovl) it returns
% the individual values of the LLR, SNRseg, WSS and PESQ measures.
%
% Example call: [sig,bak,ovl] =composite('sp04.wav','enhanced.wav')
%
%
% References:
%
% [1] Hu, Y. and Loizou, P. (2006). Evaluation of objective measures
% for speech enhancement. Proc. Interspeech, Pittsburg, PA.
%
% Authors: Yi Hu and Philipos C. Loizou
% (the LLR, SNRseg and WSS measures were based on Bryan Pellom and John
% Hansen's implementations)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
% ----------------------------------------------------------------------
if nargin~=2
fprintf('USAGE: [sig,bak,ovl]=composite(cleanFile.wav, enhancedFile.wav)\n');
fprintf('For more help, type: help composite\n\n');
return;
end
alpha= 0.95;
[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhancedFile);
if ( Srate1~= Srate2) | ( Nbits1~= Nbits2)
error( 'The two files do not match!\n');
end
len= min( length( data1), length( data2));
data1= data1( 1: len)+eps;
data2= data2( 1: len)+eps;
% -- compute the WSS measure ---
%
wss_dist_vec= wss( data1, data2,Srate1);
wss_dist_vec= sort( wss_dist_vec);
wss_dist= mean( wss_dist_vec( 1: round( length( wss_dist_vec)*alpha)));
% --- compute the LLR measure ---------
%
LLR_dist= llr( data1, data2,Srate1);
LLRs= sort(LLR_dist);
LLR_len= round( length(LLR_dist)* alpha);
llr_mean= mean( LLRs( 1: LLR_len));
% --- compute the SNRseg ----------------
%
[snr_dist, segsnr_dist]= snr( data1, data2,Srate1);
snr_mean= snr_dist;
segSNR= mean( segsnr_dist);
% -- compute the pesq ----
[pesq_mos]= pesq(cleanFile, enhancedFile);
% --- now compute the composite measures ------------------
%
Csig = 3.093 - 1.029*llr_mean + 0.603*pesq_mos-0.009*wss_dist;
Csig = max(1,Csig); Csig=min(5, Csig); % limit values to [1, 5]
Cbak = 1.634 + 0.478 *pesq_mos - 0.007*wss_dist + 0.063*segSNR;
Cbak = max(1, Cbak); Cbak=min(5,Cbak); % limit values to [1, 5]
Covl = 1.594 + 0.805*pesq_mos - 0.512*llr_mean - 0.007*wss_dist;
Covl = max(1, Covl); Covl=min(5, Covl); % limit values to [1, 5]
fprintf('\n LLR=%f SNRseg=%f WSS=%f PESQ=%f\n',llr_mean,segSNR,wss_dist,pesq_mos);
return; %=================================================================
function distortion = wss(clean_speech, processed_speech,sample_rate)
% ----------------------------------------------------------------------
% Check the length of the clean and processed speech. Must be the same.
% ----------------------------------------------------------------------
clean_length = length(clean_speech);
processed_length = length(processed_speech);
if (clean_length ~= processed_length)
disp('Error: Files musthave same length.');
return
end
% ----------------------------------------------------------------------
% Global Variables
% ----------------------------------------------------------------------
winlength = round(30*sample_rate/1000); %240; % window length in samples
skiprate = floor(winlength/4); % window skip in samples
max_freq = sample_rate/2; % maximum bandwidth
num_crit = 25; % number of critical bands
USE_FFT_SPECTRUM = 1; % defaults to 10th order LP spectrum
n_fft = 2^nextpow2(2*winlength);
n_fftby2 = n_fft/2; % FFT size/2
Kmax = 20; % value suggested by Klatt, pg 1280
Klocmax = 1; % value suggested by Klatt, pg 1280
% ----------------------------------------------------------------------
% Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz)
% ----------------------------------------------------------------------
cent_freq(1) = 50.0000; bandwidth(1) = 70.0000;
cent_freq(2) = 120.000; bandwidth(2) = 70.0000;
cent_freq(3) = 190.000; bandwidth(3) = 70.0000;
cent_freq(4) = 260.000; bandwidth(4) = 70.0000;
cent_freq(5) = 330.000; bandwidth(5) = 70.0000;
cent_freq(6) = 400.000; bandwidth(6) = 70.0000;
cent_freq(7) = 470.000; bandwidth(7) = 70.0000;
cent_freq(8) = 540.000; bandwidth(8) = 77.3724;
cent_freq(9) = 617.372; bandwidth(9) = 86.0056;
cent_freq(10) = 703.378; bandwidth(10) = 95.3398;
cent_freq(11) = 798.717; bandwidth(11) = 105.411;
cent_freq(12) = 904.128; bandwidth(12) = 116.256;
cent_freq(13) = 1020.38; bandwidth(13) = 127.914;
cent_freq(14) = 1148.30; bandwidth(14) = 140.423;
cent_freq(15) = 1288.72; bandwidth(15) = 153.823;
cent_freq(16) = 1442.54; bandwidth(16) = 168.154;
cent_freq(17) = 1610.70; bandwidth(17) = 183.457;
cent_freq(18) = 1794.16; bandwidth(18) = 199.776;
cent_freq(19) = 1993.93; bandwidth(19) = 217.153;
cent_freq(20) = 2211.08; bandwidth(20) = 235.631;
cent_freq(21) = 2446.71; bandwidth(21) = 255.255;
cent_freq(22) = 2701.97; bandwidth(22) = 276.072;
cent_freq(23) = 2978.04; bandwidth(23) = 298.126;
cent_freq(24) = 3276.17; bandwidth(24) = 321.465;
cent_freq(25) = 3597.63; bandwidth(25) = 346.136;
bw_min = bandwidth (1); % minimum critical bandwidth
% ----------------------------------------------------------------------
% Set up the critical band filters. Note here that Gaussianly shaped
% filters are used. Also, the sum of the filter weights are equivalent
% for each critical band filter. Filter less than -30 dB and set to
% zero.
% ----------------------------------------------------------------------
min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter
for i = 1:num_crit
f0 = (cent_freq (i) / max_freq) * (n_fftby2);
all_f0(i) = floor(f0);
bw = (bandwidth (i) / max_freq) * (n_fftby2);
norm_factor = log(bw_min) - log(bandwidth(i));
j = 0:1:n_fftby2-1;
crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor);
crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor);
end
% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Weighted Spectral
% Slope Measure
% ----------------------------------------------------------------------
num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames
start = 1; % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));
for frame_count = 1:num_frames
% ----------------------------------------------------------
% (1) Get the Frames for the test and reference speech.
% Multiply by Hanning Window.
% ----------------------------------------------------------
clean_frame = clean_speech(start:start+winlength-1);
processed_frame = processed_speech(start:start+winlength-1);
clean_frame = clean_frame.*window;
processed_frame = processed_frame.*window;
% ----------------------------------------------------------
% (2) Compute the Power Spectrum of Clean and Processed
% ----------------------------------------------------------
if (USE_FFT_SPECTRUM)
clean_spec = (abs(fft(clean_frame,n_fft)).^2);
processed_spec = (abs(fft(processed_frame,n_fft)).^2);
else
a_vec = zeros(1,n_fft);
a_vec(1:11) = lpc(clean_frame,10);
clean_spec = 1.0/(abs(fft(a_vec,n_fft)).^2)';
a_vec = zeros(1,n_fft);
a_vec(1:11) = lpc(processed_frame,10);
processed_spec = 1.0/(abs(fft(a_vec,n_fft)).^2)';
end
% ----------------------------------------------------------
% (3) Compute Filterbank Output Energies (in dB scale)
% ----------------------------------------------------------
for i = 1:num_crit
clean_energy(i) = sum(clean_spec(1:n_fftby2) ...
.*crit_filter(i,:)');
processed_energy(i) = sum(processed_spec(1:n_fftby2) ...
.*crit_filter(i,:)');
end
clean_energy = 10*log10(max(clean_energy,1E-10));
processed_energy = 10*log10(max(processed_energy,1E-10));
% ----------------------------------------------------------
% (4) Compute Spectral Slope (dB[i+1]-dB[i])
% ----------------------------------------------------------
clean_slope = clean_energy(2:num_crit) - ...
clean_energy(1:num_crit-1);
processed_slope = processed_energy(2:num_crit) - ...
processed_energy(1:num_crit-1);
% ----------------------------------------------------------
% (5) Find the nearest peak locations in the spectra to
% each critical band. If the slope is negative, we
% search to the left. If positive, we search to the
% right.
% ----------------------------------------------------------
for i = 1:num_crit-1
% find the peaks in the clean speech signal
if (clean_slope(i)>0) % search to the right
n = i;
while ((n<num_crit) & (clean_slope(n) > 0))
n = n+1;
end
clean_loc_peak(i) = clean_energy(n-1);
else % search to the left
n = i;
while ((n>0) & (clean_slope(n) <= 0))
n = n-1;
end
clean_loc_peak(i) = clean_energy(n+1);
end
% find the peaks in the processed speech signal
if (processed_slope(i)>0) % search to the right
n = i;
while ((n<num_crit) & (processed_slope(n) > 0))
n = n+1;
end
processed_loc_peak(i) = processed_energy(n-1);
else % search to the left
n = i;
while ((n>0) & (processed_slope(n) <= 0))
n = n-1;
end
processed_loc_peak(i) = processed_energy(n+1);
end
end
% ----------------------------------------------------------
% (6) Compute the WSS Measure for this frame. This
% includes determination of the weighting function.
% ----------------------------------------------------------
dBMax_clean = max(clean_energy);
dBMax_processed = max(processed_energy);
% The weights are calculated by averaging individual
% weighting factors from the clean and processed frame.
% These weights W_clean and W_processed should range
% from 0 to 1 and place more emphasis on spectral
% peaks and less emphasis on slope differences in spectral
% valleys. This procedure is described on page 1280 of
% Klatt's 1982 ICASSP paper.
Wmax_clean = Kmax ./ (Kmax + dBMax_clean - ...
clean_energy(1:num_crit-1));
Wlocmax_clean = Klocmax ./ ( Klocmax + clean_loc_peak - ...
clean_energy(1:num_crit-1));
W_clean = Wmax_clean .* Wlocmax_clean;
Wmax_processed = Kmax ./ (Kmax + dBMax_processed - ...
processed_energy(1:num_crit-1));
Wlocmax_processed = Klocmax ./ ( Klocmax + processed_loc_peak - ...
processed_energy(1:num_crit-1));
W_processed = Wmax_processed .* Wlocmax_processed;
W = (W_clean + W_processed)./2.0;
distortion(frame_count) = sum(W.*(clean_slope(1:num_crit-1) - ...
processed_slope(1:num_crit-1)).^2);
% this normalization is not part of Klatt's paper, but helps
% to normalize the measure. Here we scale the measure by the
% sum of the weights.
distortion(frame_count) = distortion(frame_count)/sum(W);
start = start + skiprate;
end
%-----------------------------------------------
function distortion = llr(clean_speech, processed_speech,sample_rate)
% ----------------------------------------------------------------------
% Check the length of the clean and processed speech. Must be the same.
% ----------------------------------------------------------------------
clean_length = length(clean_speech);
processed_length = length(processed_speech);
if (clean_length ~= processed_length)
disp('Error: Both Speech Files must be same length.');
return
end
% ----------------------------------------------------------------------
% Global Variables
% ----------------------------------------------------------------------
winlength = round(30*sample_rate/1000); % window length in samples
skiprate = floor(winlength/4); % window skip in samples
if sample_rate<10000
P = 10; % LPC Analysis Order
else
P=16; % this could vary depending on sampling frequency.
end
% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Log Likelihood Ratio
% ----------------------------------------------------------------------
num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames
start = 1; % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));
for frame_count = 1:num_frames
% ----------------------------------------------------------
% (1) Get the Frames for the test and reference speech.
% Multiply by Hanning Window.
% ----------------------------------------------------------
clean_frame = clean_speech(start:start+winlength-1);
processed_frame = processed_speech(start:start+winlength-1);
clean_frame = clean_frame.*window;
processed_frame = processed_frame.*window;
% ----------------------------------------------------------
% (2) Get the autocorrelation lags and LPC parameters used
% to compute the LLR measure.
% ----------------------------------------------------------
[R_clean, Ref_clean, A_clean] = ...
lpcoeff(clean_frame, P);
[R_processed, Ref_processed, A_processed] = ...
lpcoeff(processed_frame, P);
% ----------------------------------------------------------
% (3) Compute the LLR measure
% ----------------------------------------------------------
numerator = A_processed*toeplitz(R_clean)*A_processed';
denominator = A_clean*toeplitz(R_clean)*A_clean';
distortion(frame_count) = log(numerator/denominator);
start = start + skiprate;
end
%---------------------------------------------
function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order)
% ----------------------------------------------------------
% (1) Compute Autocorrelation Lags
% ----------------------------------------------------------
winlength = max(size(speech_frame));
for k=1:model_order+1
R(k) = sum(speech_frame(1:winlength-k+1) ...
.*speech_frame(k:winlength));
end
% ----------------------------------------------------------
% (2) Levinson-Durbin
% ----------------------------------------------------------
a = ones(1,model_order);
E(1)=R(1);
for i=1:model_order
a_past(1:i-1) = a(1:i-1);
sum_term = sum(a_past(1:i-1).*R(i:-1:2));
rcoeff(i)=(R(i+1) - sum_term) / E(i);
a(i)=rcoeff(i);
a(1:i-1) = a_past(1:i-1) - rcoeff(i).*a_past(i-1:-1:1);
E(i+1)=(1-rcoeff(i)*rcoeff(i))*E(i);
end
acorr = R;
refcoeff = rcoeff;
lpparams = [1 -a];
% ----------------------------------------------------------------------
function [overall_snr, segmental_snr] = snr(clean_speech, processed_speech,sample_rate)
% ----------------------------------------------------------------------
% Check the length of the clean and processed speech. Must be the same.
% ----------------------------------------------------------------------
clean_length = length(clean_speech);
processed_length = length(processed_speech);
if (clean_length ~= processed_length)
disp('Error: Both Speech Files must be same length.');
return
end
% ----------------------------------------------------------------------
% Scale both clean speech and processed speech to have same dynamic
% range. Also remove DC component from each signal
% ----------------------------------------------------------------------
%clean_speech = clean_speech - mean(clean_speech);
%processed_speech = processed_speech - mean(processed_speech);
%processed_speech = processed_speech.*(max(abs(clean_speech))/ max(abs(processed_speech)));
overall_snr = 10* log10( sum(clean_speech.^2)/sum((clean_speech-processed_speech).^2));
% ----------------------------------------------------------------------
% Global Variables
% ----------------------------------------------------------------------
winlength = round(30*sample_rate/1000); %240; % window length in samples
skiprate = floor(winlength/4); % window skip in samples
MIN_SNR = -10; % minimum SNR in dB
MAX_SNR = 35; % maximum SNR in dB
% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Segmental SNR
% ----------------------------------------------------------------------
num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames
start = 1; % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));
for frame_count = 1: num_frames
% ----------------------------------------------------------
% (1) Get the Frames for the test and reference speech.
% Multiply by Hanning Window.
% ----------------------------------------------------------
clean_frame = clean_speech(start:start+winlength-1);
processed_frame = processed_speech(start:start+winlength-1);
clean_frame = clean_frame.*window;
processed_frame = processed_frame.*window;
% ----------------------------------------------------------
% (2) Compute the Segmental SNR
% ----------------------------------------------------------
signal_energy = sum(clean_frame.^2);
noise_energy = sum((clean_frame-processed_frame).^2);
segmental_snr(frame_count) = 10*log10(signal_energy/(noise_energy+eps)+eps);
segmental_snr(frame_count) = max(segmental_snr(frame_count),MIN_SNR);
segmental_snr(frame_count) = min(segmental_snr(frame_count),MAX_SNR);
start = start + skiprate;
end

View File

@@ -0,0 +1,84 @@
function crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, ...
deg_Nsamples, Utt_id)
global Downsample
global Nutterances Largest_uttsize Nsurf_samples Crude_DelayEst
global Crude_DelayConf UttSearch_Start UttSearch_End Utt_DelayEst
global Utt_Delay Utt_DelayConf Utt_Start Utt_End
global MAXNUTTERANCES WHOLE_SIGNAL
global pesq_mos subj_mos cond_nr
if (Utt_id== WHOLE_SIGNAL )
nr = floor( ref_Nsamples/ Downsample);
nd = floor( deg_Nsamples/ Downsample);
startr= 1;
startd= 1;
elseif Utt_id== MAXNUTTERANCES
startr= UttSearch_Start(MAXNUTTERANCES);
startd= startr+ Utt_DelayEst(MAXNUTTERANCES)/ Downsample;
if ( startd< 0 )
startr= 1- Utt_DelayEst(MAXNUTTERANCES)/ Downsample;
startd= 1;
end
nr= UttSearch_End(MAXNUTTERANCES)- startr;
nd= nr;
if( startd+ nd> floor( deg_Nsamples/ Downsample) )
nd= floor( deg_Nsamples/ Downsample)- startd;
end
% fprintf( 'nr,nd is %d,%d\n', nr, nd);
else
startr= UttSearch_Start(Utt_id);
startd= startr+ Crude_DelayEst/ Downsample;
if ( startd< 0 )
startr= 1- Crude_DelayEst/ Downsample;
startd= 1;
end
nr= UttSearch_End(Utt_id)- startr;
nd = nr;
if( startd+ nd> floor( deg_Nsamples/ Downsample)+ 1)
nd = floor( deg_Nsamples/ Downsample)- startd+ 1;
end
end
max_Y= 0.0;
I_max_Y= nr;
if( (nr> 1) && (nd> 1) )
Y= FFTNXCorr( ref_logVAD, startr, nr, deg_logVAD, startd, nd);
[max_Y, I_max_Y]= max( Y);
if (max_Y<= 0)
max_Y= 0;
I_max_Y= nr;
end
end
% fprintf( 'max_Y, I_max_Y is %f, %d\n', max_Y, I_max_Y);
if( Utt_id== WHOLE_SIGNAL )
Crude_DelayEst= (I_max_Y- nr)* Downsample;
Crude_DelayConf= 0.0;
% fprintf( 1, 'I_max_Y, nr, Crude_DelayEst is %f, %f, %f\n', ...
% I_max_Y, nr, Crude_DelayEst);
elseif( Utt_id == MAXNUTTERANCES )
Utt_Delay(MAXNUTTERANCES)= (I_max_Y- nr)* Downsample+ ...
Utt_DelayEst(MAXNUTTERANCES);
% fprintf( 'startr, startd, nr, nd, I_max, Utt_Delay[%d] is %d, %d, %d, %d, %d, %d\n', ...
% MAXNUTTERANCES, startr, startd, nr, nd, ...
% I_max_Y, Utt_Delay(MAXNUTTERANCES) );
else
% fprintf( 'I_max_Y, nr is %d, %d\n', I_max_Y, nr);
Utt_DelayEst(Utt_id)= (I_max_Y- nr)* Downsample+ ...
Crude_DelayEst;
end

View File

@@ -0,0 +1,21 @@
function mod_data= fix_power_level( data, data_Nsamples, maxNsamples)
% this function is used for level normalization, i.e., to fix the power
% level of data to a preset number, and return it to mod_data.
global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs
global TARGET_AVG_POWER
TARGET_AVG_POWER= 1e7;
align_filter_dB= [0,-500; 50, -500; 100, -500; 125, -500; 160, -500; 200, -500;
250, -500; 300, -500; 350, 0; 400, 0; 500, 0; 600, 0; 630, 0;
800, 0; 1000, 0; 1250, 0; 1600, 0; 2000, 0; 2500, 0; 3000, 0;
3250, 0; 3500, -500; 4000, -500; 5000, -500; 6300, -500; 8000, -500];
align_filtered= apply_filter( data, data_Nsamples, align_filter_dB);
power_above_300Hz = pow_of (align_filtered, SEARCHBUFFER* Downsample+ 1, ...
data_Nsamples- SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000), ...
maxNsamples- 2* SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000));
global_scale= sqrt( TARGET_AVG_POWER/ power_above_300Hz);
% fprintf( 1, '\tglobal_scale is %f\n', global_scale);
mod_data= data* global_scale;

View File

@@ -0,0 +1,68 @@
function id_searchwindows( ref_VAD, ref_Nsamples, deg_VAD, deg_Nsamples);
global MINUTTLENGTH Downsample MINUTTLENGTH SEARCHBUFFER
global Crude_DelayEst Nutterances UttSearch_Start UttSearch_End
Utt_num = 1;
speech_flag = 0;
VAD_length= floor( ref_Nsamples/ Downsample);
del_deg_start= MINUTTLENGTH- Crude_DelayEst/ Downsample;
del_deg_end= floor((deg_Nsamples- Crude_DelayEst)/ Downsample)-...
MINUTTLENGTH;
for count= 1: VAD_length
VAD_value= ref_VAD(count);
if( (VAD_value> 0) && (speech_flag== 0) )
speech_flag= 1;
this_start= count;
UttSearch_Start(Utt_num)= count- SEARCHBUFFER;
if( UttSearch_Start(Utt_num)< 0 )
UttSearch_Start(Utt_num)= 0;
end
end
if( ((VAD_value== 0) || (count == (VAD_length-1))) && ...
(speech_flag == 1) )
speech_flag = 0;
UttSearch_End(Utt_num) = count + SEARCHBUFFER;
if( UttSearch_End(Utt_num) > VAD_length - 1 )
UttSearch_End(Utt_num) = VAD_length -1;
end
if( ((count - this_start) >= MINUTTLENGTH) &&...
(this_start < del_deg_end) &&...
(count > del_deg_start) )
Utt_num= Utt_num + 1;
end
end
end
Utt_num= Utt_num- 1;
Nutterances = Utt_num;
% fprintf( 1, 'Nutterances is %d\n', Nutterances);
% fid= fopen( 'mat_utt.txt', 'wt');
% fprintf( fid, '%d\n', UttSearch_Start( 1: Nutterances));
% fprintf( fid, '\n');
% fprintf( fid, '%d\n', UttSearch_End( 1: Nutterances));
% fclose(fid);

View File

@@ -0,0 +1,85 @@
function id_utterances( ref_Nsamples, ref_VAD, deg_Nsamples)
global Largest_uttsize MINUTTLENGTH MINUTTLENGTH Crude_DelayEst
global Downsample SEARCHBUFFER Nutterances Utt_Start
global Utt_End Utt_Delay
Utt_num = 1;
speech_flag = 0;
VAD_length = floor( ref_Nsamples / Downsample);
% fprintf( 1, 'VAD_length is %d\n', VAD_length);
del_deg_start = MINUTTLENGTH - Crude_DelayEst / Downsample;
del_deg_end = floor((deg_Nsamples- Crude_DelayEst)/ Downsample) ...
- MINUTTLENGTH;
for count = 1: VAD_length
VAD_value = ref_VAD(count);
if( (VAD_value > 0.0) && (speech_flag == 0) )
speech_flag = 1;
this_start = count;
Utt_Start (Utt_num) = count;
end
if( ((VAD_value == 0) || (count == VAD_length)) && ...
(speech_flag == 1) )
speech_flag = 0;
Utt_End (Utt_num) = count;
if( ((count - this_start) >= MINUTTLENGTH) && ...
(this_start < del_deg_end) && ...
(count > del_deg_start) )
Utt_num = Utt_num + 1;
end
end
end
Utt_Start(1) = SEARCHBUFFER+ 1;
Utt_End(Nutterances) = VAD_length - SEARCHBUFFER+ 1;
for Utt_num = 2: Nutterances
this_start = Utt_Start(Utt_num)- 1;
last_end = Utt_End(Utt_num - 1)- 1;
count = floor( (this_start + last_end) / 2);
Utt_Start(Utt_num) = count+ 1;
Utt_End(Utt_num - 1) = count+ 1;
end
this_start = (Utt_Start(1)- 1) * Downsample + Utt_Delay(1);
if( this_start < (SEARCHBUFFER * Downsample) )
count = SEARCHBUFFER + floor( ...
(Downsample - 1 - Utt_Delay(1)) / Downsample);
Utt_Start(1) = count+ 1;
end
last_end = (Utt_End(Nutterances)- 1) * Downsample + 1 + ...
Utt_Delay(Nutterances);
% fprintf( 'Utt_End(%d) is %d\n', Nutterances, Utt_End(Nutterances));
% fprintf( 'last_end is %d\n', last_end);
% fprintf( 'Utt_Delay(%d) is %d\n', Nutterances, Utt_Delay(Nutterances));
if( last_end > (deg_Nsamples - SEARCHBUFFER * Downsample+ 1) )
count = floor( (deg_Nsamples - Utt_Delay(Nutterances)) / Downsample) ...
- SEARCHBUFFER;
Utt_End(Nutterances) = count+ 1;
end
for Utt_num = 2: Nutterances
this_start = (Utt_Start(Utt_num)- 1) * Downsample + Utt_Delay(Utt_num);
last_end = (Utt_End(Utt_num - 1)- 1) * Downsample + Utt_Delay(Utt_num - 1);
if( this_start < last_end )
count = floor( (this_start + last_end) / 2);
this_start = floor( (Downsample- 1+ count- Utt_Delay(Utt_num))...
/ Downsample);
last_end = floor( (count - Utt_Delay(Utt_num - 1))...
/ Downsample);
Utt_Start(Utt_num) = this_start+ 1;
Utt_End(Utt_num- 1) = last_end+ 1;
end
end
Largest_uttsize= max( Utt_End- Utt_Start);

View File

@@ -0,0 +1,9 @@
function [mod_ref_data, mod_deg_data]= input_filter( ref_data, ref_Nsamples, ...
deg_data, deg_Nsamples)
mod_ref_data= DC_block( ref_data, ref_Nsamples);
mod_deg_data= DC_block( deg_data, deg_Nsamples);
mod_ref_data= apply_filters( mod_ref_data, ref_Nsamples);
mod_deg_data= apply_filters( mod_deg_data, deg_Nsamples);

View File

@@ -0,0 +1,127 @@
function [pesq_mos]= pesq(ref_wav, deg_wav)
% ----------------------------------------------------------------------
% PESQ objective speech quality measure
%
% This function implements the PESQ measure based on the ITU standard
% P.862 [1].
%
%
% Usage: pval=pesq(cleanFile.wav, enhancedFile.wav)
%
% cleanFile.wav - clean input file in .wav format
% enhancedFile - enhanced output file in .wav format
% pval - PESQ value
%
% Note that the PESQ routine only supports sampling rates of 8 kHz and
% 16 kHz [1]
%
% Example call: pval = pesq ('sp04.wav','enhanced.wav')
%
%
% References:
% [1] ITU (2000). Perceptual evaluation of speech quality (PESQ), and
% objective method for end-to-end speech quality assessment of
% narrowband telephone networks and speech codecs. ITU-T
% Recommendation P. 862
%
% Authors: Yi Hu and Philipos C. Loizou
%
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
% ----------------------------------------------------------------------
if nargin<2
fprintf('Usage: [pesq_mos]=pesq(cleanfile.wav,enhanced.wav) \n');
return;
end;
global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs WHOLE_SIGNAL
global Align_Nfft Window
[ref_data,sampling_rate]= audioread( ref_wav);
if sampling_rate~=8000 & sampling_rate~=16000
error('Sampling frequency needs to be either 8000 or 16000 Hz');
end
setup_global( sampling_rate);
% Window= hann( Align_Nfft, 'periodic'); %Hanning window
% Window= Window';
TWOPI= 6.28318530717959;
%for count = 0: Align_Nfft- 1
% Window(1+ count) = 0.5 * (1.0 - cos((TWOPI * count) / Align_Nfft));
%end
count=0:Align_Nfft- 1;
Window= 0.5 * (1.0 - cos((TWOPI * count) / Align_Nfft));
ref_data= ref_data';
ref_data= ref_data* 32768;
ref_Nsamples= length( ref_data)+ 2* SEARCHBUFFER* Downsample;
ref_data= [zeros( 1, SEARCHBUFFER* Downsample), ref_data, ...
zeros( 1, DATAPADDING_MSECS* (Fs/ 1000)+ SEARCHBUFFER* Downsample)];
deg_data= audioread( deg_wav);
deg_data= deg_data';
deg_data= deg_data* 32768;
deg_Nsamples= length( deg_data)+ 2* SEARCHBUFFER* Downsample;
deg_data= [zeros( 1, SEARCHBUFFER* Downsample), deg_data, ...
zeros( 1, DATAPADDING_MSECS* (Fs/ 1000)+ SEARCHBUFFER* Downsample)];
maxNsamples= max( ref_Nsamples, deg_Nsamples);
ref_data= fix_power_level( ref_data, ref_Nsamples, maxNsamples);
deg_data= fix_power_level( deg_data, deg_Nsamples, maxNsamples);
standard_IRS_filter_dB= [0, -200; 50, -40; 100, -20; 125, -12; 160, -6; 200, 0;...
250, 4; 300, 6; 350, 8; 400, 10; 500, 11; 600, 12; 700, 12; 800, 12;...
1000, 12; 1300, 12; 1600, 12; 2000, 12; 2500, 12; 3000, 12; 3250, 12;...
3500, 4; 4000, -200; 5000, -200; 6300, -200; 8000, -200];
ref_data= apply_filter( ref_data, ref_Nsamples, standard_IRS_filter_dB);
deg_data= apply_filter( deg_data, deg_Nsamples, standard_IRS_filter_dB);
%
% for later use in psychoacoustical model
model_ref= ref_data;
model_deg= deg_data;
[ref_data, deg_data]= input_filter( ref_data, ref_Nsamples, deg_data, ...
deg_Nsamples);
[ref_VAD, ref_logVAD]= apply_VAD( ref_data, ref_Nsamples);
[deg_VAD, deg_logVAD]= apply_VAD( deg_data, deg_Nsamples);
crude_align (ref_logVAD, ref_Nsamples, deg_logVAD, deg_Nsamples,...
WHOLE_SIGNAL);
utterance_locate (ref_data, ref_Nsamples, ref_VAD, ref_logVAD,...
deg_data, deg_Nsamples, deg_VAD, deg_logVAD);
ref_data= model_ref;
deg_data= model_deg;
% make ref_data and deg_data equal length
if (ref_Nsamples< deg_Nsamples)
newlen= deg_Nsamples+ DATAPADDING_MSECS* (Fs/ 1000);
ref_data( newlen)= 0;
elseif (ref_Nsamples> deg_Nsamples)
newlen= ref_Nsamples+ DATAPADDING_MSECS* (Fs/ 1000);
deg_data( newlen)= 0;
end
pesq_mos= pesq_psychoacoustic_model (ref_data, ref_Nsamples, deg_data, ...
deg_Nsamples );

View File

@@ -0,0 +1,920 @@
function pesq_mos= pesq_psychoacoustic_model (ref_data, ref_Nsamples, deg_data, ...
deg_Nsamples )
global CALIBRATE Nfmax Nb Sl Sp
global nr_of_hz_bands_per_bark_band centre_of_band_bark
global width_of_band_hz centre_of_band_hz width_of_band_bark
global pow_dens_correction_factor abs_thresh_power
global Downsample SEARCHBUFFER DATAPADDING_MSECS Fs Nutterances
global Utt_Start Utt_End Utt_Delay NUMBER_OF_PSQM_FRAMES_PER_SYLLABE
global Fs Plot_Frame
% Plot_Frame= 75; % this is the frame whose spectrum will be plotted
FALSE= 0;
TRUE= 1;
NUMBER_OF_PSQM_FRAMES_PER_SYLLABE= 20;
maxNsamples = max (ref_Nsamples, deg_Nsamples);
Nf = Downsample * 8;
MAX_NUMBER_OF_BAD_INTERVALS = 1000;
start_frame_of_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
stop_frame_of_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
start_sample_of_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
stop_sample_of_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
number_of_samples_in_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
delay_in_samples_in_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
number_of_bad_intervals= 0;
there_is_a_bad_frame= FALSE;
Whanning= hann( Nf, 'periodic');
Whanning= Whanning';
D_POW_F = 2;
D_POW_S = 6;
D_POW_T = 2;
A_POW_F = 1;
A_POW_S = 6;
A_POW_T = 2;
D_WEIGHT= 0.1;
A_WEIGHT= 0.0309;
CRITERIUM_FOR_SILENCE_OF_5_SAMPLES = 500;
samples_to_skip_at_start = 0;
sum_of_5_samples= 0;
while ((sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES) ...
&& (samples_to_skip_at_start < maxNsamples / 2))
sum_of_5_samples= sum( abs( ref_data( samples_to_skip_at_start...
+ SEARCHBUFFER * Downsample + 1: samples_to_skip_at_start...
+ SEARCHBUFFER * Downsample + 5)));
if (sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES)
samples_to_skip_at_start = samples_to_skip_at_start+ 1;
end
end
% fprintf( 'samples_to_skip_at_start is %d\n', samples_to_skip_at_start);
samples_to_skip_at_end = 0;
sum_of_5_samples= 0;
while ((sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES) ...
&& (samples_to_skip_at_end < maxNsamples / 2))
sum_of_5_samples= sum( abs( ref_data( maxNsamples - ...
SEARCHBUFFER* Downsample + DATAPADDING_MSECS* (Fs/ 1000) ...
- samples_to_skip_at_end - 4: maxNsamples - ...
SEARCHBUFFER* Downsample + DATAPADDING_MSECS* (Fs/ 1000) ...
- samples_to_skip_at_end)));
if (sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES)
samples_to_skip_at_end = samples_to_skip_at_end+ 1;
end
end
% fprintf( 'samples_to_skip_at_end is %d\n', samples_to_skip_at_end);
start_frame = floor( samples_to_skip_at_start/ (Nf/ 2));
stop_frame = floor( (maxNsamples- 2* SEARCHBUFFER* Downsample ...
+ DATAPADDING_MSECS* (Fs/ 1000)- samples_to_skip_at_end) ...
/ (Nf/ 2))- 1;
% number of frames in speech data plus DATAPADDING_MSECS
% fprintf( 'start/end frame is %d/%d\n', start_frame, stop_frame);
D_disturbance= zeros( stop_frame+ 1, Nb);
DA_disturbance= zeros( stop_frame+ 1, Nb);
power_ref = pow_of (ref_data, SEARCHBUFFER* Downsample, ...
maxNsamples- SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000),...
maxNsamples- 2* SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000));
power_deg = pow_of (deg_data, SEARCHBUFFER * Downsample, ...
maxNsamples- SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000),...
maxNsamples- 2* SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000));
% fprintf( 'ref/deg power is %f/%f\n', power_ref, power_deg);
hz_spectrum_ref = zeros( 1, Nf/ 2);
hz_spectrum_deg = zeros( 1, Nf/ 2);
frame_is_bad = zeros( 1, stop_frame + 1);
smeared_frame_is_bad = zeros( 1, stop_frame + 1);
silent = zeros( 1, stop_frame + 1);
pitch_pow_dens_ref = zeros( stop_frame + 1, Nb);
pitch_pow_dens_deg = zeros( stop_frame + 1, Nb);
frame_was_skipped = zeros( 1, stop_frame + 1);
frame_disturbance = zeros( 1, stop_frame + 1);
frame_disturbance_asym_add = zeros( 1, stop_frame + 1);
avg_pitch_pow_dens_ref = zeros( 1, Nb);
avg_pitch_pow_dens_deg = zeros( 1, Nb);
loudness_dens_ref = zeros( 1, Nb);
loudness_dens_deg = zeros( 1, Nb);
deadzone = zeros( 1, Nb);
disturbance_dens = zeros( 1, Nb);
disturbance_dens_asym_add = zeros( 1, Nb);
time_weight = zeros( 1, stop_frame + 1);
total_power_ref = zeros( 1, stop_frame + 1);
% fid= fopen( 'tmp_mat.txt', 'wt');
for frame = 0: stop_frame
start_sample_ref = 1+ SEARCHBUFFER * Downsample + frame* (Nf/ 2);
hz_spectrum_ref= short_term_fft (Nf, ref_data, Whanning, ...
start_sample_ref);
utt = Nutterances;
while ((utt >= 1) && ((Utt_Start(utt)- 1)* Downsample+ 1 ...
> start_sample_ref))
utt= utt - 1;
end
if (utt >= 1)
delay = Utt_Delay(utt);
else
delay = Utt_Delay(1);
end
start_sample_deg = start_sample_ref + delay;
if ((start_sample_deg > 0) && (start_sample_deg + Nf- 1 < ...
maxNsamples+ DATAPADDING_MSECS* (Fs/ 1000)))
hz_spectrum_deg= short_term_fft (Nf, deg_data, Whanning, ...
start_sample_deg);
else
hz_spectrum_deg( 1: Nf/ 2)= 0;
end
pitch_pow_dens_ref( frame+ 1, :)= freq_warping (...
hz_spectrum_ref, Nb, frame);
%peak = maximum_of (pitch_pow_dens_ref, 0, Nb);
pitch_pow_dens_deg( frame+ 1, :)= freq_warping (...
hz_spectrum_deg, Nb, frame);
total_audible_pow_ref = total_audible (frame, pitch_pow_dens_ref, 1E2);
total_audible_pow_deg = total_audible (frame, pitch_pow_dens_deg, 1E2);
silent(frame+ 1) = (total_audible_pow_ref < 1E7);
end
% fclose( fid);
avg_pitch_pow_dens_ref= time_avg_audible_of (stop_frame + 1, ...
silent, pitch_pow_dens_ref, floor((maxNsamples- 2* SEARCHBUFFER* ...
Downsample+ DATAPADDING_MSECS* (Fs/ 1000))/ (Nf / 2))- 1);
avg_pitch_pow_dens_deg= time_avg_audible_of (stop_frame + 1, ...
silent, pitch_pow_dens_deg, floor((maxNsamples- 2* SEARCHBUFFER* ...
Downsample+ DATAPADDING_MSECS* (Fs/ 1000))/ (Nf/ 2))- 1);
% fid= fopen( 'tmp_mat.txt', 'wt');
% fprintf( fid, '%f\n', avg_pitch_pow_dens_deg);
% fclose( fid);
if (CALIBRATE== 0)
pitch_pow_dens_ref= freq_resp_compensation (stop_frame + 1, ...
pitch_pow_dens_ref, avg_pitch_pow_dens_ref, ...
avg_pitch_pow_dens_deg, 1000);
if (Plot_Frame>= 0) % plot pitch_pow_dens_ref
figure;
subplot( 1, 2, 1);
plot( centre_of_band_hz, 10* log10( eps+ ...
pitch_pow_dens_ref( Plot_Frame+ 1, :)));
axis( [0 Fs/2 0 95]); %xlabel( 'Hz'); ylabel( 'Db');
title( 'reference signal bark spectrum with frequency compensation');
subplot( 1, 2, 2);
plot( centre_of_band_hz, 10* log10( eps+ ...
pitch_pow_dens_deg( Plot_Frame+ 1, :)));
axis( [0 Fs/2 0 95]); %xlabel( 'Hz'); ylabel( 'Db');
title( 'degraded signal bark spectrum');
end
end
% tmp1= pitch_pow_dens_ref';
MAX_SCALE = 5.0;
MIN_SCALE = 3e-4;
oldScale = 1;
THRESHOLD_BAD_FRAMES = 30;
for frame = 0: stop_frame
total_audible_pow_ref = total_audible (frame, pitch_pow_dens_ref, 1);
total_audible_pow_deg = total_audible (frame, pitch_pow_dens_deg, 1);
total_power_ref (1+ frame) = total_audible_pow_ref;
scale = (total_audible_pow_ref + 5e3)/ (total_audible_pow_deg + 5e3);
if (frame > 0)
scale = 0.2 * oldScale + 0.8 * scale;
end
oldScale = scale;
if (scale > MAX_SCALE)
scale = MAX_SCALE;
elseif (scale < MIN_SCALE)
scale = MIN_SCALE;
end
pitch_pow_dens_deg( 1+ frame, :) = ...
pitch_pow_dens_deg( 1+ frame, :) * scale;
if (frame== Plot_Frame)
figure;
subplot( 1, 2, 1);
plot( centre_of_band_hz, 10* log10( eps+ ...
pitch_pow_dens_ref( Plot_Frame+ 1, :)));
axis( [0 Fs/2 0 95]); %xlabel( 'Hz'); ylabel( 'Db');
subplot( 1, 2, 2);
plot( centre_of_band_hz, 10* log10( eps+ ...
pitch_pow_dens_deg( Plot_Frame+ 1, :)));
axis( [0 Fs/2 0 95]); %xlabel( 'Hz'); ylabel( 'Db');
end
loudness_dens_ref = intensity_warping_of (frame, pitch_pow_dens_ref);
loudness_dens_deg = intensity_warping_of (frame, pitch_pow_dens_deg);
disturbance_dens = loudness_dens_deg - loudness_dens_ref;
if (frame== Plot_Frame)
figure;
subplot( 1, 2, 1);
plot( centre_of_band_hz, 10* log10( eps+ ...
loudness_dens_ref));
axis( [0 Fs/2 0 15]); %xlabel( 'Hz'); ylabel( 'Db');
title( 'reference signal loudness density');
subplot( 1, 2, 2);
plot( centre_of_band_hz, 10* log10( eps+ ...
loudness_dens_deg));
axis( [0 Fs/2 0 15]); %xlabel( 'Hz'); ylabel( 'Db');
title( 'degraded signal loudness density');
end
for band =1: Nb
deadzone (band) = 0.25* min (loudness_dens_deg (band), ...
loudness_dens_ref (band));
end
for band = 1: Nb
d = disturbance_dens (band);
m = deadzone (band);
if (d > m)
disturbance_dens (band) = disturbance_dens (band)- m;
% disturbance_dens (band) = d- m;
else
if (d < -m)
disturbance_dens (band) = disturbance_dens (band)+ m;
% disturbance_dens (band) = d+ m;
else
disturbance_dens (band) = 0;
end
end
end
if (frame== Plot_Frame)
figure;
subplot( 1, 2, 1);
plot( centre_of_band_hz, disturbance_dens);
axis( [0 Fs/2 -1 50]); %xlabel( 'Hz'); ylabel( 'Db');
title( 'disturbance');
end
D_disturbance( frame+ 1, :)= disturbance_dens;
frame_disturbance (1+ frame) = pseudo_Lp (disturbance_dens, D_POW_F);
if (frame_disturbance (1+ frame) > THRESHOLD_BAD_FRAMES)
there_is_a_bad_frame = TRUE;
end
disturbance_dens= multiply_with_asymmetry_factor (...
disturbance_dens, frame, pitch_pow_dens_ref, pitch_pow_dens_deg);
if (frame== Plot_Frame)
subplot( 1, 2, 2);
plot( centre_of_band_hz, disturbance_dens);
axis( [0 Fs/2 -1 50]); %xlabel( 'Hz'); ylabel( 'Db');
title( 'disturbance after asymmetry processing');
end
DA_disturbance( frame+ 1, :)= disturbance_dens;
frame_disturbance_asym_add (1+ frame) = ...
pseudo_Lp (disturbance_dens, A_POW_F);
end
% fid= fopen( 'tmp_mat.txt', 'wt');
% fprintf( fid, '%f\n', frame_disturbance);
% fclose( fid);
frame_was_skipped (1: 1+ stop_frame) = FALSE;
for utt = 2: Nutterances
frame1 = floor (((Utt_Start(utt)- 1- SEARCHBUFFER )* Downsample+ 1+ ...
Utt_Delay(utt))/ (Nf/ 2));
j = floor( floor(((Utt_End(utt-1)- 1- SEARCHBUFFER)* Downsample+ 1+ ...
Utt_Delay(utt-1)))/(Nf/ 2));
delay_jump = Utt_Delay(utt) - Utt_Delay(utt-1);
if (frame1 > j)
frame1 = j;
elseif (frame1 < 0)
frame1 = 0;
end
% fprintf( 'frame1, j, delay_jump is %d, %d, %d\n', frame1, ...
% j, delay_jump);
if (delay_jump < -(Nf/ 2))
frame2 = floor (((Utt_Start(utt)- 1- SEARCHBUFFER)* Downsample+ 1 ...
+ max (0, abs (delay_jump)))/ (Nf/ 2)) + 1;
for frame = frame1: frame2
if (frame < stop_frame)
frame_was_skipped (1+ frame) = TRUE;
frame_disturbance (1+ frame) = 0;
frame_disturbance_asym_add (1+ frame) = 0;
end
end
end
end
nn = DATAPADDING_MSECS* (Fs/ 1000) + maxNsamples;
tweaked_deg = zeros( 1, nn);
% fprintf( 'nn is %d\n', nn);
for i= SEARCHBUFFER* Downsample+ 1: nn- SEARCHBUFFER* Downsample
utt = Nutterances;
while ((utt >= 1) && ((Utt_Start (utt)- 1)* Downsample> i))
utt = utt- 1;
end
if (utt >= 1)
delay = Utt_Delay (utt);
else
delay = Utt_Delay (1);
end
j = i + delay;
if (j < SEARCHBUFFER * Downsample+ 1)
j = SEARCHBUFFER * Downsample+ 1;
end
if (j > nn - SEARCHBUFFER * Downsample)
j = nn - SEARCHBUFFER * Downsample;
end
tweaked_deg (i) = deg_data (j);
end
if (there_is_a_bad_frame)
for frame = 0: stop_frame
frame_is_bad (1+ frame) = (frame_disturbance (1+ frame)...
> THRESHOLD_BAD_FRAMES);
smeared_frame_is_bad (1+ frame) = FALSE;
end
frame_is_bad (1) = FALSE;
SMEAR_RANGE = 2;
for frame = SMEAR_RANGE: stop_frame- 1- SMEAR_RANGE
max_itself_and_left = frame_is_bad (1+ frame);
max_itself_and_right = frame_is_bad (1+ frame);
for i = -SMEAR_RANGE: 0
if (max_itself_and_left < frame_is_bad (1+ frame+ i))
max_itself_and_left = frame_is_bad (1+ frame+ i);
end
end
for i = 0: SMEAR_RANGE
if (max_itself_and_right < frame_is_bad (1+ frame + i))
max_itself_and_right = frame_is_bad (1+ frame + i);
end
end
mini = max_itself_and_left;
if (mini > max_itself_and_right)
mini = max_itself_and_right;
end
smeared_frame_is_bad (1+ frame) = mini;
end
MINIMUM_NUMBER_OF_BAD_FRAMES_IN_BAD_INTERVAL = 5;
number_of_bad_intervals = 0;
frame = 0;
while (frame <= stop_frame)
while ((frame <= stop_frame) && (~smeared_frame_is_bad (1+ frame)))
frame= frame+ 1;
end
if (frame <= stop_frame)
start_frame_of_bad_interval(1+ number_of_bad_intervals)= ...
1+ frame;
while ((frame <= stop_frame) && (...
smeared_frame_is_bad (1+ frame)))
frame= frame+ 1;
end
if (frame <= stop_frame)
stop_frame_of_bad_interval(1+ number_of_bad_intervals)= ...
1+ frame;
if (stop_frame_of_bad_interval(1+ number_of_bad_intervals)- ...
start_frame_of_bad_interval(1+ number_of_bad_intervals)...
>= MINIMUM_NUMBER_OF_BAD_FRAMES_IN_BAD_INTERVAL)
number_of_bad_intervals= number_of_bad_intervals+ 1;
end
end
end
end
for bad_interval = 0: number_of_bad_intervals - 1
start_sample_of_bad_interval(1+ bad_interval) = ...
(start_frame_of_bad_interval(1+ bad_interval)- 1) * (Nf/ 2) ...
+ SEARCHBUFFER * Downsample+ 1;
stop_sample_of_bad_interval(1+ bad_interval) = ...
(stop_frame_of_bad_interval(1+ bad_interval)- 1) * (Nf/ 2) ...
+ Nf + SEARCHBUFFER* Downsample;
if (stop_frame_of_bad_interval(1+ bad_interval) > stop_frame+ 1)
stop_frame_of_bad_interval(1+ bad_interval) = stop_frame+ 1;
end
number_of_samples_in_bad_interval(1+ bad_interval) = ...
stop_sample_of_bad_interval(1+ bad_interval) - ...
start_sample_of_bad_interval(1+ bad_interval)+ 1;
end
% fprintf( 'number of bad intervals %d\n', number_of_bad_intervals);
% fprintf( '%d %d\n', number_of_samples_in_bad_interval(1), ...
% number_of_samples_in_bad_interval(2));
% fprintf( '%d %d\n', start_sample_of_bad_interval(1), ...
% start_sample_of_bad_interval(2));
SEARCH_RANGE_IN_TRANSFORM_LENGTH = 4;
search_range_in_samples= SEARCH_RANGE_IN_TRANSFORM_LENGTH * Nf;
for bad_interval= 0: number_of_bad_intervals- 1
ref = zeros (1, 2 * search_range_in_samples + ...
number_of_samples_in_bad_interval (1+ bad_interval));
deg = zeros (1, 2 * search_range_in_samples + ...
number_of_samples_in_bad_interval (1+ bad_interval));
ref(1: search_range_in_samples) = 0;
ref (search_range_in_samples+ 1: search_range_in_samples+ ...
number_of_samples_in_bad_interval (1+ bad_interval)) = ...
ref_data (start_sample_of_bad_interval( 1+ bad_interval) + 1: ...
start_sample_of_bad_interval( 1+ bad_interval) + ...
number_of_samples_in_bad_interval (1+ bad_interval));
ref (search_range_in_samples + ...
number_of_samples_in_bad_interval (1+ bad_interval) + 1: ...
search_range_in_samples + ...
number_of_samples_in_bad_interval (1+ bad_interval) + ...
search_range_in_samples) = 0;
for i = 0: 2 * search_range_in_samples + ...
number_of_samples_in_bad_interval (1+ bad_interval) - 1
j = start_sample_of_bad_interval (1+ bad_interval) - ...
search_range_in_samples + i;
nn = maxNsamples - SEARCHBUFFER * Downsample + ...
DATAPADDING_MSECS * (Fs / 1000);
if (j <= SEARCHBUFFER * Downsample)
j = SEARCHBUFFER * Downsample+ 1;
end
if (j > nn)
j = nn;
end
deg (1+ i) = tweaked_deg (j);
end
[delay_in_samples, best_correlation]= compute_delay ...
(1, 2 * search_range_in_samples + ...
number_of_samples_in_bad_interval (1+ bad_interval), ...
search_range_in_samples, ref, deg);
delay_in_samples_in_bad_interval (1+ bad_interval) = ...
delay_in_samples;
% fprintf( 'delay_in_samples, best_correlation is \n\t%d, %f\n', ...
% delay_in_samples, best_correlation);
%
if (best_correlation < 0.5)
delay_in_samples_in_bad_interval (1+ bad_interval) = 0;
end
end
if (number_of_bad_intervals > 0)
doubly_tweaked_deg = tweaked_deg( 1: maxNsamples + ...
DATAPADDING_MSECS * (Fs / 1000));
for bad_interval= 0: number_of_bad_intervals- 1
delay = delay_in_samples_in_bad_interval (1+ bad_interval);
for i = start_sample_of_bad_interval (1+ bad_interval): ...
stop_sample_of_bad_interval (1+ bad_interval)
j = i + delay;
if (j < 1)
j = 1;
end
if (j > maxNsamples)
j = maxNsamples;
end
h = tweaked_deg (j);
doubly_tweaked_deg (i) = h;
end
end
untweaked_deg = deg_data;
deg_data = doubly_tweaked_deg;
for bad_interval= 0: number_of_bad_intervals- 1
for frame = start_frame_of_bad_interval (1+ bad_interval): ...
stop_frame_of_bad_interval (1+ bad_interval)- 1
frame= frame- 1;
start_sample_ref = SEARCHBUFFER * Downsample + ...
frame * Nf / 2+ 1;
start_sample_deg = start_sample_ref;
hz_spectrum_deg= short_term_fft (Nf, deg_data, ...
Whanning, start_sample_deg);
pitch_pow_dens_deg( 1+ frame, :)= freq_warping (...
hz_spectrum_deg, Nb, frame);
end
oldScale = 1;
for frame = start_frame_of_bad_interval (1+ bad_interval): ...
stop_frame_of_bad_interval (1+ bad_interval)- 1
frame= frame- 1;
% see implementation for detail why 1 needed to be
% subtracted
total_audible_pow_ref = total_audible (frame, ...
pitch_pow_dens_ref, 1);
total_audible_pow_deg = total_audible (frame, ...
pitch_pow_dens_deg, 1);
scale = (total_audible_pow_ref + 5e3) / ...
(total_audible_pow_deg + 5e3);
if (frame > 0)
scale = 0.2 * oldScale + 0.8*scale;
end
oldScale = scale;
if (scale > MAX_SCALE)
scale = MAX_SCALE;
end
if (scale < MIN_SCALE)
scale = MIN_SCALE;
end
pitch_pow_dens_deg (1+ frame, :) = ...
pitch_pow_dens_deg (1+ frame, :)* scale;
loudness_dens_ref= intensity_warping_of (frame, ...
pitch_pow_dens_ref);
loudness_dens_deg= intensity_warping_of (frame, ...
pitch_pow_dens_deg);
disturbance_dens = loudness_dens_deg - loudness_dens_ref;
for band = 1: Nb
deadzone(band) = min (loudness_dens_deg(band), ...
loudness_dens_ref(band));
deadzone(band) = deadzone(band)* 0.25;
end
for band = 1: Nb
d = disturbance_dens (band);
m = deadzone (band);
if (d > m)
disturbance_dens (band) = ...
disturbance_dens (band)- m;
else
if (d < -m)
disturbance_dens (band) = ...
disturbance_dens (band)+ m;
else
disturbance_dens (band) = 0;
end
end
end
frame_disturbance( 1+ frame) = min (...
frame_disturbance( 1+ frame), pseudo_Lp(...
disturbance_dens, D_POW_F));
disturbance_dens= multiply_with_asymmetry_factor ...
(disturbance_dens, frame, pitch_pow_dens_ref, ...
pitch_pow_dens_deg);
frame_disturbance_asym_add(1+ frame) = min (...
frame_disturbance_asym_add(1+ frame), ...
pseudo_Lp (disturbance_dens, A_POW_F));
end
end
deg_data = untweaked_deg;
end
end
for frame = 0: stop_frame
h = 1;
if (stop_frame + 1 > 1000)
n = floor( (maxNsamples - 2 * SEARCHBUFFER * Downsample)...
/ (Nf / 2)) - 1;
timeWeightFactor = (n - 1000) / 5500;
if (timeWeightFactor > 0.5)
timeWeightFactor = 0.5;
end
h = (1.0 - timeWeightFactor) + timeWeightFactor * frame / n;
end
time_weight (1 +frame) = h;
end
% fid= fopen( 'tmp_mat1.txt', 'at');
% fprintf( '\n');
for frame = 0: stop_frame
h = ((total_power_ref (1+ frame) + 1e5) / 1e7)^ 0.04;
% if (frame== 118)
% fprintf( '%f\n', h);
% fprintf( '%f\n', frame_disturbance( 1+ frame));
% end
frame_disturbance( 1+ frame) = frame_disturbance( 1+ frame)/ h;
% if (frame== 118)
% fprintf( '%f\n', frame_disturbance( 1+ frame));
% end
%
frame_disturbance_asym_add( 1+ frame) = ...
frame_disturbance_asym_add( 1+ frame)/ h;
if (frame_disturbance( 1+ frame) > 45)
frame_disturbance( 1+ frame) = 45;
end
if (frame_disturbance_asym_add( 1+ frame)> 45)
frame_disturbance_asym_add( 1+ frame) = 45;
end
end
% fclose ( fid);
d_indicator = Lpq_weight (start_frame, stop_frame, ...
D_POW_S, D_POW_T, frame_disturbance, time_weight);
a_indicator = Lpq_weight (start_frame, stop_frame, ...
A_POW_S, A_POW_T, frame_disturbance_asym_add, time_weight);
pesq_mos = 4.5 - D_WEIGHT * d_indicator - A_WEIGHT * a_indicator;
if (Plot_Frame> 0)
figure;
subplot( 1, 2, 1);
mesh( 0: stop_frame, centre_of_band_hz, D_disturbance');
title( 'disturbance');
subplot( 1, 2, 2);
mesh( 0: stop_frame, centre_of_band_hz, DA_disturbance');
title( 'disturbance after asymmetry processing');
end
% fid= fopen( 'tmp_mat.txt', 'wt');
% fprintf( fid, 'time_weight\n');
% fprintf( fid, '%f\n', time_weight);
% fprintf( fid, 'frame_disturbance:\n');
% fprintf( fid, '%f\n', frame_disturbance);
% fprintf( fid, 'frame_disturbance_asym_add\n');
% fprintf( fid, '%f\n', frame_disturbance_asym_add);
% fclose( fid);
function result_time= Lpq_weight(start_frame, stop_frame, ...
power_syllable, power_time, frame_disturbance, time_weight)
global NUMBER_OF_PSQM_FRAMES_PER_SYLLABE
% fid= fopen( 'tmp_mat1.txt', 'at');
% fprintf( 'result_time:\n');
result_time= 0;
total_time_weight_time = 0;
% fprintf( 'start/end frame: %d/%d\n', start_frame, stop_frame);
for start_frame_of_syllable = start_frame: ...
NUMBER_OF_PSQM_FRAMES_PER_SYLLABE/2: stop_frame
result_syllable = 0;
count_syllable = 0;
for frame = start_frame_of_syllable: ...
start_frame_of_syllable + NUMBER_OF_PSQM_FRAMES_PER_SYLLABE- 1
if (frame <= stop_frame)
h = frame_disturbance(1+ frame);
% if (start_frame_of_syllable== 101)
% fprintf( fid, '%f\n', h);
% end
result_syllable = result_syllable+ (h^ power_syllable);
end
count_syllable = count_syllable+ 1;
end
result_syllable = result_syllable/ count_syllable;
result_syllable = result_syllable^ (1/power_syllable);
result_time= result_time+ (time_weight (...
1+ start_frame_of_syllable - start_frame) * ...
result_syllable)^ power_time;
total_time_weight_time = total_time_weight_time+ ...
time_weight (1+ start_frame_of_syllable - start_frame)^ power_time;
% fprintf( fid, '%f\n', result_time);
end
% fclose (fid);
% fprintf( 'total_time_weight_time is %f\n', total_time_weight_time);
result_time = result_time/ total_time_weight_time;
result_time= result_time^ (1/ power_time);
% fprintf( 'result_time is %f\n\n', result_time);
function [best_delay, max_correlation] = compute_delay (...
start_sample, stop_sample, search_range, ...
time_series1, time_series2)
n = stop_sample - start_sample+ 1;
power_of_2 = 2^ (ceil( log2( 2 * n)));
power1 = pow_of (time_series1, start_sample, stop_sample, n)* ...
n/ power_of_2;
power2 = pow_of (time_series2, start_sample, stop_sample, n)* ...
n/ power_of_2;
normalization = sqrt (power1 * power2);
% fprintf( 'normalization is %f\n', normalization);
if ((power1 <= 1e-6) || (power2 <= 1e-6))
max_correlation = 0;
best_delay= 0;
end
x1( 1: power_of_2)= 0;
x2( 1: power_of_2)= 0;
y( 1: power_of_2)= 0;
x1( 1: n)= abs( time_series1( start_sample: ...
stop_sample));
x2( 1: n)= abs( time_series2( start_sample: ...
stop_sample));
x1_fft= fft( x1, power_of_2)/ power_of_2;
x2_fft= fft( x2, power_of_2);
x1_fft_conj= conj( x1_fft);
y= ifft( x1_fft_conj.* x2_fft, power_of_2);
best_delay = 0;
max_correlation = 0;
% these loop can be rewritten
for i = -search_range: -1
h = abs (y (1+ i + power_of_2)) / normalization;
if (h > max_correlation)
max_correlation = h;
best_delay= i;
end
end
for i = 0: search_range- 1
h = abs (y (1+i)) / normalization;
if (h > max_correlation)
max_correlation = h;
best_delay= i;
end
end
best_delay= best_delay- 1;
function mod_disturbance_dens= multiply_with_asymmetry_factor (...
disturbance_dens, frame, pitch_pow_dens_ref, pitch_pow_dens_deg)
global Nb
for i = 1: Nb
ratio = (pitch_pow_dens_deg(1+ frame, i) + 50)...
/ (pitch_pow_dens_ref (1+ frame, i) + 50);
h = ratio^ 1.2;
if (h > 12)
h = 12;
elseif (h < 3)
h = 0.0;
end
mod_disturbance_dens (i) = disturbance_dens (i) * h;
end
function loudness_dens = intensity_warping_of (...
frame, pitch_pow_dens)
global abs_thresh_power Sl Nb centre_of_band_bark
ZWICKER_POWER= 0.23;
for band = 1: Nb
threshold = abs_thresh_power (band);
input = pitch_pow_dens (1+ frame, band);
if (centre_of_band_bark (band) < 4)
h = 6 / (centre_of_band_bark (band) + 2);
else
h = 1;
end
if (h > 2)
h = 2;
end
h = h^ 0.15;
modified_zwicker_power = ZWICKER_POWER * h;
if (input > threshold)
loudness_dens (band) = ((threshold / 0.5)^ modified_zwicker_power)...
* ((0.5 + 0.5 * input / threshold)^ modified_zwicker_power- 1);
else
loudness_dens (band) = 0;
end
loudness_dens (band) = loudness_dens (band)* Sl;
end
function result= pseudo_Lp (x, p)
global Nb width_of_band_bark
totalWeight = 0;
result = 0;
for band = 2: Nb
h = abs (x (band));
w = width_of_band_bark (band);
prod = h * w;
result = result+ prod^ p;
totalWeight = totalWeight+ w;
end
result = (result/ totalWeight)^ (1/p);
result = result* totalWeight;
function mod_pitch_pow_dens_ref= freq_resp_compensation (number_of_frames, ...
pitch_pow_dens_ref, avg_pitch_pow_dens_ref, ...
avg_pitch_pow_dens_deg, constant)
global Nb
for band = 1: Nb
x = (avg_pitch_pow_dens_deg (band) + constant) / ...
(avg_pitch_pow_dens_ref (band) + constant);
if (x > 100.0)
x = 100.0;
elseif (x < 0.01)
x = 0.01;
end
for frame = 1: number_of_frames
mod_pitch_pow_dens_ref(frame, band) = ...
pitch_pow_dens_ref(frame, band) * x;
end
end
function avg_pitch_pow_dens= time_avg_audible_of(number_of_frames, ...
silent, pitch_pow_dens, total_number_of_frames)
global Nb abs_thresh_power
for band = 1: Nb
result = 0;
for frame = 1: number_of_frames
if (~silent (frame))
h = pitch_pow_dens (frame, band);
if (h > 100 * abs_thresh_power (band))
result = result + h;
end
end
avg_pitch_pow_dens (band) = result/ total_number_of_frames;
end
end
function hz_spectrum= short_term_fft (Nf, data, Whanning, start_sample)
x1= data( start_sample: start_sample+ Nf-1).* Whanning;
x1_fft= fft( x1);
hz_spectrum= abs( x1_fft( 1: Nf/ 2)).^ 2;
hz_spectrum( 1)= 0;
function pitch_pow_dens= freq_warping( hz_spectrum, Nb, frame)
global nr_of_hz_bands_per_bark_band pow_dens_correction_factor
global Sp
hz_band = 1;
for bark_band = 1: Nb
n = nr_of_hz_bands_per_bark_band (bark_band);
sum = 0;
for i = 1: n
sum = sum+ hz_spectrum( hz_band);
hz_band= hz_band+ 1;
end
sum = sum* pow_dens_correction_factor (bark_band);
sum = sum* Sp;
pitch_pow_dens (bark_band) = sum;
end
function total_audible_pow = total_audible (frame, ...
pitch_pow_dens, factor)
global Nb abs_thresh_power
total_audible_pow = 0;
for band= 2: Nb
h = pitch_pow_dens (frame+ 1,band);
threshold = factor * abs_thresh_power (band);
if (h > threshold)
total_audible_pow = total_audible_pow+ h;
end
end

View File

@@ -0,0 +1,3 @@
function power= pow_of( data, start_point, end_point, divisor)
power= sum( data( start_point: end_point).^ 2)/ divisor;

View File

@@ -0,0 +1,301 @@
function setup_global( sampling_rate);
global Downsample InIIR_Hsos InIIR_Nsos Align_Nfft
global DATAPADDING_MSECS SEARCHBUFFER Fs MINSPEECHLGTH JOINSPEECHLGTH
global Nutterances Largest_uttsize Nsurf_samples Crude_DelayEst
global Crude_DelayConf UttSearch_Start UttSearch_End Utt_DelayEst
global Utt_Delay Utt_DelayConf Utt_Start Utt_End
global MAXNUTTERANCES WHOLE_SIGNAL
global pesq_mos subj_mos cond_nr MINUTTLENGTH
global CALIBRATE Nfmax Nb Sl Sp
global nr_of_hz_bands_per_bark_band centre_of_band_bark
global width_of_band_hz centre_of_band_hz width_of_band_bark
global pow_dens_correction_factor abs_thresh_power
CALIBRATE= 0;
Nfmax= 512;
MAXNUTTERANCES= 50;
MINUTTLENGTH= 50;
WHOLE_SIGNAL= -1;
UttSearch_Star= zeros( 1, MAXNUTTERANCES);
UttSearch_End= zeros( 1, MAXNUTTERANCES);
Utt_DelayEst= zeros( 1, MAXNUTTERANCES);
Utt_Delay= zeros( 1, MAXNUTTERANCES);
Utt_DelayConf= zeros( 1, MAXNUTTERANCES);
Utt_Start= zeros( 1, MAXNUTTERANCES);
Utt_End= zeros( 1, MAXNUTTERANCES);
DATAPADDING_MSECS= 320;
SEARCHBUFFER= 75;
MINSPEECHLGTH= 4;
JOINSPEECHLGTH= 50;
Sp_16k = 6.910853e-006;
Sl_16k = 1.866055e-001;
fs_16k= 16000;
Downsample_16k = 64;
Align_Nfft_16k = 1024;
InIIR_Nsos_16k = 12;
InIIR_Hsos_16k = [
0.325631521, -0.086782860, -0.238848661, -1.079416490, 0.434583902;
0.403961804, -0.556985881, 0.153024077, -0.415115835, 0.696590244;
4.736162769, 3.287251046, 1.753289019, -1.859599046, 0.876284034;
0.365373469, 0.000000000, 0.000000000, -0.634626531, 0.000000000;
0.884811506, 0.000000000, 0.000000000, -0.256725271, 0.141536777;
0.723593055, -1.447186099, 0.723593044, -1.129587469, 0.657232737;
1.644910855, -1.817280902, 1.249658063, -1.778403899, 0.801724355;
0.633692689, -0.284644314, -0.319789663, 0.000000000, 0.000000000;
1.032763031, 0.268428979, 0.602913323, 0.000000000, 0.000000000;
1.001616361, -0.823749013, 0.439731942, -0.885778255, 0.000000000;
0.752472096, -0.375388990, 0.188977609, -0.077258216, 0.247230734;
1.023700575, 0.001661628, 0.521284240, -0.183867259, 0.354324187
];
Sp_8k = 2.764344e-5;
Sl_8k = 1.866055e-1;
fs_8k= 8000;
Downsample_8k = 32;
Align_Nfft_8k = 512;
InIIR_Nsos_8k = 8;
InIIR_Hsos_8k = [
0.885535424, -0.885535424, 0.000000000, -0.771070709, 0.000000000;
0.895092588, 1.292907193, 0.449260174, 1.268869037, 0.442025372;
4.049527940, -7.865190042, 3.815662102, -1.746859852, 0.786305963;
0.500002353, -0.500002353, 0.000000000, 0.000000000, 0.000000000;
0.565002834, -0.241585934, -0.306009671, 0.259688659, 0.249979657;
2.115237288, 0.919935084, 1.141240051, -1.587313419, 0.665935315;
0.912224584, -0.224397719, -0.641121413, -0.246029464, -0.556720590;
0.444617727, -0.307589321, 0.141638062, -0.996391149, 0.502251622
];
nr_of_hz_bands_per_bark_band_8k = [
1, 1, 1, 1, 1, 1, 1, 1, 2, 1, ...
1, 1, 1, 1, 2, 1, 1, 2, 2, 2, ...
2, 2, 2, 2, 2, 3, 3, 3, 3, 4, ...
3, 4, 5, 4, 5, 6, 6, 7, 8, 9, ...
9, 11
];
centre_of_band_bark_8k = [
0.078672, 0.316341, 0.636559, 0.961246, 1.290450, ...
1.624217, 1.962597, 2.305636, 2.653383, 3.005889, ...
3.363201, 3.725371, 4.092449, 4.464486, 4.841533, ...
5.223642, 5.610866, 6.003256, 6.400869, 6.803755, ...
7.211971, 7.625571, 8.044611, 8.469146, 8.899232, ...
9.334927, 9.776288, 10.223374, 10.676242, 11.134952,...
11.599563, 12.070135, 12.546731, 13.029408, 13.518232,...
14.013264, 14.514566, 15.022202, 15.536238, 16.056736,...
16.583761, 17.117382
];
centre_of_band_hz_8k = [
7.867213, 31.634144, 63.655895, 96.124611, 129.044968,...
162.421738, 196.259659, 230.563568, 265.338348, 300.588867,...
336.320129, 372.537140, 409.244934, 446.448578, 484.568604,...
526.600586, 570.303833, 619.423340, 672.121643, 728.525696,...
785.675964, 846.835693, 909.691650, 977.063293, 1049.861694,...
1129.635986, 1217.257568, 1312.109497, 1412.501465, 1517.999390,...
1628.894165, 1746.194336, 1871.568848, 2008.776123, 2158.979248,...
2326.743164, 2513.787109, 2722.488770, 2952.586670, 3205.835449,...
3492.679932, 3820.219238
];
width_of_band_bark_8k = [
0.157344, 0.317994, 0.322441, 0.326934, 0.331474, ...
0.336061, 0.340697, 0.345381, 0.350114, 0.354897, ...
0.359729, 0.364611, 0.369544, 0.374529, 0.379565, ...
0.384653, 0.389794, 0.394989, 0.400236, 0.405538, ...
0.410894, 0.416306, 0.421773, 0.427297, 0.432877, ...
0.438514, 0.444209, 0.449962, 0.455774, 0.461645, ...
0.467577, 0.473569, 0.479621, 0.485736, 0.491912, ...
0.498151, 0.504454, 0.510819, 0.517250, 0.523745, ...
0.530308, 0.536934
];
width_of_band_hz_8k = [
15.734426, 31.799433, 32.244064, 32.693359, 33.147385, ...
33.606140, 34.069702, 34.538116, 35.011429, 35.489655, ...
35.972870, 36.461121, 36.954407, 37.452911, 40.269653, ...
42.311859, 45.992554, 51.348511, 55.040527, 56.775208, ...
58.699402, 62.445862, 64.820923, 69.195374, 76.745667, ...
84.016235, 90.825684, 97.931152, 103.348877, 107.801880, ...
113.552246, 121.490601, 130.420410, 143.431763, 158.486816, ...
176.872803, 198.314697, 219.549561, 240.600098, 268.702393, ...
306.060059, 349.937012
];
pow_dens_correction_factor_8k = [
100.000000, 99.999992, 100.000000, 100.000008, 100.000008,...
100.000015, 99.999992, 99.999969, 50.000027, 100.000000,...
99.999969, 100.000015, 99.999947, 100.000061, 53.047077, ...
110.000046, 117.991989, 65.000000, 68.760147, 69.999931, ...
71.428818, 75.000038, 76.843384, 80.968781, 88.646126, ...
63.864388, 68.155350, 72.547775, 75.584831, 58.379192,...
80.950836, 64.135651, 54.384785, 73.821884, 64.437073, ...
59.176456, 65.521278, 61.399822, 58.144047, 57.004543,...
64.126297, 59.248363
];
abs_thresh_power_8k = [
51286152, 2454709.500, 70794.593750, ...
4897.788574, 1174.897705, 389.045166, ...
104.712860, 45.708820, 17.782795, ...
9.772372, 4.897789, 3.090296, ...
1.905461, 1.258925, 0.977237, ...
0.724436, 0.562341, 0.457088, ...
0.389045, 0.331131, 0.295121, ...
0.269153, 0.257040, 0.251189, ...
0.251189, 0.251189, 0.251189, ...
0.263027, 0.288403, 0.309030, ...
0.338844, 0.371535, 0.398107, ...
0.436516, 0.467735, 0.489779, ...
0.501187, 0.501187, 0.512861, ...
0.524807, 0.524807, 0.524807
];
nr_of_hz_bands_per_bark_band_16k = [
1, 1, 1, 1, 1, 1, 1, 1, 2, 1, ...
1, 1, 1, 1, 2, 1, 1, 2, 2, 2, ...
2, 2, 2, 2, 2, 3, 3, 3, 3, 4, ...
3, 4, 5, 4, 5, 6, 6, 7, 8, 9, ...
9, 12, 12, 15, 16, 18, 21, 25, 20
];
centre_of_band_bark_16k = [
0.078672, 0.316341, 0.636559, 0.961246, 1.290450, ...
1.624217, 1.962597, 2.305636, 2.653383, 3.005889, ...
3.363201, 3.725371, 4.092449, 4.464486, 4.841533, ...
5.223642, 5.610866, 6.003256, 6.400869, 6.803755, ...
7.211971, 7.625571, 8.044611, 8.469146, 8.899232, ...
9.334927, 9.776288, 10.223374, 10.676242, 11.134952, ...
11.599563, 12.070135, 12.546731, 13.029408, 13.518232, ...
14.013264, 14.514566, 15.022202, 15.536238, 16.056736, ...
16.583761, 17.117382, 17.657663, 18.204674, 18.758478, ...
19.319147, 19.886751, 20.461355, 21.043034
];
centre_of_band_hz_16k = [
7.867213, 31.634144, 63.655895, 96.124611, 129.044968,...
162.421738, 196.259659, 230.563568, 265.338348, 300.588867,...
336.320129, 372.537140, 409.244934, 446.448578, 484.568604,...
526.600586, 570.303833, 619.423340, 672.121643, 728.525696,...
785.675964, 846.835693, 909.691650, 977.063293, 1049.861694,...
1129.635986, 1217.257568, 1312.109497, 1412.501465, 1517.999390,...
1628.894165, 1746.194336, 1871.568848, 2008.776123, 2158.979248,...
2326.743164, 2513.787109, 2722.488770, 2952.586670, 3205.835449,...
3492.679932, 3820.219238, 4193.938477, 4619.846191, 5100.437012,...
5636.199219, 6234.313477, 6946.734863, 7796.473633
];
width_of_band_bark_16k = [
0.157344, 0.317994, 0.322441, 0.326934, 0.331474,...
0.336061, 0.340697, 0.345381, 0.350114, 0.354897,...
0.359729, 0.364611, 0.369544, 0.374529, 0.379565,...
0.384653, 0.389794, 0.394989, 0.400236, 0.405538,...
0.410894, 0.416306, 0.421773, 0.427297, 0.432877,...
0.438514, 0.444209, 0.449962, 0.455774, 0.461645,...
0.467577, 0.473569, 0.479621, 0.485736, 0.491912,...
0.498151, 0.504454, 0.510819, 0.517250, 0.523745,...
0.530308, 0.536934, 0.543629, 0.550390, 0.557220,...
0.564119, 0.571085, 0.578125, 0.585232
];
width_of_band_hz_16k = [
15.734426, 31.799433, 32.244064, 32.693359, ...
33.147385, 33.606140, 34.069702, 34.538116, ...
35.011429, 35.489655, 35.972870, 36.461121, ...
36.954407, 37.452911, 40.269653, 42.311859, ...
45.992554, 51.348511, 55.040527, 56.775208, ...
58.699402, 62.445862, 64.820923, 69.195374, ...
76.745667, 84.016235, 90.825684, 97.931152, ...
103.348877, 107.801880, 113.552246, 121.490601, ...
130.420410, 143.431763, 158.486816, 176.872803, ...
198.314697, 219.549561, 240.600098, 268.702393, ...
306.060059, 349.937012, 398.686279, 454.713867, ...
506.841797, 564.863770, 637.261230, 794.717285, ...
931.068359
];
pow_dens_correction_factor_16k = [
100.000000, 99.999992, 100.000000, 100.000008,...
100.000008, 100.000015, 99.999992, 99.999969, ...
50.000027, 100.000000, 99.999969, 100.000015, ...
99.999947, 100.000061, 53.047077, 110.000046, ...
117.991989, 65.000000, 68.760147, 69.999931, ...
71.428818, 75.000038, 76.843384, 80.968781, ...
88.646126, 63.864388, 68.155350, 72.547775, ...
75.584831, 58.379192, 80.950836, 64.135651, ...
54.384785, 73.821884, 64.437073, 59.176456, ...
65.521278, 61.399822, 58.144047, 57.004543, ...
64.126297, 54.311001, 61.114979, 55.077751, ...
56.849335, 55.628868, 53.137054, 54.985844, ...
79.546974
];
abs_thresh_power_16k = [
51286152.00, 2454709.500, 70794.593750, ...
4897.788574, 1174.897705, 389.045166, ...
104.712860, 45.708820, 17.782795, ...
9.772372, 4.897789, 3.090296, ...
1.905461, 1.258925, 0.977237, ...
0.724436, 0.562341, 0.457088, ...
0.389045, 0.331131, 0.295121, ...
0.269153, 0.257040, 0.251189, ...
0.251189, 0.251189, 0.251189, ...
0.263027, 0.288403, 0.309030, ...
0.338844, 0.371535, 0.398107, ...
0.436516, 0.467735, 0.489779, ...
0.501187, 0.501187, 0.512861, ...
0.524807, 0.524807, 0.524807, ...
0.512861, 0.478630, 0.426580, ...
0.371535, 0.363078, 0.416869, ...
0.537032
];
if (sampling_rate== fs_16k)
Downsample = Downsample_16k;
InIIR_Hsos = InIIR_Hsos_16k;
InIIR_Nsos = InIIR_Nsos_16k;
Align_Nfft = Align_Nfft_16k;
Fs= fs_16k;
Nb = 49;
Sl = Sl_16k;
Sp = Sp_16k;
nr_of_hz_bands_per_bark_band = nr_of_hz_bands_per_bark_band_16k;
centre_of_band_bark = centre_of_band_bark_16k;
centre_of_band_hz = centre_of_band_hz_16k;
width_of_band_bark = width_of_band_bark_16k;
width_of_band_hz = width_of_band_hz_16k;
pow_dens_correction_factor = pow_dens_correction_factor_16k;
abs_thresh_power = abs_thresh_power_16k;
return;
end
if (sampling_rate== fs_8k)
Downsample = Downsample_8k;
InIIR_Hsos = InIIR_Hsos_8k;
InIIR_Nsos = InIIR_Nsos_8k;
Align_Nfft = Align_Nfft_8k;
Fs= fs_8k;
Nb = 42;
Sl = Sl_8k;
Sp = Sp_8k;
nr_of_hz_bands_per_bark_band = nr_of_hz_bands_per_bark_band_8k;
centre_of_band_bark = centre_of_band_bark_8k;
centre_of_band_hz = centre_of_band_hz_8k;
width_of_band_bark = width_of_band_bark_8k;
width_of_band_hz = width_of_band_hz_8k;
pow_dens_correction_factor = pow_dens_correction_factor_8k;
abs_thresh_power = abs_thresh_power_8k;
return;
end

View File

@@ -0,0 +1,390 @@
function split_align( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ...
deg_data, deg_Nsamples, deg_VAD, deg_logVAD, ...
Utt_Start_l, Utt_SpeechStart, Utt_SpeechEnd, Utt_End_l, ...
Utt_DelayEst_l, Utt_DelayConf_l)
global MAXNUTTERANCES Align_Nfft Downsample Window
global Utt_DelayEst Utt_Delay UttSearch_Start UttSearch_End
global Best_ED1 Best_D1 Best_DC1 Best_ED2 Best_D2 Best_DC2 Best_BP
Utt_BPs= zeros( 1, 41);
Utt_ED1= zeros( 1, 41);
Utt_ED2= zeros( 1, 41);
Utt_D1= zeros( 1, 41);
Utt_D2= zeros( 1, 41);
Utt_DC1= zeros( 1, 41);
Utt_DC2= zeros( 1, 41);
Utt_Len = Utt_SpeechEnd - Utt_SpeechStart;
Utt_Test = MAXNUTTERANCES;
Best_DC1 = 0.0;
Best_DC2 = 0.0;
kernel = Align_Nfft / 64;
Delta = Align_Nfft / (4 * Downsample);
Step = floor( ((0.801 * Utt_Len + 40 * Delta - 1)/(40 * Delta)));
Step = Step* Delta;
% fprintf( 'Step is %f\n', Step);
Pad = floor( Utt_Len / 10);
if( Pad < 75 )
Pad = 75;
end
Utt_BPs(1) = Utt_SpeechStart + Pad;
N_BPs = 1;
while( 1)
N_BPs= N_BPs+ 1;
Utt_BPs(N_BPs)= Utt_BPs(N_BPs- 1)+ Step;
if (~((Utt_BPs(N_BPs) <= (Utt_SpeechEnd- Pad)) && (N_BPs <= 40) ))
break;
end
end
if( N_BPs <= 1 )
return;
end
% fprintf( 'Utt_DelayEst_l, Utt_Start_l, N_BPs is %d,%d,%d\n', ...
% Utt_DelayEst_l, Utt_Start_l, N_BPs);
for bp = 1: N_BPs- 1
Utt_DelayEst(Utt_Test) = Utt_DelayEst_l;
UttSearch_Start(Utt_Test) = Utt_Start_l;
UttSearch_End(Utt_Test) = Utt_BPs(bp);
% fprintf( 'bp,Utt_BPs(%d) is %d,%d\n', bp,bp,Utt_BPs(bp));
crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, ...
deg_Nsamples, MAXNUTTERANCES);
Utt_ED1(bp) = Utt_Delay(Utt_Test);
Utt_DelayEst(Utt_Test) = Utt_DelayEst_l;
UttSearch_Start(Utt_Test) = Utt_BPs(bp);
UttSearch_End(Utt_Test) = Utt_End_l;
crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, ...
deg_Nsamples, MAXNUTTERANCES);
Utt_ED2(bp) = Utt_Delay(Utt_Test);
end
% stream = fopen( 'matmat.txt', 'wt' );
% for count= 1: N_BPs- 1
% fprintf( stream, '%d\n', Utt_ED2(count));
% end
% fclose( stream );
Utt_DC1(1: N_BPs-1) = -2.0;
% stream= fopen( 'what_mmm.txt', 'at');
while( 1 )
bp = 1;
while( (bp <= N_BPs- 1) && (Utt_DC1(bp) > -2.0) )
bp = bp+ 1;
end
if( bp >= N_BPs )
break;
end
estdelay = Utt_ED1(bp);
% fprintf( 'bp,estdelay is %d,%d\n', bp, estdelay);
H(1: Align_Nfft)= 0;
Hsum = 0.0;
startr = (Utt_Start_l- 1) * Downsample+ 1;
startd = startr + estdelay;
% fprintf( 'startr/startd is %d/%d\n', startr, startd);
if ( startd < 0 )
startr = -estdelay+ 1;
startd = 1;
end
while( ((startd + Align_Nfft) <= 1+ deg_Nsamples) &&...
((startr + Align_Nfft) <= (1+ (Utt_BPs(bp)- 1) * Downsample)) )
X1= ref_data(startr: startr+ Align_Nfft- 1).* Window;
X2= deg_data(startd: startd+ Align_Nfft- 1).* Window;
X1_fft= fft( X1, Align_Nfft );
X1_fft_conj= conj( X1_fft);
X2_fft= fft( X2, Align_Nfft );
X1= ifft( X1_fft_conj.* X2_fft, Align_Nfft);
X1= abs( X1);
v_max= max( X1)* 0.99;
n_max = (v_max^ 0.125 )/ kernel;
% fprintf( stream, '%f %f\n', v_max, n_max);
for count = 0: Align_Nfft- 1
if( X1(count+ 1) > v_max )
Hsum = Hsum+ n_max * kernel;
for k = 1-kernel: kernel- 1
H(1+ rem( count+ k+ Align_Nfft, Align_Nfft))= ...
H(1+ rem(count+ k+ Align_Nfft, Align_Nfft))+ ...
n_max* (kernel- abs(k));
end
end
end
startr = startr+ (Align_Nfft / 4);
startd = startd+ (Align_Nfft / 4);
end
[v_max, I_max] = max( H);
if( I_max- 1 >= (Align_Nfft/2) )
I_max = I_max- Align_Nfft;
end
Utt_D1(bp) = estdelay + I_max- 1;
if( Hsum > 0.0 )
% if (Utt_Len== 236)
% fprintf( 'v_max, Hsum is %f, %f\n', v_max, Hsum);
% end
Utt_DC1(bp) = v_max / Hsum;
else
Utt_DC1(bp) = 0.0;
end
% fprintf( 'bp/startr/startd is %d/%d/%d\n', bp, startr, startd);
while( bp < (N_BPs - 1) )
bp = bp + 1;
if( (Utt_ED1(bp) == estdelay) && (Utt_DC1(bp) <= -2.0) )
% loopno= 0;
while(((startd+ Align_Nfft)<= 1+ deg_Nsamples) && ...
((startr+ Align_Nfft)<= ...
((Utt_BPs(bp)- 1)* Downsample+ 1) ))
X1= ref_data( startr: startr+ Align_Nfft- 1).* ...
Window;
% % if (Utt_Len== 321)
% fid= fopen( 'what_mat.txt', 'at');
% fprintf( fid, '%f\n', Window);
% fclose( fid);
% % fprintf( '\n');
% % end
X2= deg_data( startd: startd+ Align_Nfft- 1).* ...
Window;
X1_fft= fft( X1, Align_Nfft );
X1_fft_conj= conj( X1_fft);
X2_fft= fft( X2, Align_Nfft );
X1= ifft( X1_fft_conj.* X2_fft, Align_Nfft);
X1= abs( X1);
v_max = 0.99* max( X1);
n_max = (v_max^ 0.125)/ kernel;
% fprintf( 'v_max n_max is %f %f\n', v_max, n_max);
for count = 0: Align_Nfft- 1
if( X1(count+ 1) > v_max )
Hsum = Hsum+ n_max * kernel;
for k = 1-kernel: kernel-1
H(1+ rem( count+ k+ Align_Nfft, Align_Nfft))= ...
H(1+ rem(count+ k+ Align_Nfft, Align_Nfft))+ ...
n_max* (kernel- abs(k));
end
end
end
startr = startr+ (Align_Nfft / 4);
startd = startd+ (Align_Nfft / 4);
% loopno= loopno+ 1;
end
% fprintf( 'loopno is %d\n', loopno);
[v_max, I_max] = max( H);
% fprintf( 'I_max is %d ', I_max);
if( I_max- 1 >= (Align_Nfft/2) )
I_max = I_max- Align_Nfft;
end
Utt_D1(bp) = estdelay + I_max- 1;
if( Hsum > 0.0 )
% fprintf( 'v_max Hsum is %f %f\n', v_max, Hsum);
Utt_DC1(bp) = v_max / Hsum;
else
Utt_DC1(bp) = 0.0;
end
end
end
end
% fclose( stream);
for bp= 1: N_BPs- 1
if( Utt_DC1(bp) > Utt_DelayConf_l )
Utt_DC2(bp) = -2.0;
else
Utt_DC2(bp) = 0.0;
end
end
while( 1 )
bp = N_BPs- 1;
while( (bp >= 1) && (Utt_DC2(bp) > -2.0) )
bp = bp- 1;
end
if( bp < 1 )
break;
end
estdelay = Utt_ED2(bp);
H( 1: Align_Nfft)= 0;
Hsum = 0.0;
startr = (Utt_End_l- 1)* Downsample+ 1- Align_Nfft;
startd = startr + estdelay;
% fprintf( '***NEW startr is %d\n', startr);
% fprintf( 'startr/d, deg_Nsamples is %d/%d, %d\n', startr,startd, ...
% deg_Nsamples);
% fprintf( 'deg_data has %d elements\n', numel( deg_data));
if ( (startd + Align_Nfft) > deg_Nsamples+ 1 )
startd = deg_Nsamples - Align_Nfft+ 1;
startr = startd - estdelay;
end
while( (startd>= 1) && (startr>= (Utt_BPs(bp)- 1)* Downsample+ 1) )
X1= ref_data( startr: startr+ Align_Nfft- 1).* Window;
X2= deg_data( startd: startd+ Align_Nfft- 1).* Window;
X1_fft= fft( X1, Align_Nfft);
X1_fft_conj= conj( X1_fft);
X2_fft= fft( X2, Align_Nfft);
X1= ifft( X1_fft_conj.* X2_fft, Align_Nfft );
X1= abs( X1);
v_max = max( X1)* 0.99;
n_max = ( v_max^ 0.125 )/ kernel;
for count = 0: Align_Nfft- 1
if( X1(count+ 1) > v_max )
Hsum = Hsum+ n_max * kernel;
for k = 1-kernel: kernel- 1
H(1+ rem(count+ k+ Align_Nfft, Align_Nfft))= ...
H(1+ rem(count+ k+ Align_Nfft, Align_Nfft))+ ...
n_max* (kernel- abs(k));
end
end
end
startr = startr- (Align_Nfft / 4);
startd = startd- (Align_Nfft / 4);
end
[v_max, I_max] = max( H);
if( I_max- 1 >= (Align_Nfft/2) )
I_max = I_max- Align_Nfft;
end
Utt_D2(bp) = estdelay + I_max- 1;
if( Hsum > 0.0 )
Utt_DC2(bp) = v_max / Hsum;
else
Utt_DC2(bp) = 0.0;
end
while( bp > 1 )
bp = bp - 1;
if( (Utt_ED2(bp) == estdelay) && (Utt_DC2(bp) <= -2.0) )
while( (startd >= 1) && (startr >= (Utt_BPs(bp)- 1) * Downsample+ 1))
X1= ref_data( startr: startr+ Align_Nfft- 1).* Window;
X2= deg_data( startd: startd+ Align_Nfft- 1).* Window;
X1_fft_conj= conj( fft( X1, Align_Nfft));
X2_fft= fft( X2, Align_Nfft);
X1= ifft( X1_fft_conj.* X2_fft, Align_Nfft);
X1= abs( X1);
v_max = max( X1)* 0.99;
n_max = (v_max^ 0.125)/ kernel;
for count = 0: Align_Nfft- 1
if( X1(count+ 1) > v_max )
Hsum = Hsum+ n_max * kernel;
for k = 1-kernel: kernel- 1
H(1+ rem( count+ k+ Align_Nfft, Align_Nfft))= ...
H(1+ rem(count+ k+ Align_Nfft, Align_Nfft))+ ...
n_max* (kernel- abs(k));
end
end
end
startr = startr- (Align_Nfft / 4);
startd = startd- (Align_Nfft / 4);
end
[v_max, I_max] = max( H);
if( I_max- 1 >= (Align_Nfft/2) )
I_max = I_max- Align_Nfft;
end
Utt_D2(bp) = estdelay + I_max- 1;
if( Hsum > 0.0 )
Utt_DC2(bp) = v_max / Hsum;
else
Utt_DC2(bp) = 0.0;
end
end
end
end
% fid= fopen( 'uttinfo_mat.txt', 'wt');
% fprintf( fid, '%f\n', Utt_D2);
% fprintf( fid, '\n');
% fprintf( fid, '%f\n', Utt_DC2);
% fclose( fid);
% fprintf( 'Utt_Len, N_BPs is %d, %d\n', Utt_Len, N_BPs);
for bp = 1: N_BPs- 1
if( (abs(Utt_D2(bp) - Utt_D1(bp)) >= Downsample) && ...
((Utt_DC1(bp)+ Utt_DC2(bp))> (Best_DC1 + Best_DC2)) &&...
(Utt_DC1(bp) > Utt_DelayConf_l) && ...
(Utt_DC2(bp) > Utt_DelayConf_l) )
Best_ED1 = Utt_ED1(bp);
Best_D1 = Utt_D1(bp);
Best_DC1 = Utt_DC1(bp);
Best_ED2 = Utt_ED2(bp);
Best_D2 = Utt_D2(bp);
Best_DC2 = Utt_DC2(bp);
Best_BP = Utt_BPs(bp);
% fprintf( 'in loop...');
end
end
% if (Utt_Len== 236)
% fid= fopen( 'matmat.txt', 'wt');
% fprintf( fid, 'N_BPs is %d\n', N_BPs);
% fprintf( fid, 'Utt_DelayConf is %f\n', Utt_DelayConf_l);
% fprintf( fid, 'ED2\t ED1\t D2\t D1\t DC2\t DC1\t BPs\n');
% for bp= 1: N_BPs- 1
% fprintf( fid, '%d\t %d\t %d\t %d\t %f\t %f\t %d\n', Utt_ED2( bp), ...
% Utt_ED1( bp), Utt_D2(bp), Utt_D1(bp), Utt_DC2(bp),...
% Utt_DC1( bp), Utt_BPs( bp));
% end
% fclose( fid);
% end

View File

@@ -0,0 +1,76 @@
function time_align(ref_data, ref_Nsamples, ...
deg_data, deg_Nsamples, Utt_id)
global Utt_DelayEst Utt_Delay Utt_DelayConf UttSearch_Start UttSearch_End
global Align_Nfft Downsample Window
estdelay = Utt_DelayEst(Utt_id);
H = zeros( 1, Align_Nfft);
X1= zeros( 1, Align_Nfft);
X2= zeros( 1, Align_Nfft);
startr = (UttSearch_Start(Utt_id)- 1)* Downsample+ 1;
startd = startr + estdelay;
if ( startd < 0 )
startr = 1 -estdelay;
startd = 1;
end
while( ((startd + Align_Nfft) <= deg_Nsamples) && ...
((startr + Align_Nfft) <= ((UttSearch_End(Utt_id)- 1) * Downsample)) )
X1= ref_data( startr: startr+ Align_Nfft- 1).* Window;
X2= deg_data( startd: startd+ Align_Nfft- 1).* Window;
% find cross-correlation between X1 and X2
X1_fft= fft( X1, Align_Nfft );
X1_fft_conj= conj( X1_fft);
X2_fft= fft( X2, Align_Nfft );
X1= ifft( X1_fft_conj.* X2_fft, Align_Nfft );
X1= abs( X1);
v_max = max( X1)* 0.99;
X1_greater_vmax= find( X1 > v_max );
H( X1_greater_vmax )= H( X1_greater_vmax )+ v_max^ 0.125;
startr = startr+ Align_Nfft/ 4;
startd = startd+ Align_Nfft/ 4;
end
X1= H;
X2= 0;
Hsum = sum( H);
X2(1) = 1.0;
kernel = Align_Nfft / 64;
for count= 2: kernel
X2( count)= 1- (count- 1)/ kernel;
X2( Align_Nfft- count+ 2)= 1- (count- 1)/ kernel;
end
X1_fft= fft( X1, Align_Nfft );
X2_fft= fft( X2, Align_Nfft );
X1= ifft( X1_fft.* X2_fft, Align_Nfft );
if (Hsum> 0)
H= abs( X1)/ Hsum;
else
H= 0;
end
[v_max, I_max] = max( H);
if( I_max- 1 >= (Align_Nfft/2) )
I_max = I_max- Align_Nfft;
end
Utt_Delay(Utt_id) = estdelay + I_max- 1;
Utt_DelayConf(Utt_id) = v_max; % confidence

View File

@@ -0,0 +1,26 @@
function utterance_locate (ref_data, ref_Nsamples, ref_VAD, ref_logVAD,...
deg_data, deg_Nsamples, deg_VAD, deg_logVAD);
global Nutterances Utt_Delay Utt_DelayConf Utt_Start Utt_End Utt_DelayEst
id_searchwindows( ref_VAD, ref_Nsamples, deg_VAD, deg_Nsamples);
for Utt_id= 1: Nutterances
%fprintf( 1, 'Utt_id is %d\n', Utt_id);
crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, deg_Nsamples, Utt_id);
time_align(ref_data, ref_Nsamples, ...
deg_data, deg_Nsamples, Utt_id);
end
id_utterances( ref_Nsamples, ref_VAD, deg_Nsamples);
utterance_split( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ...
deg_data, deg_Nsamples, deg_VAD, deg_logVAD);

View File

@@ -0,0 +1,122 @@
function utterance_split( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ...
deg_data, deg_Nsamples, deg_VAD, deg_logVAD)
global Nutterances MAXNUTTERANCES Downsample SEARCHBUFFER
global Utt_DelayEst Utt_Delay Utt_DelayConf UttSearch_Start
global Utt_Start Utt_End Largest_uttsize UttSearch_End
global Best_ED1 Best_D1 Best_DC1 Best_ED2 Best_D2 Best_DC2 Best_BP
Utt_id = 1;
while( (Utt_id <= Nutterances) && (Nutterances <= MAXNUTTERANCES) )
Utt_DelayEst_l = Utt_DelayEst(Utt_id);
Utt_Delay_l = Utt_Delay(Utt_id);
Utt_DelayConf_l = Utt_DelayConf(Utt_id);
Utt_Start_l = Utt_Start(Utt_id);
Utt_End_l = Utt_End(Utt_id);
Utt_SpeechStart = Utt_Start_l;
% fprintf( 'SpeechStart is %d\n', Utt_SpeechStart);
while( (Utt_SpeechStart < Utt_End_l) && ...
(ref_VAD(Utt_SpeechStart)<= 0.0) )
Utt_SpeechStart = Utt_SpeechStart + 1;
end %find the SpeechStart for each utterance
Utt_SpeechEnd = Utt_End_l;
% fprintf( 'SpeechEnd is %d\n', Utt_SpeechEnd);
while( (Utt_SpeechEnd > Utt_Start_l) && ...
(ref_VAD(Utt_SpeechEnd) <= 0))
Utt_SpeechEnd = Utt_SpeechEnd- 1;
end
Utt_SpeechEnd = Utt_SpeechEnd+ 1;
%find SpeechEnd for each utterance
Utt_Len = Utt_SpeechEnd - Utt_SpeechStart;
% fprintf( 'Utt_Len is %d\n', Utt_Len);
if( Utt_Len >= 200 )
split_align( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ...
deg_data, deg_Nsamples, deg_VAD, deg_logVAD, ...
Utt_Start_l, Utt_SpeechStart, Utt_SpeechEnd, Utt_End_l, ...
Utt_DelayEst_l, Utt_DelayConf_l);
% fprintf( '\nBest_ED1, Best_D1, Best_DC1 is %d, %d, %f\n',...
% Best_ED1, Best_D1, Best_DC1);
% fprintf( 'Best_ED2, Best_D2, Best_DC2 is %d, %d, %f\n',...
% Best_ED2, Best_D2, Best_DC2);
% fprintf( 'Best_BP is %d\n', Best_BP);
if( (Best_DC1 > Utt_DelayConf_l) && (Best_DC2 > Utt_DelayConf_l) )
for step = Nutterances: -1: Utt_id+ 1
Utt_DelayEst(step+ 1) = Utt_DelayEst(step);
Utt_Delay(step+ 1) = Utt_Delay(step);
Utt_DelayConf(step+ 1) = Utt_DelayConf(step);
Utt_Start(step+ 1) = Utt_Start(step);
Utt_End(step+ 1) = Utt_End(step);
UttSearch_Start(step+ 1) = Utt_Start( step);
UttSearch_End(step+ 1) = Utt_End( step);
end
Nutterances = Nutterances+ 1;
Utt_DelayEst(Utt_id) = Best_ED1;
Utt_Delay(Utt_id) = Best_D1;
Utt_DelayConf(Utt_id) = Best_DC1;
Utt_DelayEst(Utt_id +1) = Best_ED2;
Utt_Delay(Utt_id +1) = Best_D2;
Utt_DelayConf(Utt_id +1) = Best_DC2;
UttSearch_Start(Utt_id +1) = UttSearch_Start(Utt_id);
UttSearch_End(Utt_id +1) = UttSearch_End( Utt_id);
if( Best_D2 < Best_D1 )
Utt_Start(Utt_id) = Utt_Start_l;
Utt_End(Utt_id) = Best_BP;
Utt_Start(Utt_id +1) = Best_BP;
Utt_End(Utt_id +1) = Utt_End_l;
else
Utt_Start( Utt_id) = Utt_Start_l;
Utt_End( Utt_id) = Best_BP + ...
floor( (Best_D2- Best_D1)/ (2 * Downsample));
Utt_Start( Utt_id +1) = Best_BP - ...
floor( (Best_D2- Best_D1)/ (2 * Downsample));
Utt_End( Utt_id +1) = Utt_End_l;
end
if( (Utt_Start(Utt_id)- SEARCHBUFFER- 1)* Downsample+ 1+ ...
Best_D1 < 0 )
Utt_Start(Utt_id) = SEARCHBUFFER+ 1+ ...
floor( (Downsample - 1 - Best_D1) / Downsample);
end
if( ((Utt_End( Utt_id +1)- 1)* Downsample+ 1 + Best_D2) >...
(deg_Nsamples - SEARCHBUFFER * Downsample) )
Utt_End( Utt_id +1) = floor( (deg_Nsamples - Best_D2)...
/ Downsample)- SEARCHBUFFER+ 1;
end
else
Utt_id= Utt_id+ 1;
end
else
Utt_id = Utt_id+ 1;
end
end
Largest_uttsize = max( Utt_End- Utt_Start);
% fid= fopen( 'uttinfo_mat.txt', 'wt');
% fprintf( fid, 'Number of Utterances is:\n');
% fprintf( fid, '%d\n', Nutterances);
% fprintf( fid, 'Utterance Delay Estimation:\n');
% fprintf( fid, '%d\n', Utt_DelayEst( 1: Nutterances) );
% fprintf( fid, 'Utterance Delay:\n');
% fprintf( fid, '%d\n', Utt_Delay( 1: Nutterances));
% fprintf( fid, 'Utterance Delay Confidence:\n');
% fprintf( fid, '%f\n', Utt_DelayConf( 1: Nutterances));
% fprintf( fid, 'Utterance Start:\n');
% fprintf( fid, '%d\n', Utt_Start( 1: Nutterances));
% fprintf( fid, 'Utterance End:\n');
% fprintf( fid, '%d\n', Utt_End( 1: Nutterances));
% fprintf( fid, 'Largest utterance length:\n');
% fprintf( fid, '%d\n', Largest_uttsize);
% fclose( fid);