Started code for DSP final project
This commit is contained in:
@@ -0,0 +1,26 @@
|
||||
function mod_data= DC_block( data, Nsamples)
% DC_BLOCK  Remove the DC offset from the active part of a padded signal.
%
%   mod_data = DC_block(data, Nsamples)
%
%   data     - signal samples; used with a row-vector ramp below, so a
%              row vector is expected
%   Nsamples - number of samples in data to consider
%
%   The first and last SEARCHBUFFER*Downsample samples are treated as guard
%   regions and copied through unchanged.  The samples in between have a DC
%   estimate subtracted and are then faded in/out with a linear ramp that is
%   Downsample samples long at each end.

global Downsample DATAPADDING_MSECS SEARCHBUFFER

guard  = SEARCHBUFFER * Downsample;
first  = guard + 1;
last   = Nsamples - guard;
active = first : last;

% DC estimate: the sum runs over the active region only, but it is divided
% by the total sample count Nsamples (kept exactly as in the original code).
dc = sum( data( active)) / Nsamples;

mod_data = data;
mod_data( active) = data( active) - dc;

% Linear ramp numerators 0.5, 1.5, ..., Downsample-0.5 (divided by Downsample
% after the multiplication, as in the original expression).
ramp = 0.5 + (0 : Downsample - 1);

% Fade-in over the first Downsample samples of the active region.
head = first : guard + Downsample;
mod_data( head) = mod_data( head) .* ramp / Downsample;

% Fade-out over the last Downsample samples, walking backwards from the end.
tail = last : -1 : last - Downsample + 1;
mod_data( tail) = mod_data( tail) .* ramp / Downsample;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,53 @@
|
||||
function Y= FFTNXCorr( ref_VAD, startr, nr, deg_VAD, startd, nd)
% FFTNXCORR  Cross-correlate a segment of deg_VAD with a segment of ref_VAD.
%
%   Y = FFTNXCorr(ref_VAD, startr, nr, deg_VAD, startd, nd)
%
%   ref_VAD, startr, nr - reference vector, first index and length of the
%                         segment taken from it
%   deg_VAD, startd, nd - degraded vector, first index and length of the
%                         segment taken from it
%   Y                   - conv() of the degraded segment with the
%                         left-right flipped reference segment
%
%   Other simple implementations exist; this one is kept consistent with the
%   C version.  fliplr() only reverses row vectors, so row inputs are
%   expected.

% Time-domain form: convolving with the reversed reference segment.
ref_reversed = fliplr( ref_VAD( startr : startr + nr - 1));
deg_segment  = deg_VAD( startd : startd + nd - 1);
Y = conv( deg_segment, ref_reversed);

% Frequency-domain alternative (not active):
% Nx= 2^ (ceil( log2( max( nr, nd))));
% x1= zeros( 1, 2* Nx);
% x2= zeros( 1, 2* Nx);
% x1( 1: nr)= fliplr( ref_VAD( startr: startr+ nr- 1));
% x2( 1: nd)= deg_VAD( startd: startd+ nd- 1);
% x1_fft= fft( x1, 2* Nx);
% x2_fft= fft( x2, 2* Nx);
% tmp1= ifft( x1_fft.* x2_fft, 2* Nx);
% Ny= nr+ nd- 1;
% Y= tmp1( 1: Ny);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,162 @@
|
||||
function [VAD, logVAD]= apply_VAD( data, Nsamples)
% APPLY_VAD  Frame-power voice activity profile of a signal.
%
%   [VAD, logVAD] = apply_VAD(data, Nsamples)
%
%   data     - signal samples
%   Nsamples - number of samples of data to analyse
%   VAD      - 1 x Nwindows vector; mean power of frames judged active,
%              0 for frames judged inactive (after the clean-up passes below)
%   logVAD   - log(VAD / threshold) where VAD exceeds the final threshold,
%              0 elsewhere
%
%   Nwindows = floor(Nsamples / Downsample).  Uses the globals Downsample,
%   MINSPEECHLGTH and JOINSPEECHLGTH.

global Downsample MINSPEECHLGTH JOINSPEECHLGTH

Nwindows = floor( Nsamples / Downsample);

% Mean power of each Downsample-sample frame.
VAD = zeros( 1, Nwindows);
for w = 1 : Nwindows
    frame = data( (w - 1) * Downsample + 1 : w * Downsample);
    VAD( w) = sum( frame .^ 2) / Downsample;
end

% Initial threshold: mean frame power (taken before the floor is applied).
LevelThresh = sum( VAD) / Nwindows;

% Floor every frame at 1e-4 of the peak power (or 1.0 if the peak is not > 0).
peak = max( VAD);
if peak > 0
    LevelMin = peak * 1.0e-4;
else
    LevelMin = 1.0;
end
VAD( VAD < LevelMin) = LevelMin;

% Refine the threshold 12 times: mean + 2 standard deviations of the frames
% currently at or below the threshold, scaled by 1.001.
for pass = 1 : 12
    quiet = VAD( VAD <= LevelThresh);
    nQuiet = length( quiet);
    LevelNoise = sum( quiet);
    StDNoise = 0;
    if nQuiet > 0
        LevelNoise = LevelNoise / nQuiet;
        StDNoise = sqrt( sum( (quiet - LevelNoise) .^ 2) / nQuiet);
    end
    LevelThresh = 1.001 * (LevelNoise + 2 * StDNoise);
end

% Average levels of the frames above / not above the final threshold.
loud = VAD( VAD > LevelThresh);
nLoud = length( loud);
LevelSig = sum( loud);
LevelNoise = sum( VAD( VAD <= LevelThresh));

if nLoud > 0
    LevelSig = LevelSig / nLoud;
else
    LevelThresh = -1;
end

if nLoud < Nwindows
    LevelNoise = LevelNoise / (Nwindows - nLoud);
else
    LevelNoise = 1;
end

% Inactive frames are marked by a negative sign; both end frames are forced
% inactive.
inactive = (VAD <= LevelThresh);
VAD( inactive) = -VAD( inactive);
VAD( 1) = -LevelMin;
VAD( Nwindows) = -LevelMin;

% Pass 1: deactivate active runs of at most MINSPEECHLGTH frames.
start = 0;
finish = 0;
for w = 2 : Nwindows
    if (VAD( w) > 0.0) && (VAD( w - 1) <= 0.0)
        start = w;
    end
    if (VAD( w) <= 0.0) && (VAD( w - 1) > 0.0)
        finish = w;
        if (finish - start) <= MINSPEECHLGTH
            VAD( start : finish - 1) = -VAD( start : finish - 1);
        end
    end
end

% Pass 2 (only when signal level >= 1000 x noise level): deactivate runs
% whose summed power is below 3 x threshold x run length.  'start' carries
% over from pass 1, as in the original.
if LevelSig >= (LevelNoise * 1000)
    for w = 2 : Nwindows
        if (VAD( w) > 0) && (VAD( w - 1) <= 0)
            start = w;
        end
        if (VAD( w) <= 0) && (VAD( w - 1) > 0)
            finish = w;
            runPower = sum( VAD( start : finish - 1));
            if runPower < 3.0 * LevelThresh * (finish - start)
                VAD( start : finish - 1) = -VAD( start : finish - 1);
            end
        end
    end
end

% Pass 3: fill gaps of at most JOINSPEECHLGTH frames between two active runs
% with LevelMin so the runs are joined.
start = 0;
finish = 0;
for w = 2 : Nwindows
    if (VAD( w) > 0.0) && (VAD( w - 1) <= 0.0)
        start = w;
        if (finish > 0) && ((start - finish) <= JOINSPEECHLGTH)
            VAD( finish : start - 1) = LevelMin;
        end
    end
    if (VAD( w) <= 0.0) && (VAD( w - 1) > 0.0)
        finish = w;
    end
end

% If no inactive-to-active transition remains, treat every frame as active
% except the two end frames.
hasOnset = any( (VAD( 2 : Nwindows) > 0) & (VAD( 1 : Nwindows - 1) <= 0));
if ~hasOnset
    VAD = abs( VAD);
    VAD( 1) = -LevelMin;
    VAD( Nwindows) = -LevelMin;
end

% Soften run edges: two frames before an onset get 0.1 / 0.3 of the onset
% power, two frames from an offset get 0.3 / 0.1 of the last active power.
% The index arithmetic is order-sensitive and kept exactly as in the original.
count = 4;
while count < (Nwindows - 1)
    if (VAD( count) > 0) && (VAD( count - 2) <= 0)
        VAD( count - 2) = VAD( count) * 0.1;
        VAD( count - 1) = VAD( count) * 0.3;
        count = count + 1;
    end
    if (VAD( count) <= 0) && (VAD( count - 1) > 0)
        VAD( count) = VAD( count - 1) * 0.3;
        VAD( count + 1) = VAD( count - 1) * 0.1;
        count = count + 3;
    end
    count = count + 1;
end

% Remaining negative (inactive) frames become zero.
VAD( VAD < 0) = 0;

% fid= fopen( 'mat_vad.txt', 'wt');
% fprintf( fid, '%f\n', VAD);
% fclose( fid);

if LevelThresh <= 0
    LevelThresh = LevelMin;
end

% Log-domain profile relative to the threshold.
logVAD( VAD <= LevelThresh) = 0;
above = (VAD > LevelThresh);
logVAD( above) = log( VAD( above) / LevelThresh);
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,40 @@
|
||||
function align_filtered= apply_filter( data, data_Nsamples, align_filter_dB)
% APPLY_FILTER  Apply a tabulated magnitude response to the active part of data.
%
%   align_filtered = apply_filter(data, data_Nsamples, align_filter_dB)
%
%   data            - signal samples (copied into a row buffer for the FFT)
%   data_Nsamples   - sample count used to derive the active length n
%   align_filter_dB - table with frequencies in column 1 and gains in dB in
%                     column 2; it is linearly interpolated with interp1
%   align_filtered  - copy of data whose n samples following the leading
%                     SEARCHBUFFER*Downsample samples are replaced by the
%                     filtered samples
%
%   The gain curve is normalised so that the interpolated gain at 1000
%   (same unit as column 1) becomes 0 dB.  Filtering is done by scaling the
%   FFT bins of the zero-padded segment (length = next power of two >= n).

global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs

lead = SEARCHBUFFER * Downsample;
n = data_Nsamples - 2 * SEARCHBUFFER * Downsample + DATAPADDING_MSECS * (Fs / 1000);
active = lead + 1 : lead + n;

% FFT length: the next power of 2 which is greater than or equal to n.
nfft = 2 ^ (ceil( log2( n)));
half = nfft / 2;

% Gain in dB at each bin from 0 up to the bin at Fs/2, relative to the
% gain at 1000.
table_freq = align_filter_dB( :, 1);
table_gain = align_filter_dB( :, 2);
ref_gain_dB = interp1( table_freq, table_gain, 1000);
bin_freq = (0 : half) * (Fs / nfft);
gain_dB = interp1( table_freq, table_gain, bin_freq) - ref_gain_dB;

% Linear gains, mirrored onto the upper half of the spectrum.
gain = 10 .^ (gain_dB / 20);
gain = [gain, fliplr( gain( 2 : half))];

% Zero-pad the active segment, scale its spectrum and transform back.
padded = zeros( 1, nfft);
padded( 1 : n) = data( active);
filtered = ifft( fft( padded, nfft) .* gain, nfft);

align_filtered = data;
align_filtered( active) = filtered( 1 : n);
|
||||
|
||||
% fid= fopen( 'log_mat.txt', 'wt');
|
||||
% fprintf( fid, '%f\n', y( 1: n));
|
||||
% fclose( fid);
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,26 @@
|
||||
function mod_data= apply_filters( data, Nsamples)
% APPLY_FILTERS  Run data through the global second-order-section IIR filter.
%
%   mod_data = apply_filters(data, Nsamples)
%
%   data     - signal to filter
%   Nsamples - not used by this function
%   mod_data - output of filter() applied to data
%
%   The filter is described by the globals InIIR_Hsos (one row per section,
%   columns 1:3 and 4:5 are read) and InIIR_Nsos (number of sections).

global InIIR_Hsos InIIR_Nsos DATAPADDING_MSECS Fs

% Each SOS row is [b(1:3) a(1:3)].  The table stores only two denominator
% coefficients per section, so a(1) = 1 is inserted in column 4.
sosMatrix = [InIIR_Hsos( :, 1 : 3), ones( InIIR_Nsos, 1), InIIR_Hsos( :, 4 : 5)];

% Direct-form II second-order-section filter object.
iirdf2 = dfilt.df2sos( sosMatrix);

mod_data = filter( iirdf2, data);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,200 @@
|
||||
function cep_mean= comp_cep(cleanFile, enhdFile)

% ----------------------------------------------------------------------
%          Cepstrum Distance Objective Speech Quality Measure
%
%   This function implements the cepstrum distance measure used
%   in [1]
%
%   Usage:  CEP=comp_cep(cleanFile.wav, enhancedFile.wav)
%
%         cleanFile.wav - clean input file in .wav format
%         enhancedFile  - enhanced output file in .wav format
%         CEP           - computed cepstrum distance measure
%
%         Note that the cepstrum measure is limited in the range [0, 10].
%
%  Example call:  CEP =comp_cep('sp04.wav','enhanced.wav')
%
%
%  References:
%
%     [1]	Kitawaki, N., Nagabuchi, H., and Itoh, K. (1988). Objective quality
%           evaluation for low bit-rate speech coding systems. IEEE J. Select.
%           Areas in Comm., 6(2), 262-273.
%
%   Author: Philipos C. Loizou
%   (LPC routines were written by Bryan Pellom & John Hansen)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
%
% Raises an error when the two files differ in sampling rate or bit depth.
% ----------------------------------------------------------------------
if nargin~=2
    fprintf('USAGE: CEP=comp_cep(cleanFile.wav, enhancedFile.wav)\n');
    fprintf('For more help, type: help comp_cep\n\n');
    return;
end

% Fraction of the per-frame distances (smallest first) kept for the mean.
alpha=0.95;

[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhdFile);
if ( Srate1~= Srate2) || ( Nbits1~= Nbits2)
    % error() with a single argument does not expand escape sequences, so
    % the message must not contain '\n'.
    error( 'The two files do not match!');
end

% Trim both signals to the common length; eps is added to every sample.
len= min( length( data1), length( data2));
data1= data1( 1: len)+eps;
data2= data2( 1: len)+eps;

% Per-frame cepstrum distances.
IS_dist= cepstrum( data1, data2,Srate1);

% Mean of the smallest alpha fraction of the frame distances.
IS_len= round( length( IS_dist)* alpha);
IS= sort( IS_dist);

cep_mean= mean( IS( 1: IS_len));
|
||||
|
||||
|
||||
|
||||
|
||||
function distortion = cepstrum(clean_speech, processed_speech,sample_rate)
% CEPSTRUM  Per-frame LPC cepstrum distance between two equal-length signals.
%
%   distortion = cepstrum(clean_speech, processed_speech, sample_rate)
%
%   clean_speech, processed_speech - signals of identical length (row or
%                                    column vectors)
%   sample_rate                    - sampling rate used to size the frames
%   distortion                     - 1 x num_frames row vector, each entry
%                                    limited to at most 10; empty when the
%                                    signal yields no complete frame
%
%   Raises an error when the two signals differ in length.

% Work on column vectors so the product with the column window below is
% element-wise regardless of the caller's orientation.
clean_speech     = clean_speech(:);
processed_speech = processed_speech(:);

% ----------------------------------------------------------------------
% Check the length of the clean and processed speech.  Must be the same.
% ----------------------------------------------------------------------

clean_length      = length(clean_speech);
processed_length  = length(processed_speech);

if (clean_length ~= processed_length)
    % Previously this only displayed a message and returned with the output
    % left unassigned.
    error('Both Speech Files must be same length.');
end

% ----------------------------------------------------------------------
% Scaling / DC removal was disabled in the original and remains disabled:
% ----------------------------------------------------------------------

%clean_speech     = clean_speech     - mean(clean_speech);
%processed_speech = processed_speech - mean(processed_speech);

%processed_speech = processed_speech.*(max(abs(clean_speech))/ max(abs(processed_speech)));

% ----------------------------------------------------------------------
% Analysis parameters
% ----------------------------------------------------------------------

winlength   = round(30*sample_rate/1000); % window length in samples (30 ms)
skiprate    = floor(winlength/4);         % window skip in samples
if sample_rate<10000
    P       = 10;                         % LPC analysis order
else
    P       = 16;                         % this could vary depending on sampling frequency.
end
C=10*sqrt(2)/log(10);                     % scale factor applied to the coefficient distance

% ----------------------------------------------------------------------
% For each frame of input speech, calculate the cepstrum distance
% ----------------------------------------------------------------------

% Same frame count as the original "1:num_frames" loop (the colon operator
% truncated the fractional value); clamped at zero for very short signals.
num_frames = max(0, floor(clean_length/skiprate-(winlength/skiprate)));
start      = 1;                           % starting sample
window     = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));

% Preallocate so the output is defined even when there are no frames.
distortion = zeros(1, num_frames);

for frame_count = 1:num_frames

    % ----------------------------------------------------------
    % (1) Get the frames for the test and reference speech and
    %     multiply by the Hanning window.
    % ----------------------------------------------------------

    frame_idx       = start:start+winlength-1;
    clean_frame     = clean_speech(frame_idx).*window;
    processed_frame = processed_speech(frame_idx).*window;

    % ----------------------------------------------------------
    % (2) LPC parameters of both frames, converted to cepstrum
    %     coefficients.
    % ----------------------------------------------------------

    [R_clean, Ref_clean, A_clean] = ...
        lpcoeff(clean_frame, P);
    [R_processed, Ref_processed, A_processed] = ...
        lpcoeff(processed_frame, P);

    C_clean=lpc2cep(A_clean);
    C_processed=lpc2cep(A_processed);

    % ----------------------------------------------------------
    % (3) Compute the cepstrum-distance measure (capped at 10)
    % ----------------------------------------------------------

    distortion(frame_count) = min(10,C*norm(C_clean-C_processed,2));

    start = start + skiprate;

end
|
||||
|
||||
|
||||
|
||||
function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order)
% LPCOEFF  Autocorrelation lags and Levinson-Durbin LPC analysis of a frame.
%
%   [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order)
%
%   speech_frame - vector of samples
%   model_order  - LPC analysis order
%   acorr        - autocorrelation lags 0..model_order (1 x model_order+1)
%   refcoeff     - reflection coefficients (1 x model_order)
%   lpparams     - [1 -a], with a the predictor coefficients

% ----------------------------------------------------------
% (1) Compute Autocorrelation Lags
% ----------------------------------------------------------

nsamp = max(size(speech_frame));
R = zeros(1, model_order+1);
for lag = 0:model_order
    R(lag+1) = sum(speech_frame(1:nsamp-lag) .* speech_frame(lag+1:nsamp));
end

% ----------------------------------------------------------
% (2) Levinson-Durbin
% ----------------------------------------------------------

a = ones(1,model_order);
E = zeros(1, model_order+1);      % prediction error after each order
E(1) = R(1);
for order = 1:model_order
    prev = a(1:order-1);          % coefficients of the previous order
    refl = (R(order+1) - sum(prev .* R(order:-1:2))) / E(order);
    rcoeff(order) = refl;
    a(order) = refl;
    a(1:order-1) = prev - refl .* prev(order-1:-1:1);
    E(order+1) = (1 - refl*refl) * E(order);
end

acorr    = R;
refcoeff = rcoeff;
lpparams = [1 -a];
|
||||
|
||||
%----------------------------------------------
|
||||
function [cep]=lpc2cep(a)
%
% converts prediction to cepstrum coefficients
%
%   a   - coefficient vector whose first element is skipped (the callers in
%         this file pass [1 -a] from lpcoeff)
%   cep - 1 x (length(a)-1) row vector of cepstrum coefficients
%
% Author: Philipos C. Loizou

ncep = length(a) - 1;
cep  = zeros(1, ncep);

cep(1) = -a(2);

% Recursion: each coefficient combines a(k+1) with the previously computed
% cepstrum coefficients weighted by their index.
for k = 2:ncep
    idx = 1:k-1;
    weighted = cep(idx) .* a(k:-1:2) .* idx;
    cep(k) = -(a(k+1) + sum(weighted)/k);
end
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,259 @@
|
||||
function fwseg_dist= comp_fwseg(cleanFile, enhancedFile)

% ----------------------------------------------------------------------
%
%     Frequency weighted SNRseg Objective Speech Quality Measure
%
%     This function implements the frequency-weighted SNRseg measure [1]
%     using a different weighting function, the clean spectrum.
%
%   Usage:  fwSNRseg=comp_fwseg(cleanFile.wav, enhancedFile.wav)
%
%         cleanFile.wav - clean input file in .wav format
%         enhancedFile  - enhanced output file in .wav format
%         fwSNRseg      - computed frequency weighted SNRseg in dB
%
%         Note that large numbers of fwSNRseg are better.
%
%  Example call:  fwSNRseg =comp_fwseg('sp04.wav','enhanced.wav')
%
%
%  References:
%   [1]  Tribolet, J., Noll, P., McDermott, B., and Crochiere, R. E. (1978).
%        A study of complexity and quality of speech waveform coders. Proc.
%        IEEE Int. Conf. Acoust. , Speech, Signal Processing, 586-590.
%
%   Author: Philipos C. Loizou
%  (critical-band filtering routines were written by Bryan Pellom & John Hansen)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
%
% Raises an error when the two files differ in sampling rate or bit depth.
% ----------------------------------------------------------------------

if nargin~=2
    fprintf('USAGE: fwSNRseg=comp_fwseg(cleanFile.wav, enhancedFile.wav)\n');
    fprintf('For more help, type: help comp_fwseg\n\n');
    return;
end

[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhancedFile);
if ( Srate1~= Srate2) || ( Nbits1~= Nbits2)
    % error() with a single argument does not expand escape sequences, so
    % the message must not contain '\n'.
    error( 'The two files do not match!');
end

% Trim both signals to the common length; eps is added to every sample.
len= min( length( data1), length( data2));
data1= data1( 1: len)+eps;
data2= data2( 1: len)+eps;

% Per-frame frequency-weighted segmental SNR, averaged over all frames.
wss_dist_vec= fwseg( data1, data2,Srate1);
fwseg_dist=mean(wss_dist_vec);
|
||||
|
||||
|
||||
% ----------------------------------------------------------------------
|
||||
|
||||
function distortion = fwseg(clean_speech, processed_speech,sample_rate)
% FWSEG  Per-frame frequency-weighted segmental SNR of a processed signal.
%
%   distortion = fwseg(clean_speech, processed_speech, sample_rate)
%
%   clean_speech, processed_speech - signals of identical length (row or
%                                    column vectors)
%   sample_rate                    - sampling rate in Hz
%   distortion                     - 1 x num_frames row vector of
%                                    band-weighted SNR values in dB, each
%                                    limited to [-10, 35]; empty when the
%                                    signal yields no complete frame
%
%   Raises an error when the two signals differ in length.

% Work on column vectors so the products with the column window and the
% transposed filter rows below are element-wise for either orientation.
clean_speech     = clean_speech(:);
processed_speech = processed_speech(:);

% ----------------------------------------------------------------------
% Check the length of the clean and processed speech.  Must be the same.
% ----------------------------------------------------------------------

clean_length      = length(clean_speech);
processed_length  = length(processed_speech);

if (clean_length ~= processed_length)
    % Previously this only displayed a message and returned with the output
    % left unassigned.
    error('Files must have same length.');
end

% ----------------------------------------------------------------------
% Analysis parameters
% ----------------------------------------------------------------------

winlength   = round(30*sample_rate/1000); % window length in samples (30 ms)
skiprate    = floor(winlength/4);         % window skip in samples
max_freq    = sample_rate/2;              % maximum bandwidth
USE_25      = 1;                          % 1: 25 critical bands, 0: 13 lumped bands
n_fft       = 2^nextpow2(2*winlength);
n_fftby2    = n_fft/2;                    % FFT size/2
gamma_exp   = 0.2;                        % power exponent of the band weights

% ----------------------------------------------------------------------
% Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz)
% ----------------------------------------------------------------------

cent_freq = [ 50.0000  120.000  190.000  260.000  330.000 ...
              400.000  470.000  540.000  617.372  703.378 ...
              798.717  904.128  1020.38  1148.30  1288.72 ...
              1442.54  1610.70  1794.16  1993.93  2211.08 ...
              2446.71  2701.97  2978.04  3276.17  3597.63];

bandwidth = [ 70.0000  70.0000  70.0000  70.0000  70.0000 ...
              70.0000  70.0000  77.3724  86.0056  95.3398 ...
              105.411  116.256  127.914  140.423  153.823 ...
              168.154  183.457  199.776  217.153  235.631 ...
              255.255  276.072  298.126  321.465  346.136];

% Articulation index weights.  They do not enter the result (the band
% weights are taken from the clean spectrum below); the table is retained
% for reference.
W = [ 0.003 0.003 0.003 0.007 0.010 0.016 0.016 0.017 0.017 0.022 ...
      0.027 0.028 0.030 0.032 0.034 0.035 0.037 0.036 0.036 0.033 ...
      0.030 0.029 0.027 0.026 0.026];

if USE_25==0      % use 13 bands
    % ----- lump adjacent filters together ----------------
    k=2;
    cent_freq2(1)=cent_freq(1);
    bandwidth2(1)=bandwidth(1)+bandwidth(2);
    W2(1)=W(1);
    for i=2:13
        cent_freq2(i)=cent_freq2(i-1)+bandwidth2(i-1);
        bandwidth2(i)=bandwidth(k)+bandwidth(k+1);
        W2(i)=0.5*(W(k)+W(k+1));
        k=k+2;
    end
    band_cf = cent_freq2;
    band_bw = bandwidth2;
else
    band_cf = cent_freq;
    band_bw = bandwidth;
end
num_crit = length(band_cf);               % number of critical bands
bw_min   = band_bw(1);                    % minimum critical bandwidth

% ----------------------------------------------------------------------
% Set up the critical band filters.  Note here that Gaussianly shaped
% filters are used.  Also, the sum of the filter weights are equivalent
% for each critical band filter.  Filter less than -30 dB and set to
% zero.
% ----------------------------------------------------------------------

min_factor = exp (-30.0 / (2.0 * 2.303));       % -30 dB point of filter

bins = 0:1:n_fftby2-1;
crit_filter = zeros(num_crit, n_fftby2);
for i = 1:num_crit
    f0 = (band_cf (i) / max_freq) * (n_fftby2);
    bw = (band_bw (i) / max_freq) * (n_fftby2);
    norm_factor = log(bw_min) - log(band_bw(i));
    shape = exp (-11 *(((bins - floor(f0)) ./bw).^2) + norm_factor);
    crit_filter(i,:) = shape.*(shape > min_factor);
end

% Same frame count as the original "1:num_frames" loop (the colon operator
% truncated the fractional value); clamped at zero for very short signals.
num_frames = max(0, floor(clean_length/skiprate-(winlength/skiprate)));
start      = 1;                                 % starting sample
window     = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));

% Preallocate so the output is defined even when there are no frames.
distortion = zeros(1, num_frames);

for frame_count = 1:num_frames

    % ----------------------------------------------------------
    % (1) Get the frames for the test and reference speech and
    %     multiply by the Hanning window.
    % ----------------------------------------------------------

    frame_idx       = start:start+winlength-1;
    clean_frame     = clean_speech(frame_idx).*window;
    processed_frame = processed_speech(frame_idx).*window;

    % ----------------------------------------------------------
    % (2) Compute the magnitude spectrum of clean and processed
    % ----------------------------------------------------------

    clean_spec     = abs(fft(clean_frame,n_fft));
    processed_spec = abs(fft(processed_frame,n_fft));

    % normalize spectra to have area of one
    clean_spec     = clean_spec/sum(clean_spec(1:n_fftby2));
    processed_spec = processed_spec/sum(processed_spec(1:n_fftby2));

    % ----------------------------------------------------------
    % (3) Compute filterbank output energies
    % ----------------------------------------------------------

    clean_energy     = zeros(1,num_crit);
    processed_energy = zeros(1,num_crit);

    for i = 1:num_crit
        clean_energy(i) = sum(clean_spec(1:n_fftby2) ...
            .*crit_filter(i,:)');
        processed_energy(i) = sum(processed_spec(1:n_fftby2) ...
            .*crit_filter(i,:)');
    end

    % Squared band error (floored at eps) and clean-spectrum band weights.
    error_energy = max((clean_energy-processed_energy).^2,eps);
    W_freq       = clean_energy.^gamma_exp;

    SNRlog=10*log10((clean_energy.^2)./error_energy);

    fwSNR=sum(W_freq.*SNRlog)/sum(W_freq);

    % Limit the frame value to [-10, 35] dB.
    distortion(frame_count)=min(max(fwSNR,-10),35);

    start = start + skiprate;

end
|
||||
|
||||
@@ -0,0 +1,493 @@
|
||||
function [SIG,BAK,OVL]= comp_fwseg_mars(cleanFile, enhancedFile)

% ----------------------------------------------------------------------
%
%     MARS Frequency-variant fwSNRseg objective speech quality measure
%
%     This function implements the frequency-variant fwSNRseg measure based
%     on MARS analysis (see Chap. 10, Sec. 10.5.4)
%
%
%   Usage:  [sig,bak,ovl]=comp_fwseg_mars(cleanFile.wav, enhancedFile.wav)
%
%         cleanFile.wav - clean input file in .wav format
%         enhancedFile  - enhanced output file in .wav format
%         sig           - predicted rating [1-5] of speech distortion
%         bak           - predicted rating [1-5] of noise distortion
%         ovl           - predicted rating [1-5] of overall quality
%
%
%  Example call:  [s,b,o] =comp_fwseg_mars('sp04.wav','enhanced.wav')
%
%
%  References:
%     [1]   Chapter 10, Sec 10.5.4,
%     [2]   Chapter 11
%
%   Authors: Yi Hu and Philipos C. Loizou
%  (critical-band filtering routines were written by Bryan Pellom & John Hansen)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
%
% Raises an error when the two files differ in sampling rate or bit depth.
% ----------------------------------------------------------------------

if nargin~=2
    fprintf('USAGE: [sig,bak,ovl]=comp_fwseg_mars(cleanFile.wav, enhancedFile.wav)\n');
    fprintf('For more help, type: help comp_fwseg_mars\n\n');
    return;
end

[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhancedFile);
if ( Srate1~= Srate2) || ( Nbits1~= Nbits2)
    % error() with a single argument does not expand escape sequences, so
    % the message must not contain '\n'.
    error( 'The two files do not match!');
end

% Trim both signals to the common length; eps is added to every sample.
len= min( length( data1), length( data2));
data1= data1( 1: len)+eps;
data2= data2( 1: len)+eps;

% mean() of the fwseg output; its first 25 entries feed the MARS models.
wss_dist_matrix= fwseg( data1, data2,Srate1);
wss_dist=mean(wss_dist_matrix);

% Expand the first 25 values into 25 separate scalar arguments, in order.
band_args = num2cell( wss_dist( 1: 25));

SIG= sig_mars( band_args{:});
SIG= min( 5, max( 1, SIG));    % limit values to [1, 5]

BAK= bak_mars( band_args{:});
BAK= min( 5, max( 1, BAK));    % limit values to [1, 5]

OVL= ovl_mars( band_args{:});
OVL= min( 5, max( 1, OVL));    % limit values to [1, 5]
|
||||
|
||||
|
||||
%-------------------------------------------------
|
||||
function Y= bak_mars( FWSEG_VA, V5, V6, V7, V8, V9, V10, V11, V12, ...
                      V13, V14, V15, V16, V17, V18, V19, V20, ...
                      V21, V22, V23, V24, V25, V26, V27, V28)
% BAK_MARS  Piecewise-linear (hinge basis) regression model.
%
%   Y = bak_mars(FWSEG_VA, V5, ..., V28)
%
%   Takes 25 scalar predictors (the caller in this file passes 25 fwSNRseg
%   values in order) and returns the model output Y.  Each basis function
%   BFn is a hinge max(0, .) of one predictor, optionally multiplied by an
%   earlier basis function; Y is a constant plus a weighted sum of them.
%   The evaluation order of the final sum is kept identical to the original.

hinge = @(x) max(0, x);

BF1   = hinge(V21 - 0.282);
BF2   = hinge(FWSEG_VA + 9.094);
BF3   = hinge(-9.094 - FWSEG_VA);
BF5   = hinge(10.089 - V11);
BF7   = hinge(3.624 - V26) * BF3;
BF8   = hinge(V24 - 5.584) * BF5;
BF9   = hinge(5.584 - V24) * BF5;
BF10  = hinge(V19 - 8.030) * BF1;
BF11  = hinge(8.030 - V19) * BF1;
BF12  = hinge(V27 - 4.858) * BF1;
BF13  = hinge(4.858 - V27) * BF1;
BF14  = hinge(FWSEG_VA + 7.282) * BF1;
BF15  = hinge(-7.282 - FWSEG_VA) * BF1;
BF17  = hinge(9.458 - V16) * BF10;
BF18  = hinge(V27 - 10.431) * BF11;
BF19  = hinge(10.431 - V27) * BF11;
BF21  = hinge(11.059 - V22) * BF1;
BF22  = hinge(V26 - 8.675) * BF1;
BF23  = hinge(8.675 - V26) * BF1;
BF25  = hinge(11.195 - V6) * BF10;
BF26  = hinge(V8 - 7.138) * BF1;
BF27  = hinge(7.138 - V8) * BF1;
BF29  = hinge(9.006 - V10) * BF26;
BF30  = hinge(V14 - 8.210) * BF15;
BF35  = hinge(7.026 - V19) * BF15;
BF36  = hinge(V11 - 3.424) * BF27;
BF39  = hinge(5.418 - V17) * BF23;
BF40  = hinge(V28 - 6.813);
BF41  = hinge(6.813 - V28);
BF42  = hinge(V26 - 5.998) * BF14;
BF43  = hinge(5.998 - V26) * BF14;
BF44  = hinge(V5 + 0.206) * BF41;
BF45  = hinge(-0.206 - V5) * BF41;
BF46  = hinge(V22 - 7.901) * BF45;
BF49  = hinge(7.496 - V8) * BF44;
BF51  = hinge(7.904 - V11) * BF45;
BF52  = hinge(V26 - 10.938) * BF27;
BF54  = hinge(V9 - 4.507) * BF26;
BF56  = hinge(V28 - 0.549) * BF15;
BF57  = hinge(0.549 - V28) * BF15;
BF58  = hinge(V25 - 3.252) * BF41;
BF59  = hinge(3.252 - V25) * BF41;
BF60  = hinge(V23 - 7.650) * BF58;
BF61  = hinge(7.650 - V23) * BF58;
BF62  = hinge(V25 - 9.931) * BF44;
BF63  = hinge(9.931 - V25) * BF44;
BF64  = hinge(V25 - 4.923) * BF21;
BF65  = hinge(4.923 - V25) * BF21;
BF67  = hinge(3.746 - V28) * BF10;
BF68  = hinge(V11 - 5.346) * BF41;
BF69  = hinge(5.346 - V11) * BF41;
BF70  = hinge(V12 - 9.026) * BF68;
BF71  = hinge(9.026 - V12) * BF68;
BF73  = hinge(-2.668 - V28) * BF21;
BF74  = hinge(V24 - 7.028) * BF41;
BF75  = hinge(7.028 - V24) * BF41;
BF77  = hinge(-0.224 - V6) * BF74;
BF78  = hinge(V5 - 3.884);
BF79  = hinge(3.884 - V5);
BF80  = hinge(V15 - 5.019) * BF78;
BF83  = hinge(-1.880 - V28) * BF13;
BF84  = hinge(V7 - 3.067) * BF12;
BF85  = hinge(3.067 - V7) * BF12;
BF87  = hinge(5.353 - V6);
BF88  = hinge(V13 - 3.405) * BF9;
BF89  = hinge(3.405 - V13) * BF9;
BF91  = hinge(5.599 - V13) * BF45;
BF92  = hinge(V15 - 9.821) * BF8;
BF94  = hinge(V14 + 2.594) * BF79;
BF97  = hinge(8.635 - V23) * BF94;
BF99  = hinge(1.332 - V24) * BF45;
BF100 = hinge(V7 - 0.209) * BF1;

% Constant term plus weighted basis functions, summed left to right.
Y = 2.751 ...
    + 0.135 * BF1 ...
    - 0.037 * BF2 ...
    + 0.328 * BF3 ...
    - 0.098 * BF5 ...
    + 0.988 * BF7 ...
    + 0.014 * BF8 ...
    - 0.034 * BF11 ...
    - 0.011 * BF12 ...
    - 0.013 * BF13 ...
    - 0.002 * BF17 ...
    + 0.014 * BF18 ...
    + 0.004 * BF19 ...
    - 0.007 * BF21 ...
    - 0.017 * BF22 ...
    - .895791E-03 * BF25 ...
    + 0.011 * BF26 ...
    - 0.009 * BF27 ...
    - 0.007 * BF29 ...
    + 0.052 * BF30 ...
    + 0.022 * BF35 ...
    - 0.002 * BF36 ...
    - 0.005 * BF39 ...
    - 0.059 * BF40 ...
    - 0.050 * BF41 ...
    + 0.001 * BF42 ...
    + .743730E-03 * BF43 ...
    + 0.011 * BF44 ...
    + 0.022 * BF45 ...
    + 0.009 * BF46 ...
    + 0.004 * BF49 ...
    - 0.005 * BF51 ...
    + 0.010 * BF52 ...
    - 0.001 * BF54 ...
    - 0.005 * BF56 ...
    - 0.015 * BF57 ...
    - 0.032 * BF59 ...
    + 0.009 * BF60 ...
    - 0.002 * BF61 ...
    - 0.009 * BF62 ...
    - 0.001 * BF63 ...
    + .819374E-03 * BF64 ...
    + 0.002 * BF65 ...
    + 0.003 * BF67 ...
    + 0.024 * BF69 ...
    - 0.011 * BF70 ...
    - 0.004 * BF71 ...
    + 0.013 * BF73 ...
    - 0.026 * BF74 ...
    + 0.005 * BF75 ...
    + 0.253 * BF77 ...
    - 0.065 * BF78 ...
    + 0.014 * BF80 ...
    - 0.010 * BF83 ...
    + 0.001 * BF84 ...
    + 0.018 * BF85 ...
    - 0.050 * BF87 ...
    - 0.002 * BF88 ...
    - 0.020 * BF89 ...
    + 0.003 * BF91 ...
    - 0.043 * BF92 ...
    + .707581E-03 * BF97 ...
    - 0.015 * BF99 ...
    - 0.005 * BF100;
|
||||
|
||||
|
||||
function Y= sig_mars( FWSEG_VA, V5, V6, V7, V8, V9, V10, V11, V12, ...
                      V13, V14, V15, V16, V17, V18, V19, V20, ...
                      V21, V22, V23, V24, V25, V26, V27, V28)
% SIG_MARS  Evaluate a fixed piecewise-linear regression model.
%
%   Y = sig_mars(FWSEG_VA, V5, ..., V28) returns
%       2.638 + sum_k c_k * BF_k
%   where every basis function BF_k is a hinge max(0, x - knot) or
%   max(0, knot - x) on one input, optionally multiplied by a parent
%   basis function (two-way interaction).
%
%   NOTE(review): judging by the name this is presumably a MARS fit for a
%   "signal" rating -- confirm against the calling code.
%
%   Inputs are expected to be scalars: interaction terms use the matrix
%   product '*'.  V9 and V12 are accepted but never used by the model.

hinge = @(x) max(0, x);          % positive part

% ---- basis functions (numbering follows the fitted model) ------------
BF1  = hinge(V7 - 9.535);
BF2  = hinge(9.535 - V7);
BF3  = hinge(V27 - 1.578);
BF5  = hinge(V6 - 5.422);
BF6  = hinge(5.422 - V6);
BF8  = hinge(11.333 - V19);
BF10 = hinge(-6.774 - FWSEG_VA);
BF11 = hinge(V10 - 6.255) * BF8;
BF12 = hinge(6.255 - V10) * BF8;
BF13 = hinge(V24 - 3.894);
BF15 = hinge(V5 - 3.884);
BF16 = hinge(3.884 - V5);
BF17 = hinge(V28 - 7.918);
BF18 = hinge(7.918 - V28);
BF19 = hinge(V13 - 6.077) * BF18;
BF20 = hinge(6.077 - V13) * BF18;
BF22 = hinge(6.614 - V20) * BF10;
BF23 = hinge(FWSEG_VA + 0.936) * BF8;
BF25 = hinge(V23 - 5.039);
BF26 = hinge(5.039 - V23);
BF28 = hinge(9.007 - V20) * BF25;
BF29 = hinge(V25 - 7.582);
BF30 = hinge(7.582 - V25);
BF31 = hinge(V11 + 3.336) * BF16;
BF32 = hinge(V26 - 1.877);
BF35 = hinge(-5.749 - FWSEG_VA) * BF6;
BF36 = hinge(V7 - 4.451) * BF29;
BF37 = hinge(4.451 - V7) * BF29;
BF38 = hinge(V14 - 10.158);
BF39 = hinge(10.158 - V14);
BF41 = hinge(7.172 - V17) * BF39;
BF43 = hinge(7.810 - V24) * BF26;
BF44 = hinge(V8 + 1.636) * BF3;
BF45 = hinge(FWSEG_VA - 10.068) * BF39;
BF47 = hinge(V23 - 4.721) * BF30;
BF48 = hinge(4.721 - V23) * BF30;
BF50 = hinge(-2.397 - V24) * BF16;
BF51 = hinge(V14 - 1.428) * BF17;
BF53 = hinge(V16 + 1.940) * BF18;
BF54 = hinge(V10 - 9.442) * BF18;
BF56 = hinge(V10 + 2.144) * BF16;
BF58 = hinge(1.969 - V26) * BF2;
BF59 = hinge(V19 - 6.089) * BF16;
BF62 = hinge(8.952 - V21) * BF15;
BF63 = hinge(V24 - 7.371) * BF3;
BF65 = hinge(V22 - 8.908) * BF6;
BF66 = hinge(8.908 - V22) * BF6;
BF67 = hinge(V27 - 9.485) * BF30;
BF69 = hinge(V18 - 8.608) * BF10;
BF71 = hinge(V13 - 3.374) * BF25;
BF73 = hinge(V14 - 3.616) * BF13;
BF75 = hinge(V18 - 10.321) * BF32;
BF76 = hinge(10.321 - V18) * BF32;
BF78 = hinge(3.972 - V15) * BF26;
BF79 = hinge(V14 - 7.105) * BF26;
BF80 = hinge(7.105 - V14) * BF26;

% ---- weighted sum (terms kept in the original left-to-right order) ----
Y = 2.638 - 0.089*BF1 + 0.083*BF5 - 0.162*BF6 - 0.037*BF8 ...
    - 0.241*BF10 + 0.018*BF11 - 0.008*BF12 ...
    + 0.059*BF13 - 0.144*BF17 - 0.116*BF18 ...
    + 0.010*BF19 - 0.012*BF20 + 0.085*BF22 ...
    + 0.011*BF23 + 0.049*BF25 - 0.159*BF26 ...
    - 0.016*BF28 - 0.138*BF29 + 0.010*BF31 ...
    + 0.016*BF35 + 0.018*BF36 + 0.246*BF37 ...
    - 0.417*BF38 + 0.052*BF39 - 0.005*BF41 ...
    + 0.021*BF43 + 0.006*BF44 - 0.047*BF45 ...
    - 0.051*BF47 - 0.014*BF48 - 0.113*BF50 ...
    + 0.019*BF51 + 0.007*BF53 + 0.017*BF54 ...
    - 0.007*BF56 - 0.098*BF58 + 0.011*BF59 ...
    - 0.016*BF62 - 0.012*BF63 + 0.113*BF65 ...
    + 0.016*BF66 + 0.040*BF67 - 0.065*BF69 ...
    - 0.018*BF71 + 0.014*BF73 - 0.009*BF75 ...
    - 0.008*BF76 - 0.032*BF78 + 0.032*BF79 ...
    + 0.011*BF80;
|
||||
|
||||
|
||||
function Y= ovl_mars( FWSEG_VA, V5, V6, V7, V8, V9, V10, V11, V12, ...
                      V13, V14, V15, V16, V17, V18, V19, V20, ...
                      V21, V22, V23, V24, V25, V26, V27, V28)
% OVL_MARS  Evaluate a fixed piecewise-linear regression model.
%
%   Y = ovl_mars(FWSEG_VA, V5, ..., V28) returns
%       2.936 + sum_k c_k * BF_k
%   where every basis function BF_k is a hinge max(0, x - knot) or
%   max(0, knot - x) on one input, optionally multiplied by a parent
%   basis function (interactions up to three factors deep).
%
%   NOTE(review): judging by the name this is presumably a MARS fit for an
%   "overall" rating -- confirm against the calling code.
%
%   Inputs are expected to be scalars: interaction terms use the matrix
%   product '*'.  V9, V15 and V20 are accepted but never used by the model.

hinge = @(x) max(0, x);          % positive part

% ---- basis functions (numbering follows the fitted model) ------------
BF1   = hinge(V21 - 4.671);
BF3   = hinge(V6 - 5.396);
BF4   = hinge(5.396 - V6);
BF7   = hinge(V11 - 7.884);
BF8   = hinge(7.884 - V11);
BF9   = hinge(FWSEG_VA + 7.229) * BF1;
BF10  = hinge(-7.229 - FWSEG_VA) * BF1;
BF11  = hinge(V19 - 8.128) * BF1;
BF12  = hinge(8.128 - V19) * BF1;
BF13  = hinge(V28 - 7.918);
BF14  = hinge(7.918 - V28);
BF15  = hinge(V5 + 2.888) * BF14;
BF16  = hinge(-2.888 - V5) * BF14;
BF17  = hinge(V24 - 2.924) * BF8;
BF18  = hinge(2.924 - V24) * BF8;
BF20  = hinge(9.071 - V16) * BF15;
BF21  = hinge(V10 - 6.286) * BF14;
BF22  = hinge(6.286 - V10) * BF14;
BF24  = hinge(V23 - 5.173);
BF25  = hinge(5.173 - V23);
BF26  = hinge(V26 - 8.987);
BF29  = hinge(12.216 - V27) * BF3;
BF30  = hinge(V8 - 4.306) * BF16;
BF34  = hinge(V23 - 7.630) * BF21;
BF35  = hinge(7.630 - V23) * BF21;
BF37  = hinge(3.638 - V7) * BF1;
BF39  = hinge(8.337 - V21) * BF17;
BF41  = hinge(1.590 - V5) * BF11;
BF43  = hinge(13.993 - V8) * BF11;
BF44  = hinge(V14 - 5.993) * BF25;
BF45  = hinge(5.993 - V14) * BF25;
BF46  = hinge(V24 - 1.035);
BF47  = hinge(1.035 - V24);
BF49  = hinge(8.915 - V23) * BF12;
BF51  = hinge(-0.004 - FWSEG_VA);
BF52  = hinge(V27 - 6.520) * BF24;
BF53  = hinge(6.520 - V27) * BF24;
BF54  = hinge(V7 - 11.484) * BF8;
BF55  = hinge(11.484 - V7) * BF8;
BF57  = hinge(5.742 - V17) * BF25;
BF58  = hinge(V12 - 6.949) * BF12;
BF59  = hinge(6.949 - V12) * BF12;
BF60  = hinge(V25 - 9.203) * BF45;
BF63  = hinge(1.887 - V13) * BF7;
BF65  = hinge(9.498 - V26) * BF15;
BF66  = hinge(V5 - 6.566) * BF22;
BF71  = hinge(13.239 - V19) * BF46;
BF72  = hinge(V19 - 9.925) * BF55;
BF77  = hinge(3.430 - V22) * BF18;
BF78  = hinge(V27 - 6.513) * BF45;
BF79  = hinge(6.513 - V27) * BF45;
BF81  = hinge(12.511 - V18);
BF82  = hinge(V11 - 6.777) * BF81;
BF83  = hinge(6.777 - V11) * BF81;
BF85  = hinge(3.433 - V5) * BF47;
BF87  = hinge(-3.524 - FWSEG_VA) * BF47;
BF88  = hinge(V27 - 11.604) * BF9;
BF91  = hinge(8.845 - V26) * BF52;
BF92  = hinge(V14 - 5.931) * BF82;
BF93  = hinge(5.931 - V14) * BF82;
BF94  = hinge(V21 - 7.245) * BF25;
BF95  = hinge(7.245 - V21) * BF25;
BF96  = hinge(V14 - 5.323) * BF7;
BF98  = hinge(V10 - 6.248) * BF71;
BF100 = hinge(V18 - 0.602) * BF95;

% ---- weighted sum (terms kept in the original left-to-right order) ----
Y = 2.936 + 0.047*BF1 + 0.061*BF3 - 0.084*BF4 - 0.139*BF8 ...
    - 0.064*BF10 - 0.030*BF12 - 0.103*BF13 ...
    - 0.039*BF14 + 0.020*BF17 - 0.002*BF20 ...
    - 0.005*BF22 - 0.114*BF25 - 0.090*BF26 ...
    - 0.011*BF29 + 0.010*BF30 + 0.009*BF34 ...
    + 0.002*BF35 + 0.079*BF37 - 0.006*BF39 ...
    + 0.007*BF41 - 0.003*BF43 + 0.017*BF44 ...
    + 0.076*BF47 + 0.009*BF49 + 0.016*BF51 ...
    - 0.042*BF53 - 0.079*BF54 - 0.030*BF57 ...
    - 0.018*BF58 - 0.009*BF59 - 0.119*BF60 ...
    - 0.210*BF63 - 0.456802E-03*BF65 + 0.028*BF66 ...
    + 0.020*BF72 + 0.011*BF77 + 0.005*BF78 ...
    + 0.003*BF79 - 0.049*BF81 + 0.012*BF83 ...
    - 0.030*BF85 + 0.070*BF87 + 0.008*BF88 ...
    - 0.008*BF91 + 0.010*BF92 + 0.003*BF93 ...
    + 0.022*BF94 - 0.038*BF96 + 0.933766E-03*BF98 ...
    + 0.002*BF100;
|
||||
|
||||
|
||||
|
||||
function distortion = fwseg(clean_speech, processed_speech,sample_rate)
% FWSEG  Per-frame, per-band segmental SNR on a 25-band critical-band filterbank.
%
%   distortion = fwseg(clean_speech, processed_speech, sample_rate)
%
%   clean_speech, processed_speech - signal vectors of equal length
%                                    (row or column; handled as columns)
%   sample_rate                    - sampling frequency in Hz
%   distortion                     - num_frames-by-25 matrix; entry (f,b) is
%                                    10*log10(Ec^2 / (Ec - Ep)^2) for frame f
%                                    and band b, clipped to [-10, 35] dB,
%                                    where Ec / Ep are the band outputs of
%                                    the clean / processed frame.
%
%   An error is raised when the two signals differ in length.

% Work on columns so the frames line up with the column window below.
clean_speech     = clean_speech(:);
processed_speech = processed_speech(:);

clean_length = length(clean_speech);
if clean_length ~= length(processed_speech)
    % Returning here would leave the output unassigned, so fail explicitly.
    error('fwseg:lengthMismatch', ...
          'Clean and processed signals must have the same length.');
end

% ----------------------------------------------------------------------
% Analysis parameters
% ----------------------------------------------------------------------
winlength = round(30*sample_rate/1000);    % 30 ms window, in samples
skiprate  = floor(winlength/4);            % hop size (75% overlap)
max_freq  = sample_rate/2;                 % Nyquist frequency
num_crit  = 25;                            % number of critical bands

n_fft    = 2^nextpow2(2*winlength);
n_fftby2 = n_fft/2;

% ----------------------------------------------------------------------
% Critical-band centre frequencies and bandwidths (Hz)
% ----------------------------------------------------------------------
cent_freq = [ 50.0000  120.000  190.000  260.000  330.000 ...
             400.000  470.000  540.000  617.372  703.378 ...
             798.717  904.128  1020.38  1148.30  1288.72 ...
             1442.54  1610.70  1794.16  1993.93  2211.08 ...
             2446.71  2701.97  2978.04  3276.17  3597.63];
bandwidth = [70.0000  70.0000  70.0000  70.0000  70.0000 ...
             70.0000  70.0000  77.3724  86.0056  95.3398 ...
             105.411  116.256  127.914  140.423  153.823 ...
             168.154  183.457  199.776  217.153  235.631 ...
             255.255  276.072  298.126  321.465  346.136];
bw_min = bandwidth(1);                     % narrowest band

% ----------------------------------------------------------------------
% Gaussian-shaped band filters on the FFT bin grid.  The peak of each
% filter is scaled by bw_min/bandwidth(i); weights at or below the
% -30 dB point are set to zero.
% ----------------------------------------------------------------------
min_factor  = exp(-30.0 / (2.0 * 2.303));
bin_idx     = 0:n_fftby2-1;
crit_filter = zeros(num_crit, n_fftby2);
for i = 1:num_crit
    f0 = (cent_freq(i) / max_freq) * n_fftby2;     % centre, in bins
    bw = (bandwidth(i) / max_freq) * n_fftby2;     % width, in bins
    norm_factor = log(bw_min) - log(bandwidth(i));
    crit_filter(i,:) = exp(-11 * (((bin_idx - floor(f0)) ./ bw).^2) + norm_factor);
    crit_filter(i,:) = crit_filter(i,:) .* (crit_filter(i,:) > min_factor);
end

% ----------------------------------------------------------------------
% Frame-by-frame band SNR
% ----------------------------------------------------------------------
num_frames = floor(clean_length/skiprate - (winlength/skiprate));
start      = 1;
window     = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));   % Hanning

distortion = zeros(num_frames, num_crit);
for frame_count = 1:num_frames

    % (1) windowed frames
    clean_frame     = clean_speech(start:start+winlength-1)     .* window;
    processed_frame = processed_speech(start:start+winlength-1) .* window;

    % (2) magnitude spectra, normalised to unit area over the lower half
    clean_spec     = abs(fft(clean_frame, n_fft));
    processed_spec = abs(fft(processed_frame, n_fft));
    clean_spec     = clean_spec / sum(clean_spec(1:n_fftby2));
    processed_spec = processed_spec / sum(processed_spec(1:n_fftby2));

    % (3) filterbank outputs and squared error per band
    clean_energy     = zeros(1, num_crit);
    processed_energy = zeros(1, num_crit);
    error_energy     = zeros(1, num_crit);
    for i = 1:num_crit
        clean_energy(i)     = sum(clean_spec(1:n_fftby2)     .* crit_filter(i,:)');
        processed_energy(i) = sum(processed_spec(1:n_fftby2) .* crit_filter(i,:)');
        % floor at eps so identical bands do not divide by zero
        error_energy(i) = max((clean_energy(i) - processed_energy(i))^2, eps);
    end

    % (4) band SNR in dB, limited to [-10, 35]
    SNRlog = 10*log10((clean_energy.^2) ./ error_energy);
    distortion(frame_count,:) = min(max(SNRlog, -10), 35);

    start = start + skiprate;
end
|
||||
|
||||
@@ -0,0 +1,221 @@
|
||||
function [SIG,BAK,OVL]= comp_fwseg_variant(cleanFile, enhancedFile)
% ----------------------------------------------------------------------
%      Frequency-variant fwSNRseg Objective Speech Quality Measure
%
%   This function implements the frequency-variant fwSNRseg measure [1]
%   (see also Chap. 10, Eq. 10.24)
%
%   Usage:  [sig,bak,ovl]=comp_fwseg_variant(cleanFile.wav, enhancedFile.wav)
%
%         cleanFile.wav - clean input file in .wav format
%         enhancedFile  - enhanced output file in .wav format
%         sig           - predicted rating [1-5] of speech distortion
%         bak           - predicted rating [1-5] of noise distortion
%         ovl           - predicted rating [1-5] of overall quality
%
%   Both files must share sampling rate and bit depth, otherwise an error
%   is raised.  The longer file is truncated to the length of the shorter.
%
%   Example call:  [s,b,o] =comp_fwseg_variant('sp04.wav','enhanced.wav')
%
%   References:
%     [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements,
%         Objective Measures of Speech Quality.  Prentice Hall
%         Advanced Reference Series, Englewood Cliffs, NJ, 1988,
%         ISBN: 0-13-629056-6.
%
%   Author: Philipos C. Loizou
%   (critical-band filtering routines were written by Bryan Pellom & John Hansen)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
% ----------------------------------------------------------------------

if nargin~=2
    fprintf('USAGE: [sig,bak,ovl]=comp_fwseg_variant(cleanFile.wav, enhancedFile.wav)\n');
    fprintf('For more help, type: help comp_fwseg_variant\n\n');
    return;
end

[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhancedFile);
if (Srate1 ~= Srate2) || (Nbits1 ~= Nbits2)
    % single-argument error() does not expand '\n', so none is used here
    error('The two files do not match!');
end

% Truncate to the common length; eps keeps all-zero frames off exact zero.
len   = min(length(data1), length(data2));
data1 = data1(1:len) + eps;
data2 = data2(1:len) + eps;

% Average band SNR over frames.  The explicit dimension keeps a 1x25
% result even when there is only a single frame.
band_snr = mean(fwseg(data1, data2, Srate1), 1);

% Coefficients obtained from multiple linear regression analysis
% (one weight per critical band).
b_sig=[0.021,-0.028,0.088,-0.031,0.048,-0.049,0.065,0.009,0.011,0.033,...
    -0.040,-0.002,0.041,-0.007,0.033,0.018,-0.007,0.044,-0.001,0.021,...
    -0.002,0.017,-0.03,0.073,0.043];
b_ovl=[-0.003,-0.026,0.066,-0.036,0.038,-0.023,0.037,0.022,0.014,0.009,...
    -0.03,0.004,0.044,-0.005,0.017,0.018,-0.001,0.051,0.009,0.011,...
    0.011,-0.002,-0.021,0.043,0.031];
b_bak=[-0.03,-0.022,0.03,-0.048,0.034,0.002,0.006,0.037,0.017,-0.016,-0.008,...
    0.019,0.024,-0.002,0.01,0.03,-0.018,0.046,0.022,0.005,0.03,-0.028,...
    -0.028,0.019,0.005];

% Linear predictions, each limited to the rating scale [1, 5].
SIG = 0.567 + sum(b_sig .* band_snr);
SIG = min(5, max(1, SIG));

BAK = 1.013 + sum(b_bak .* band_snr);
BAK = min(5, max(1, BAK));

OVL = 0.446 + sum(b_ovl .* band_snr);
OVL = min(5, max(1, OVL));
|
||||
|
||||
|
||||
% ----------------------------------------------------------------------
|
||||
|
||||
function distortion = fwseg(clean_speech, processed_speech,sample_rate)
% FWSEG  Per-frame, per-band segmental SNR on a 25-band critical-band filterbank.
%
%   distortion = fwseg(clean_speech, processed_speech, sample_rate)
%
%   clean_speech, processed_speech - signal vectors of equal length
%                                    (row or column; handled as columns)
%   sample_rate                    - sampling frequency in Hz
%   distortion                     - num_frames-by-25 matrix; entry (f,b) is
%                                    10*log10(Ec^2 / (Ec - Ep)^2) for frame f
%                                    and band b, clipped to [-10, 35] dB,
%                                    where Ec / Ep are the band outputs of
%                                    the clean / processed frame.
%
%   An error is raised when the two signals differ in length.

% Work on columns so the frames line up with the column window below.
clean_speech     = clean_speech(:);
processed_speech = processed_speech(:);

clean_length = length(clean_speech);
if clean_length ~= length(processed_speech)
    % Returning here would leave the output unassigned, so fail explicitly.
    error('fwseg:lengthMismatch', ...
          'Clean and processed signals must have the same length.');
end

% ----------------------------------------------------------------------
% Analysis parameters
% ----------------------------------------------------------------------
winlength = round(30*sample_rate/1000);    % 30 ms window, in samples
skiprate  = floor(winlength/4);            % hop size (75% overlap)
max_freq  = sample_rate/2;                 % Nyquist frequency
num_crit  = 25;                            % number of critical bands

n_fft    = 2^nextpow2(2*winlength);
n_fftby2 = n_fft/2;

% ----------------------------------------------------------------------
% Critical-band centre frequencies and bandwidths (Hz)
% ----------------------------------------------------------------------
cent_freq = [ 50.0000  120.000  190.000  260.000  330.000 ...
             400.000  470.000  540.000  617.372  703.378 ...
             798.717  904.128  1020.38  1148.30  1288.72 ...
             1442.54  1610.70  1794.16  1993.93  2211.08 ...
             2446.71  2701.97  2978.04  3276.17  3597.63];
bandwidth = [70.0000  70.0000  70.0000  70.0000  70.0000 ...
             70.0000  70.0000  77.3724  86.0056  95.3398 ...
             105.411  116.256  127.914  140.423  153.823 ...
             168.154  183.457  199.776  217.153  235.631 ...
             255.255  276.072  298.126  321.465  346.136];
bw_min = bandwidth(1);                     % narrowest band

% ----------------------------------------------------------------------
% Gaussian-shaped band filters on the FFT bin grid.  The peak of each
% filter is scaled by bw_min/bandwidth(i); weights at or below the
% -30 dB point are set to zero.
% ----------------------------------------------------------------------
min_factor  = exp(-30.0 / (2.0 * 2.303));
bin_idx     = 0:n_fftby2-1;
crit_filter = zeros(num_crit, n_fftby2);
for i = 1:num_crit
    f0 = (cent_freq(i) / max_freq) * n_fftby2;     % centre, in bins
    bw = (bandwidth(i) / max_freq) * n_fftby2;     % width, in bins
    norm_factor = log(bw_min) - log(bandwidth(i));
    crit_filter(i,:) = exp(-11 * (((bin_idx - floor(f0)) ./ bw).^2) + norm_factor);
    crit_filter(i,:) = crit_filter(i,:) .* (crit_filter(i,:) > min_factor);
end

% ----------------------------------------------------------------------
% Frame-by-frame band SNR
% ----------------------------------------------------------------------
num_frames = floor(clean_length/skiprate - (winlength/skiprate));
start      = 1;
window     = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));   % Hanning

distortion = zeros(num_frames, num_crit);
for frame_count = 1:num_frames

    % (1) windowed frames
    clean_frame     = clean_speech(start:start+winlength-1)     .* window;
    processed_frame = processed_speech(start:start+winlength-1) .* window;

    % (2) magnitude spectra, normalised to unit area over the lower half
    clean_spec     = abs(fft(clean_frame, n_fft));
    processed_spec = abs(fft(processed_frame, n_fft));
    clean_spec     = clean_spec / sum(clean_spec(1:n_fftby2));
    processed_spec = processed_spec / sum(processed_spec(1:n_fftby2));

    % (3) filterbank outputs and squared error per band
    clean_energy     = zeros(1, num_crit);
    processed_energy = zeros(1, num_crit);
    error_energy     = zeros(1, num_crit);
    for i = 1:num_crit
        clean_energy(i)     = sum(clean_spec(1:n_fftby2)     .* crit_filter(i,:)');
        processed_energy(i) = sum(processed_spec(1:n_fftby2) .* crit_filter(i,:)');
        % floor at eps so identical bands do not divide by zero
        error_energy(i) = max((clean_energy(i) - processed_energy(i))^2, eps);
    end

    % (4) band SNR in dB, limited to [-10, 35]
    SNRlog = 10*log10((clean_energy.^2) ./ error_energy);
    distortion(frame_count,:) = min(max(SNRlog, -10), 35);

    start = start + skiprate;
end
|
||||
|
||||
@@ -0,0 +1,188 @@
|
||||
function is_mean= comp_is(cleanFile, enhdFile)
% ----------------------------------------------------------------------
%          Itakura-Saito (IS) Objective Speech Quality Measure
%
%   This function implements the Itakura-Saito distance measure
%   defined on page 50 of [1] (see Equation 2.26).  See also
%   Equation 12 (page 1480) of [2].
%
%   Usage:  IS=comp_is(cleanFile.wav, enhancedFile.wav)
%
%         cleanFile.wav - clean input file in .wav format
%         enhancedFile  - enhanced output file in .wav format
%         IS            - computed Itakura Saito measure
%
%   The per-frame values returned by is() are sorted and the mean of the
%   lowest 95% (alpha = 0.95) is reported.  Per the original header the
%   IS measure is limited to the range [0, 100].
%
%   Both files must share sampling rate and bit depth, otherwise an error
%   is raised.  The longer file is truncated to the length of the shorter.
%
%   Example call:  IS =comp_is('sp04.wav','enhanced.wav')
%
%   References:
%     [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements,
%         Objective Measures of Speech Quality.  Prentice Hall
%         Advanced Reference Series, Englewood Cliffs, NJ, 1988,
%         ISBN: 0-13-629056-6.
%     [2] B.-H. Juang, "On Using the Itakura-Saito Measures for
%         Speech Coder Performance Evaluation", AT&T Bell
%         Laboratories Technical Journal, Vol. 63, No. 8,
%         October 1984, pp. 1477-1498.
%
%   Authors: Bryan L. Pellom and John H. L. Hansen (July 1998)
%   Modified by: Philipos C. Loizou  (Oct 2006) - limited IS to be in [0,100]
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
% ----------------------------------------------------------------------

if nargin~=2
    fprintf('USAGE: IS=comp_is(cleanFile.wav, enhancedFile.wav)\n');
    fprintf('For more help, type: help comp_is\n\n');
    return;
end

alpha = 0.95;      % fraction of (lowest) frame scores kept in the average

[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhdFile);
if (Srate1 ~= Srate2) || (Nbits1 ~= Nbits2)
    % single-argument error() does not expand '\n', so none is used here
    error('The two files do not match!');
end

% Truncate to the common length; eps keeps all-zero frames off exact zero.
len   = min(length(data1), length(data2));
data1 = data1(1:len) + eps;
data2 = data2(1:len) + eps;

frame_scores = sort(is(data1, data2, Srate1));       % ascending
n_keep       = round(length(frame_scores) * alpha);

is_mean = mean(frame_scores(1:n_keep));
|
||||
|
||||
|
||||
|
||||
function distortion = is(clean_speech, processed_speech,sample_rate)
% IS  Frame-by-frame Itakura-Saito distance between two signals.
%
%   distortion = is(clean_speech, processed_speech, sample_rate)
%
%   clean_speech, processed_speech - signal vectors of equal length
%                                    (row or column; handled as columns)
%   sample_rate                    - sampling frequency in Hz
%   distortion                     - 1-by-num_frames row vector, one value
%                                    per 30 ms frame (75% overlap), each
%                                    capped at 100.  Empty when the signal
%                                    is too short for a single frame.
%
%   An error is raised when the two signals differ in length.

% Work on columns so the frames line up with the column window below.
clean_speech     = clean_speech(:);
processed_speech = processed_speech(:);

clean_length = length(clean_speech);
if clean_length ~= length(processed_speech)
    % Returning here would leave the output unassigned, so fail explicitly.
    error('is:lengthMismatch', ...
          'Both Speech Files must be same length.');
end

% ----------------------------------------------------------------------
% Analysis parameters
% ----------------------------------------------------------------------
winlength = round(30*sample_rate/1000);    % 30 ms window, in samples
skiprate  = floor(winlength/4);            % hop size
if sample_rate<10000
    P = 10;                                % LPC analysis order
else
    P = 16;                                % higher order for wider bandwidth
end

% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Itakura-Saito measure
% ----------------------------------------------------------------------
num_frames = floor(clean_length/skiprate - (winlength/skiprate));
start      = 1;
window     = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));   % Hanning

distortion = zeros(1, num_frames);         % preallocated; empty if no frames
for frame_count = 1:num_frames

    % (1) windowed frames
    clean_frame     = clean_speech(start:start+winlength-1)     .* window;
    processed_frame = processed_speech(start:start+winlength-1) .* window;

    % (2) autocorrelation lags and LPC coefficients of both frames
    [R_clean, Ref_clean, A_clean] = lpcoeff(clean_frame, P);                 %#ok<ASGLU>
    [R_processed, Ref_processed, A_processed] = lpcoeff(processed_frame, P); %#ok<ASGLU>

    % (3) IS measure; denominators are floored at eps to avoid division
    %     by zero
    numerator      = A_processed*toeplitz(R_clean)*A_processed';
    denominator    = max(A_clean*toeplitz(R_clean)*A_clean', eps);
    gain_clean     = max(R_clean*A_clean', eps);          % gain squared (sigma^2)
    gain_processed = max(R_processed*A_processed', eps);  % gain squared (sigma^2)

    ISvalue = (gain_clean/gain_processed)*(numerator/denominator) + ...
              log(gain_processed/gain_clean) - 1;

    distortion(frame_count) = min(ISvalue, 100);
    start = start + skiprate;
end
|
||||
|
||||
|
||||
|
||||
function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order)
% LPCOEFF  Autocorrelation lags and LPC coefficients of one frame.
%
%   [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order)
%
%   speech_frame - vector of samples
%   model_order  - prediction order p
%   acorr        - 1-by-(p+1) autocorrelation lags 0..p
%   refcoeff     - 1-by-p reflection coefficients
%   lpparams     - 1-by-(p+1) vector [1, -a(1), ..., -a(p)]

% ----------------------------------------------------------
% (1) Autocorrelation lags 0..model_order
% ----------------------------------------------------------
n = max(size(speech_frame));
acorr = zeros(1, model_order+1);
for lag = 0:model_order
    acorr(lag+1) = sum(speech_frame(1:n-lag) .* speech_frame(lag+1:n));
end

% ----------------------------------------------------------
% (2) Levinson-Durbin recursion
% ----------------------------------------------------------
coeffs   = ones(1, model_order);
pred_err = zeros(1, model_order+1);     % prediction error per order
pred_err(1) = acorr(1);
for m = 1:model_order
    prev = coeffs(1:m-1);               % coefficients of order m-1
    k = (acorr(m+1) - sum(prev .* acorr(m:-1:2))) / pred_err(m);
    refcoeff(m)   = k;
    coeffs(1:m-1) = prev - k .* prev(m-1:-1:1);
    coeffs(m)     = k;
    pred_err(m+1) = (1 - k*k) * pred_err(m);
end

lpparams = [1 -coeffs];
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,162 @@
|
||||
function llr_mean= comp_llr(cleanFile, enhancedFile)
% ----------------------------------------------------------------------
%      Log Likelihood Ratio (LLR) Objective Speech Quality Measure
%
%   This function implements the Log Likelihood Ratio Measure
%   defined on page 48 of [1] (see Equation 2.18).
%
%   Usage:  llr=comp_llr(cleanFile.wav, enhancedFile.wav)
%
%         cleanFile.wav - clean input file in .wav format
%         enhancedFile  - enhanced output file in .wav format
%         llr           - computed likelihood ratio
%
%   The per-frame values returned by llr() are sorted and the mean of the
%   lowest 95% (alpha = 0.95) is reported.  Per the original header the
%   LLR measure is limited to the range [0, 2].
%
%   Both files must share sampling rate and bit depth, otherwise an error
%   is raised.  The longer file is truncated to the length of the shorter.
%
%   Example call:  llr =comp_llr('sp04.wav','enhanced.wav')
%
%   References:
%     [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements,
%         Objective Measures of Speech Quality.  Prentice Hall
%         Advanced Reference Series, Englewood Cliffs, NJ, 1988,
%         ISBN: 0-13-629056-6.
%
%   Authors: Bryan L. Pellom and John H. L. Hansen (July 1998)
%   Modified by: Philipos C. Loizou  (Oct 2006) - limited LLR to be in [0,2]
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
% ----------------------------------------------------------------------

if nargin~=2
    fprintf('USAGE: LLR=comp_llr(cleanFile.wav, enhancedFile.wav)\n');
    fprintf('For more help, type: help comp_llr\n\n');
    return;
end

alpha = 0.95;      % fraction of (lowest) frame scores kept in the average

[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhancedFile);
if (Srate1 ~= Srate2) || (Nbits1 ~= Nbits2)
    % single-argument error() does not expand '\n', so none is used here
    error('The two files do not match!');
end

% Truncate to the common length; eps keeps all-zero frames off exact zero.
len   = min(length(data1), length(data2));
data1 = data1(1:len) + eps;
data2 = data2(1:len) + eps;

frame_scores = sort(llr(data1, data2, Srate1));      % ascending
n_keep       = round(length(frame_scores) * alpha);

llr_mean = mean(frame_scores(1:n_keep));
|
||||
|
||||
|
||||
|
||||
function distortion = llr(clean_speech, processed_speech,sample_rate)
% LLR  Frame-by-frame Log Likelihood Ratio between two speech signals.
%
%   distortion = llr(clean_speech, processed_speech, sample_rate)
%
%   clean_speech     - reference signal; must be a column vector because
%                      each frame is multiplied element-wise by a column
%                      window
%   processed_speech - signal under test, same length as clean_speech
%   sample_rate      - sampling frequency in Hz
%
%   distortion       - 1-by-num_frames row vector holding one LLR value
%                      per analysis frame, each limited from above to 2.
%                      Empty when the signal is shorter than one window
%                      plus one hop.
%
%   Raises an error when the two signals differ in length.

% ----------------------------------------------------------------------
% Check the length of the clean and processed speech. Must be the same.
% ----------------------------------------------------------------------

clean_length     = length(clean_speech);
processed_length = length(processed_speech);

if (clean_length ~= processed_length)
    % Fail with the real reason instead of returning with the output
    % argument unassigned.
    error('llr:lengthMismatch', 'Both speech files must be the same length.');
end

% ----------------------------------------------------------------------
% Analysis parameters
% ----------------------------------------------------------------------

winlength = round(30*sample_rate/1000);   % 30 ms window, in samples
skiprate  = floor(winlength/4);           % hop size (quarter window)
if sample_rate < 10000
    P = 10;                               % LPC analysis order
else
    P = 16;                               % higher order for wider bandwidth
end

% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Log Likelihood Ratio
% ----------------------------------------------------------------------

% Integer frame count computed from integer quantities, so it does not
% depend on how a fractional colon bound is rounded.
num_frames = floor((clean_length - winlength)/skiprate);
distortion = zeros(1, max(num_frames, 0));

% Hanning-shaped window (column vector)
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));

start = 1;                                % first sample of current frame
for frame_count = 1:num_frames

    % (1) Extract and window the reference and test frames.
    clean_frame     = clean_speech(start:start+winlength-1).*window;
    processed_frame = processed_speech(start:start+winlength-1).*window;

    % (2) Autocorrelation lags and LPC polynomials.  Only R_clean and
    %     the two polynomials are used below.
    [R_clean, Ref_clean, A_clean] = lpcoeff(clean_frame, P);
    [R_processed, Ref_processed, A_processed] = lpcoeff(processed_frame, P);

    % (3) LLR: both quadratic forms use the autocorrelation matrix of
    %     the clean frame, so it is built once.
    R_matrix    = toeplitz(R_clean);
    numerator   = A_processed*R_matrix*A_processed';
    denominator = A_clean*R_matrix*A_clean';
    distortion(frame_count) = min(2,log(numerator/denominator));

    start = start + skiprate;
end
|
||||
|
||||
|
||||
function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order)
% LPCOEFF  Autocorrelation lags, reflection coefficients and LPC polynomial.
%
%   [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order)
%
%   speech_frame - vector of (windowed) samples
%   model_order  - LPC order P
%
%   acorr        - 1-by-(P+1) autocorrelation lags R(0)..R(P)
%   refcoeff     - 1-by-P reflection coefficients from the recursion
%   lpparams     - 1-by-(P+1) prediction polynomial [1, -a(1), ..., -a(P)]
%
%   No guard is applied against a zero-energy frame: the recursion
%   divides by the running prediction error, which starts at R(0).

% ----------------------------------------------------------
% (1) Compute Autocorrelation Lags
% ----------------------------------------------------------

winlength = max(size(speech_frame));
R = zeros(1, model_order+1);
for k = 1:model_order+1
    % lag k-1: products of samples that are k-1 apart
    R(k) = sum(speech_frame(1:winlength-k+1) ...
               .*speech_frame(k:winlength));
end

% ----------------------------------------------------------
% (2) Levinson-Durbin
% ----------------------------------------------------------

a      = ones(1, model_order);
rcoeff = zeros(1, model_order);
E      = zeros(1, model_order+1);   % prediction error after each order
E(1)   = R(1);

% "m" rather than "i" so the imaginary unit is not shadowed.
for m = 1:model_order
    a_past    = a(1:m-1);                          % order m-1 coefficients
    sum_term  = sum(a_past.*R(m:-1:2));
    rcoeff(m) = (R(m+1) - sum_term) / E(m);
    a(m)      = rcoeff(m);
    a(1:m-1)  = a_past - rcoeff(m).*a_past(m-1:-1:1);
    E(m+1)    = (1 - rcoeff(m)*rcoeff(m))*E(m);
end

acorr    = R;
refcoeff = rcoeff;
lpparams = [1 -a];
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,132 @@
|
||||
function [snr_mean, segsnr_mean]= comp_SNR(cleanFile, enhdFile)
%
%   Segmental Signal-to-Noise Ratio Objective Speech Quality Measure
%
%   This function implements the segmental signal-to-noise ratio
%   as defined in [1, p. 45] (see Equation 2.12).
%
%   Usage:  [SNRovl, SNRseg]=comp_snr(cleanFile.wav, enhancedFile.wav)
%
%         cleanFile.wav - clean input file in .wav format
%         enhancedFile  - enhanced output file in .wav format
%         SNRovl        - overall SNR (dB)
%         SNRseg        - segmental SNR (dB), averaged over all frames
%
%   The first output is the overall SNR of the two signals.  The second
%   is the mean of the per-frame segmental SNR values, each of which is
%   clamped to the range -10 dB .. 35 dB (see suggestions in [2]).
%
%   The two files must have the same sampling rate and bit depth; the
%   longer one is truncated to the length of the shorter one.
%
%   Example call:  [SNRovl,SNRseg]=comp_SNR('sp04.wav','enhanced.wav')
%
%   References:
%
%     [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements,
%         Objective Measures of Speech Quality.  Prentice Hall
%         Advanced Reference Series, Englewood Cliffs, NJ, 1988,
%         ISBN: 0-13-629056-6.
%
%     [2] P. E. Papamichalis, Practical Approaches to Speech
%         Coding, Prentice-Hall, Englewood Cliffs, NJ, 1987.
%         ISBN: 0-13-689019-9. (see pages 179-181).
%
%   Authors: Bryan L. Pellom and John H. L. Hansen (July 1998)
%   Modified by: Philipos C. Loizou  (Oct 2006)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
%-------------------------------------------------------------------------

if nargin ~=2
    fprintf('USAGE: [snr_mean, segsnr_mean]= comp_SNR(cleanFile, enhdFile) \n');
    return;
end

% NOTE(review): wavread is not available in recent MATLAB releases
% (audioread/audioinfo replace it) - confirm the target version.
[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhdFile);

% Scalar comparison, so use the short-circuit operator.  The message has
% no trailing "\n": error() called with a message only prints escape
% sequences literally.
if (Srate1 ~= Srate2) || (Nbits1 ~= Nbits2)
    error('comp_SNR:fileMismatch', 'The two files do not match!');
end

% Compare only the part both files have in common.
len   = min(length(data1), length(data2));
data1 = data1(1:len);
data2 = data2(1:len);

[snr_dist, segsnr_dist] = snr(data1, data2, Srate1);

snr_mean    = snr_dist;              % overall SNR is already a scalar
segsnr_mean = mean(segsnr_dist);     % average of the per-frame values
|
||||
|
||||
|
||||
% =========================================================================
|
||||
function [overall_snr, segmental_snr] = snr(clean_speech, processed_speech,sample_rate)
% SNR  Overall and per-frame (segmental) signal-to-noise ratio in dB.
%
%   [overall_snr, segmental_snr] = snr(clean_speech, processed_speech, sample_rate)
%
%   clean_speech     - reference signal; must be a column vector because
%                      each frame is multiplied element-wise by a column
%                      window
%   processed_speech - signal under test, same length as clean_speech
%   sample_rate      - sampling frequency in Hz
%
%   overall_snr      - scalar, 10*log10(signal energy / error energy)
%                      over the whole signal
%   segmental_snr    - 1-by-num_frames row vector of per-frame SNR values,
%                      each clamped to [MIN_SNR, MAX_SNR].  Empty when the
%                      signal is shorter than one window plus one hop.
%
%   Raises an error when the two signals differ in length.

% ----------------------------------------------------------------------
% Check the length of the clean and processed speech. Must be the same.
% ----------------------------------------------------------------------

clean_length     = length(clean_speech);
processed_length = length(processed_speech);

if (clean_length ~= processed_length)
    % Fail with the real reason instead of returning with the output
    % arguments unassigned.
    error('snr:lengthMismatch', 'Both speech files must be the same length.');
end

% ----------------------------------------------------------------------
% Overall SNR.  The signals are compared as given: no DC removal or
% gain matching is applied here.
% ----------------------------------------------------------------------

overall_snr = 10* log10( sum(clean_speech.^2)/sum((clean_speech-processed_speech).^2));

% ----------------------------------------------------------------------
% Analysis parameters
% ----------------------------------------------------------------------

winlength = round(30*sample_rate/1000);   % 30 ms window, in samples
skiprate  = floor(winlength/4);           % hop size (quarter window)
MIN_SNR   = -10;                          % lower clamp in dB
MAX_SNR   = 35;                           % upper clamp in dB

% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Segmental SNR
% ----------------------------------------------------------------------

% Integer frame count computed from integer quantities, so it does not
% depend on how a fractional colon bound is rounded.
num_frames    = floor((clean_length - winlength)/skiprate);
segmental_snr = zeros(1, max(num_frames, 0));

% Hanning-shaped window (column vector)
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));

start = 1;                                % first sample of current frame
for frame_count = 1:num_frames

    % (1) Extract and window the reference and test frames.
    clean_frame     = clean_speech(start:start+winlength-1).*window;
    processed_frame = processed_speech(start:start+winlength-1).*window;

    % (2) Frame SNR.  eps keeps the ratio and the logarithm finite when
    %     a frame has zero error or zero signal energy.
    signal_energy = sum(clean_frame.^2);
    noise_energy  = sum((clean_frame-processed_frame).^2);
    frame_snr     = 10*log10(signal_energy/(noise_energy+eps)+eps);

    segmental_snr(frame_count) = min(max(frame_snr, MIN_SNR), MAX_SNR);

    start = start + skiprate;
end
|
||||
|
||||
@@ -0,0 +1,299 @@
|
||||
function wss_dist= comp_wss(cleanFile, enhancedFile)
% ----------------------------------------------------------------------
%
%     Weighted Spectral Slope (WSS) Objective Speech Quality Measure
%
%     This function implements the Weighted Spectral Slope (WSS)
%     distance measure originally proposed in [1].  The algorithm
%     works by first decomposing the speech signal into a set of
%     frequency bands (this is done for both the test and reference
%     frame).  The intensities within each critical band are
%     measured.  Then, a weighted distance between the measured
%     slopes of the log-critical band spectra is computed.
%     This measure is also described in Section 2.2.9 (pages 56-58)
%     of [2].
%
%     Whereas Klatt's original measure used 36 critical-band
%     filters to estimate the smoothed short-time spectrum, this
%     implementation considers a bank of 25 filters spanning
%     the 4 kHz bandwidth.
%
%   Usage:  wss_dist=comp_wss(cleanFile.wav, enhancedFile.wav)
%
%         cleanFile.wav - clean input file in .wav format
%         enhancedFile  - enhanced output file in .wav format
%         wss_dist      - computed spectral slope distance: mean of the
%                         lowest 95% of the per-frame distances
%
%   The two files must have the same sampling rate and bit depth; the
%   longer one is truncated to the length of the shorter one.
%
%   Example call:  ws =comp_wss('sp04.wav','enhanced.wav')
%
%   References:
%
%     [1] D. H. Klatt, "Prediction of Perceived Phonetic Distance
%         from Critical-Band Spectra: A First Step", Proc. IEEE
%         ICASSP'82, Volume 2, pp. 1278-1281, May, 1982.
%
%     [2] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements,
%         Objective Measures of Speech Quality.  Prentice Hall
%         Advanced Reference Series, Englewood Cliffs, NJ, 1988,
%         ISBN: 0-13-629056-6.
%
%   Authors: Bryan L. Pellom and John H. L. Hansen (July 1998)
%   Modified by: Philipos C. Loizou  (Oct 2006)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
%
% ----------------------------------------------------------------------
if nargin~=2
    fprintf('USAGE: WSS=comp_wss(cleanFile.wav, enhancedFile.wav)\n');
    fprintf('For more help, type: help comp_wss\n\n');
    return;
end

alpha= 0.95;    % fraction of frames (lowest distances) kept in the mean

% NOTE(review): wavread is not available in recent MATLAB releases
% (audioread/audioinfo replace it) - confirm the target version.
[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhancedFile);

% Scalar comparison, so use the short-circuit operator.  The message has
% no trailing "\n": error() called with a message only prints escape
% sequences literally.
if (Srate1 ~= Srate2) || (Nbits1 ~= Nbits2)
    error('comp_wss:fileMismatch', 'The two files do not match!');
end

% Compare only the common part; eps is added to every sample so that no
% sample is exactly zero.
len   = min(length(data1), length(data2));
data1 = data1(1:len) + eps;
data2 = data2(1:len) + eps;

% Per-frame distances, sorted ascending so the largest 5% can be dropped.
wss_dist_vec = sort(wss(data1, data2, Srate1));
num_kept     = round(length(wss_dist_vec)*alpha);
wss_dist     = mean(wss_dist_vec(1:num_kept));
|
||||
|
||||
|
||||
|
||||
function distortion = wss(clean_speech, processed_speech,sample_rate)
% WSS  Frame-by-frame Weighted Spectral Slope distance.
%
%   distortion = wss(clean_speech, processed_speech, sample_rate)
%
%   clean_speech     - reference signal; must be a column vector because
%                      each frame is multiplied element-wise by a column
%                      window
%   processed_speech - signal under test, same length as clean_speech
%   sample_rate      - sampling frequency in Hz
%
%   distortion       - 1-by-num_frames row vector, one weighted slope
%                      distance per frame, normalised by the sum of the
%                      weights.  Empty when the signal is shorter than
%                      one window plus one hop.
%
%   Raises an error when the two signals differ in length.

% ----------------------------------------------------------------------
% Check the length of the clean and processed speech. Must be the same.
% ----------------------------------------------------------------------

clean_length     = length(clean_speech);
processed_length = length(processed_speech);

if (clean_length ~= processed_length)
    % Fail with the real reason instead of returning with the output
    % argument unassigned.
    error('wss:lengthMismatch', 'Files must have same length.');
end

% ----------------------------------------------------------------------
% Analysis parameters
% ----------------------------------------------------------------------

winlength = round(30*sample_rate/1000);   % 30 ms window, in samples
skiprate  = floor(winlength/4);           % hop size (quarter window)
max_freq  = sample_rate/2;                % maximum bandwidth
num_crit  = 25;                           % number of critical bands

USE_FFT_SPECTRUM = 1;     % 1: FFT power spectrum, 0: 10th order LP spectrum
n_fft     = 2^nextpow2(2*winlength);
n_fftby2  = n_fft/2;                      % FFT size/2
Kmax      = 20;                           % value suggested by Klatt, pg 1280
Klocmax   = 1;                            % value suggested by Klatt, pg 1280

% ----------------------------------------------------------------------
% Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz)
% ----------------------------------------------------------------------

cent_freq = [ 50.0000 120.000 190.000 260.000 330.000 ...
              400.000 470.000 540.000 617.372 703.378 ...
              798.717 904.128 1020.38 1148.30 1288.72 ...
              1442.54 1610.70 1794.16 1993.93 2211.08 ...
              2446.71 2701.97 2978.04 3276.17 3597.63 ];

bandwidth = [ 70.0000 70.0000 70.0000 70.0000 70.0000 ...
              70.0000 70.0000 77.3724 86.0056 95.3398 ...
              105.411 116.256 127.914 140.423 153.823 ...
              168.154 183.457 199.776 217.153 235.631 ...
              255.255 276.072 298.126 321.465 346.136 ];

bw_min = bandwidth(1);                    % minimum critical bandwidth

% ----------------------------------------------------------------------
% Set up the critical band filters.  Gaussian-shaped filters are used,
% each scaled by bw_min/bandwidth(i).  Filter values at or below the
% -30 dB threshold are set to zero.
% ----------------------------------------------------------------------

min_factor  = exp(-30.0 / (2.0 * 2.303)); % -30 dB point of filter
bin_index   = 0:1:n_fftby2-1;             % FFT bin numbers 0 .. n_fft/2-1
crit_filter = zeros(num_crit, n_fftby2);

for i = 1:num_crit
    f0 = (cent_freq(i) / max_freq) * n_fftby2;    % centre, in bins
    bw = (bandwidth(i) / max_freq) * n_fftby2;    % width, in bins
    norm_factor = log(bw_min) - log(bandwidth(i));
    response = exp(-11 *(((bin_index - floor(f0)) ./bw).^2) + norm_factor);
    crit_filter(i,:) = response.*(response > min_factor);
end

% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Weighted Spectral
% Slope Measure
% ----------------------------------------------------------------------

% Integer frame count computed from integer quantities, so it does not
% depend on how a fractional colon bound is rounded.
num_frames = floor((clean_length - winlength)/skiprate);
distortion = zeros(1, max(num_frames, 0));

% Hanning-shaped window (column vector)
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));

start = 1;                                % first sample of current frame
for frame_count = 1:num_frames

    % ----------------------------------------------------------
    % (1) Extract and window the reference and test frames.
    % ----------------------------------------------------------

    clean_frame     = clean_speech(start:start+winlength-1).*window;
    processed_frame = processed_speech(start:start+winlength-1).*window;

    % ----------------------------------------------------------
    % (2) Compute the Power Spectrum of Clean and Processed
    %     (column vectors of length n_fft)
    % ----------------------------------------------------------

    if (USE_FFT_SPECTRUM)
        clean_spec     = (abs(fft(clean_frame,n_fft)).^2);
        processed_spec = (abs(fft(processed_frame,n_fft)).^2);
    else
        % LP spectrum = element-wise reciprocal of |A(f)|^2.  This needs
        % "./"; a plain "/" is matrix right division and does not give
        % the reciprocal of each bin.
        a_vec = zeros(1,n_fft);
        a_vec(1:11) = lpc(clean_frame,10);
        clean_spec = 1.0./(abs(fft(a_vec,n_fft)).^2)';

        a_vec = zeros(1,n_fft);
        a_vec(1:11) = lpc(processed_frame,10);
        processed_spec = 1.0./(abs(fft(a_vec,n_fft)).^2)';
    end

    % ----------------------------------------------------------
    % (3) Compute Filterbank Output Energies (in dB scale)
    % ----------------------------------------------------------

    clean_energy     = zeros(1, num_crit);
    processed_energy = zeros(1, num_crit);
    for i = 1:num_crit
        clean_energy(i)     = sum(clean_spec(1:n_fftby2) ...
                                  .*crit_filter(i,:)');
        processed_energy(i) = sum(processed_spec(1:n_fftby2) ...
                                  .*crit_filter(i,:)');
    end
    % floor at 1E-10 so the logarithm stays finite
    clean_energy     = 10*log10(max(clean_energy,1E-10));
    processed_energy = 10*log10(max(processed_energy,1E-10));

    % ----------------------------------------------------------
    % (4) Compute Spectral Slope (dB[i+1]-dB[i])
    % ----------------------------------------------------------

    clean_slope     = clean_energy(2:num_crit) - ...
                      clean_energy(1:num_crit-1);
    processed_slope = processed_energy(2:num_crit) - ...
                      processed_energy(1:num_crit-1);

    % ----------------------------------------------------------
    % (5) Find the nearest peak locations in the spectra to
    %     each critical band.
    % ----------------------------------------------------------

    clean_loc_peak     = wss_loc_peaks(clean_energy, clean_slope, num_crit);
    processed_loc_peak = wss_loc_peaks(processed_energy, processed_slope, num_crit);

    % ----------------------------------------------------------
    % (6) Compute the WSS Measure for this frame.  This
    %     includes determination of the weighting function.
    % ----------------------------------------------------------

    dBMax_clean     = max(clean_energy);
    dBMax_processed = max(processed_energy);

    % The weights are calculated by averaging individual
    % weighting factors from the clean and processed frame.
    % These weights W_clean and W_processed should range
    % from 0 to 1 and place more emphasis on spectral
    % peaks and less emphasis on slope differences in spectral
    % valleys.  This procedure is described on page 1280 of
    % Klatt's 1982 ICASSP paper.

    Wmax_clean    = Kmax ./ (Kmax + dBMax_clean - ...
                             clean_energy(1:num_crit-1));
    Wlocmax_clean = Klocmax ./ (Klocmax + clean_loc_peak - ...
                                clean_energy(1:num_crit-1));
    W_clean       = Wmax_clean .* Wlocmax_clean;

    Wmax_processed    = Kmax ./ (Kmax + dBMax_processed - ...
                                 processed_energy(1:num_crit-1));
    Wlocmax_processed = Klocmax ./ (Klocmax + processed_loc_peak - ...
                                    processed_energy(1:num_crit-1));
    W_processed       = Wmax_processed .* Wlocmax_processed;

    W = (W_clean + W_processed)./2.0;

    weighted_sum = sum(W.*(clean_slope(1:num_crit-1) - ...
                           processed_slope(1:num_crit-1)).^2);

    % This normalization is not part of Klatt's paper, but helps
    % to normalize the measure: scale by the sum of the weights.
    distortion(frame_count) = weighted_sum/sum(W);

    start = start + skiprate;
end


function loc_peak = wss_loc_peaks(energy, slope, num_crit)
% WSS_LOC_PEAKS  Energy of the local peak associated with each band.
%   For each band i = 1..num_crit-1: when slope(i) is positive, step to
%   the right while the slope stays positive; otherwise step to the left
%   while the slope stays non-positive.  The energy next to the stopping
%   index is recorded.  "&&" guarantees the index test is evaluated
%   before slope(n) is read, so slope is never indexed out of range.

loc_peak = zeros(1, num_crit-1);
for i = 1:num_crit-1
    n = i;
    if (slope(i) > 0)                         % search to the right
        while ((n < num_crit) && (slope(n) > 0))
            n = n+1;
        end
        loc_peak(i) = energy(n-1);
    else                                      % search to the left
        while ((n > 0) && (slope(n) <= 0))
            n = n-1;
        end
        loc_peak(i) = energy(n+1);
    end
end
|
||||
|
||||
@@ -0,0 +1,496 @@
|
||||
function [Csig,Cbak,Covl]= composite(cleanFile, enhancedFile)
% ----------------------------------------------------------------------
%          Composite Objective Speech Quality Measure
%
%   This function implements the composite objective measure proposed in
%   [1].
%
%   Usage:  [sig,bak,ovl]=composite(cleanFile.wav, enhancedFile.wav)
%
%         cleanFile.wav - clean input file in .wav format
%         enhancedFile  - enhanced output file in .wav format
%         sig           - predicted rating [1-5] of speech distortion
%         bak           - predicted rating [1-5] of noise distortion
%         ovl           - predicted rating [1-5] of overall quality
%
%   In addition to returning the above ratings (sig, bak, & ovl) it
%   prints the individual values of the LLR, SNRseg, WSS and PESQ
%   measures to the command window.
%
%   The two files must have the same sampling rate and bit depth; the
%   longer one is truncated to the length of the shorter one.
%
%   Example call:  [sig,bak,ovl] =composite('sp04.wav','enhanced.wav')
%
%
%   References:
%
%     [1] Hu, Y. and Loizou, P. (2006). Evaluation of objective measures
%         for speech enhancement. Proc. Interspeech, Pittsburg, PA.
%
%   Authors: Yi Hu and Philipos C. Loizou
%   (the LLR, SNRseg and WSS measures were based on Bryan Pellom and John
%     Hansen's implementations)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $

% ----------------------------------------------------------------------

if nargin~=2
    fprintf('USAGE: [sig,bak,ovl]=composite(cleanFile.wav, enhancedFile.wav)\n');
    fprintf('For more help, type: help composite\n\n');
    return;
end

alpha= 0.95;    % fraction of frames (lowest distances) kept in the means

% NOTE(review): wavread is not available in recent MATLAB releases
% (audioread/audioinfo replace it) - confirm the target version.
[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhancedFile);

% Scalar comparison, so use the short-circuit operator.  The message has
% no trailing "\n": error() called with a message only prints escape
% sequences literally.
if (Srate1 ~= Srate2) || (Nbits1 ~= Nbits2)
    error('composite:fileMismatch', 'The two files do not match!');
end

% Compare only the common part; eps is added to every sample so that no
% sample is exactly zero.
len   = min(length(data1), length(data2));
data1 = data1(1:len) + eps;
data2 = data2(1:len) + eps;

% -- compute the WSS measure ---
% mean of the lowest alpha fraction of the per-frame distances
wss_dist_vec = sort(wss(data1, data2, Srate1));
wss_dist     = mean(wss_dist_vec(1:round(length(wss_dist_vec)*alpha)));

% --- compute the LLR measure ---------
% mean of the lowest alpha fraction of the per-frame values
LLR_dist = llr(data1, data2, Srate1);
LLRs     = sort(LLR_dist);
LLR_len  = round(length(LLR_dist)*alpha);
llr_mean = mean(LLRs(1:LLR_len));

% --- compute the SNRseg ----------------
% only the segmental values are used; the overall SNR is discarded
[snr_dist, segsnr_dist] = snr(data1, data2, Srate1);
segSNR = mean(segsnr_dist);

% -- compute the pesq ----
% pesq works from the files themselves, not from the truncated data
[pesq_mos]= pesq(cleanFile, enhancedFile);

% --- now compute the composite measures ------------------
% linear combinations of the four measures, each limited to [1, 5]
Csig = 3.093 - 1.029*llr_mean + 0.603*pesq_mos-0.009*wss_dist;
Csig = min(5, max(1, Csig));
Cbak = 1.634 + 0.478 *pesq_mos - 0.007*wss_dist + 0.063*segSNR;
Cbak = min(5, max(1, Cbak));
Covl = 1.594 + 0.805*pesq_mos - 0.512*llr_mean - 0.007*wss_dist;
Covl = min(5, max(1, Covl));

fprintf('\n LLR=%f   SNRseg=%f   WSS=%f   PESQ=%f\n',llr_mean,segSNR,wss_dist,pesq_mos);

return; %=================================================================
|
||||
|
||||
|
||||
function distortion = wss(clean_speech, processed_speech,sample_rate)
% WSS  Frame-by-frame Weighted Spectral Slope distance.
%
%   distortion = wss(clean_speech, processed_speech, sample_rate)
%
%   clean_speech     - reference signal; must be a column vector because
%                      each frame is multiplied element-wise by a column
%                      window
%   processed_speech - signal under test, same length as clean_speech
%   sample_rate      - sampling frequency in Hz
%
%   distortion       - 1-by-num_frames row vector, one weighted slope
%                      distance per frame, normalised by the sum of the
%                      weights.  Empty when the signal is shorter than
%                      one window plus one hop.
%
%   Raises an error when the two signals differ in length.

% ----------------------------------------------------------------------
% Check the length of the clean and processed speech. Must be the same.
% ----------------------------------------------------------------------

clean_length     = length(clean_speech);
processed_length = length(processed_speech);

if (clean_length ~= processed_length)
    % Fail with the real reason instead of returning with the output
    % argument unassigned.
    error('wss:lengthMismatch', 'Files must have same length.');
end

% ----------------------------------------------------------------------
% Analysis parameters
% ----------------------------------------------------------------------

winlength = round(30*sample_rate/1000);   % 30 ms window, in samples
skiprate  = floor(winlength/4);           % hop size (quarter window)
max_freq  = sample_rate/2;                % maximum bandwidth
num_crit  = 25;                           % number of critical bands

USE_FFT_SPECTRUM = 1;     % 1: FFT power spectrum, 0: 10th order LP spectrum
n_fft     = 2^nextpow2(2*winlength);
n_fftby2  = n_fft/2;                      % FFT size/2
Kmax      = 20;                           % value suggested by Klatt, pg 1280
Klocmax   = 1;                            % value suggested by Klatt, pg 1280

% ----------------------------------------------------------------------
% Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz)
% ----------------------------------------------------------------------

cent_freq = [ 50.0000 120.000 190.000 260.000 330.000 ...
              400.000 470.000 540.000 617.372 703.378 ...
              798.717 904.128 1020.38 1148.30 1288.72 ...
              1442.54 1610.70 1794.16 1993.93 2211.08 ...
              2446.71 2701.97 2978.04 3276.17 3597.63 ];

bandwidth = [ 70.0000 70.0000 70.0000 70.0000 70.0000 ...
              70.0000 70.0000 77.3724 86.0056 95.3398 ...
              105.411 116.256 127.914 140.423 153.823 ...
              168.154 183.457 199.776 217.153 235.631 ...
              255.255 276.072 298.126 321.465 346.136 ];

bw_min = bandwidth(1);                    % minimum critical bandwidth

% ----------------------------------------------------------------------
% Set up the critical band filters.  Gaussian-shaped filters are used,
% each scaled by bw_min/bandwidth(i).  Filter values at or below the
% -30 dB threshold are set to zero.
% ----------------------------------------------------------------------

min_factor  = exp(-30.0 / (2.0 * 2.303)); % -30 dB point of filter
bin_index   = 0:1:n_fftby2-1;             % FFT bin numbers 0 .. n_fft/2-1
crit_filter = zeros(num_crit, n_fftby2);

for i = 1:num_crit
    f0 = (cent_freq(i) / max_freq) * n_fftby2;    % centre, in bins
    bw = (bandwidth(i) / max_freq) * n_fftby2;    % width, in bins
    norm_factor = log(bw_min) - log(bandwidth(i));
    response = exp(-11 *(((bin_index - floor(f0)) ./bw).^2) + norm_factor);
    crit_filter(i,:) = response.*(response > min_factor);
end

% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Weighted Spectral
% Slope Measure
% ----------------------------------------------------------------------

% Integer frame count computed from integer quantities, so it does not
% depend on how a fractional colon bound is rounded.
num_frames = floor((clean_length - winlength)/skiprate);
distortion = zeros(1, max(num_frames, 0));

% Hanning-shaped window (column vector)
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));

start = 1;                                % first sample of current frame
for frame_count = 1:num_frames

    % ----------------------------------------------------------
    % (1) Extract and window the reference and test frames.
    % ----------------------------------------------------------

    clean_frame     = clean_speech(start:start+winlength-1).*window;
    processed_frame = processed_speech(start:start+winlength-1).*window;

    % ----------------------------------------------------------
    % (2) Compute the Power Spectrum of Clean and Processed
    %     (column vectors of length n_fft)
    % ----------------------------------------------------------

    if (USE_FFT_SPECTRUM)
        clean_spec     = (abs(fft(clean_frame,n_fft)).^2);
        processed_spec = (abs(fft(processed_frame,n_fft)).^2);
    else
        % LP spectrum = element-wise reciprocal of |A(f)|^2.  This needs
        % "./"; a plain "/" is matrix right division and does not give
        % the reciprocal of each bin.
        a_vec = zeros(1,n_fft);
        a_vec(1:11) = lpc(clean_frame,10);
        clean_spec = 1.0./(abs(fft(a_vec,n_fft)).^2)';

        a_vec = zeros(1,n_fft);
        a_vec(1:11) = lpc(processed_frame,10);
        processed_spec = 1.0./(abs(fft(a_vec,n_fft)).^2)';
    end

    % ----------------------------------------------------------
    % (3) Compute Filterbank Output Energies (in dB scale)
    % ----------------------------------------------------------

    clean_energy     = zeros(1, num_crit);
    processed_energy = zeros(1, num_crit);
    for i = 1:num_crit
        clean_energy(i)     = sum(clean_spec(1:n_fftby2) ...
                                  .*crit_filter(i,:)');
        processed_energy(i) = sum(processed_spec(1:n_fftby2) ...
                                  .*crit_filter(i,:)');
    end
    % floor at 1E-10 so the logarithm stays finite
    clean_energy     = 10*log10(max(clean_energy,1E-10));
    processed_energy = 10*log10(max(processed_energy,1E-10));

    % ----------------------------------------------------------
    % (4) Compute Spectral Slope (dB[i+1]-dB[i])
    % ----------------------------------------------------------

    clean_slope     = clean_energy(2:num_crit) - ...
                      clean_energy(1:num_crit-1);
    processed_slope = processed_energy(2:num_crit) - ...
                      processed_energy(1:num_crit-1);

    % ----------------------------------------------------------
    % (5) Find the nearest peak locations in the spectra to
    %     each critical band.
    % ----------------------------------------------------------

    clean_loc_peak     = wss_loc_peaks(clean_energy, clean_slope, num_crit);
    processed_loc_peak = wss_loc_peaks(processed_energy, processed_slope, num_crit);

    % ----------------------------------------------------------
    % (6) Compute the WSS Measure for this frame.  This
    %     includes determination of the weighting function.
    % ----------------------------------------------------------

    dBMax_clean     = max(clean_energy);
    dBMax_processed = max(processed_energy);

    % The weights are calculated by averaging individual
    % weighting factors from the clean and processed frame.
    % These weights W_clean and W_processed should range
    % from 0 to 1 and place more emphasis on spectral
    % peaks and less emphasis on slope differences in spectral
    % valleys.  This procedure is described on page 1280 of
    % Klatt's 1982 ICASSP paper.

    Wmax_clean    = Kmax ./ (Kmax + dBMax_clean - ...
                             clean_energy(1:num_crit-1));
    Wlocmax_clean = Klocmax ./ (Klocmax + clean_loc_peak - ...
                                clean_energy(1:num_crit-1));
    W_clean       = Wmax_clean .* Wlocmax_clean;

    Wmax_processed    = Kmax ./ (Kmax + dBMax_processed - ...
                                 processed_energy(1:num_crit-1));
    Wlocmax_processed = Klocmax ./ (Klocmax + processed_loc_peak - ...
                                    processed_energy(1:num_crit-1));
    W_processed       = Wmax_processed .* Wlocmax_processed;

    W = (W_clean + W_processed)./2.0;

    weighted_sum = sum(W.*(clean_slope(1:num_crit-1) - ...
                           processed_slope(1:num_crit-1)).^2);

    % This normalization is not part of Klatt's paper, but helps
    % to normalize the measure: scale by the sum of the weights.
    distortion(frame_count) = weighted_sum/sum(W);

    start = start + skiprate;
end


function loc_peak = wss_loc_peaks(energy, slope, num_crit)
% WSS_LOC_PEAKS  Energy of the local peak associated with each band.
%   For each band i = 1..num_crit-1: when slope(i) is positive, step to
%   the right while the slope stays positive; otherwise step to the left
%   while the slope stays non-positive.  The energy next to the stopping
%   index is recorded.  "&&" guarantees the index test is evaluated
%   before slope(n) is read, so slope is never indexed out of range.

loc_peak = zeros(1, num_crit-1);
for i = 1:num_crit-1
    n = i;
    if (slope(i) > 0)                         % search to the right
        while ((n < num_crit) && (slope(n) > 0))
            n = n+1;
        end
        loc_peak(i) = energy(n-1);
    else                                      % search to the left
        while ((n > 0) && (slope(n) <= 0))
            n = n-1;
        end
        loc_peak(i) = energy(n+1);
    end
end
|
||||
|
||||
%-----------------------------------------------
|
||||
function distortion = llr(clean_speech, processed_speech,sample_rate)
% LLR  Per-frame Log Likelihood Ratio between a clean and a processed signal.
%
%   distortion = llr(clean_speech, processed_speech, sample_rate)
%
%   Inputs:
%     clean_speech     - reference signal (row or column vector)
%     processed_speech - signal under test, same length as clean_speech
%     sample_rate      - sampling frequency in Hz
%
%   Output:
%     distortion       - row vector with one LLR value per analysis frame
%                        (empty when the signal is shorter than one frame)
%
%   Frames are 30 ms long, advance by a quarter of a frame and are weighted
%   by a Hanning window before LPC analysis (see lpcoeff).  The LPC order is
%   10 for sampling rates below 10 kHz and 16 otherwise.

% ----------------------------------------------------------------------
% Check the length of the clean and processed speech.  Must be the same.
% ----------------------------------------------------------------------

clean_length     = length(clean_speech);
processed_length = length(processed_speech);

if (clean_length ~= processed_length)
    % The previous disp+return left the output unassigned, which surfaced
    % as an unrelated "output argument not assigned" error in the caller.
    error('Error: Both Speech Files must be same length.');
end

% The analysis window below is a column vector.  Force both signals to
% columns so the element-wise products stay vectors whatever orientation
% the caller passed in.
clean_speech     = clean_speech(:);
processed_speech = processed_speech(:);

% ----------------------------------------------------------------------
% Analysis parameters
% ----------------------------------------------------------------------

winlength = round(30*sample_rate/1000);   % window length in samples
skiprate  = floor(winlength/4);           % window skip in samples
if sample_rate<10000
    P = 10;                               % LPC analysis order
else
    P = 16;                               % this could vary depending on sampling frequency.
end

% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Log Likelihood Ratio
% ----------------------------------------------------------------------

% Whole number of frames only; never negative so preallocation is valid.
num_frames = max(floor(clean_length/skiprate-(winlength/skiprate)), 0);
start      = 1;                           % starting sample
window     = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));
distortion = zeros(1, num_frames);

for frame_count = 1:num_frames

    % (1) Get the frames for the test and reference speech and apply the
    %     Hanning window.
    frame_idx       = start:start+winlength-1;
    clean_frame     = clean_speech(frame_idx).*window;
    processed_frame = processed_speech(frame_idx).*window;

    % (2) Autocorrelation lags and LPC parameters of both frames.
    [R_clean, Ref_clean, A_clean] = ...
        lpcoeff(clean_frame, P);
    [R_processed, Ref_processed, A_processed] = ...
        lpcoeff(processed_frame, P);

    % (3) LLR: both quadratic forms use the autocorrelation matrix of the
    %     clean frame, so build it once.
    R_matrix    = toeplitz(R_clean);
    numerator   = A_processed*R_matrix*A_processed';
    denominator = A_clean*R_matrix*A_clean';
    distortion(frame_count) = log(numerator/denominator);

    start = start + skiprate;

end
|
||||
|
||||
%---------------------------------------------
|
||||
function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order)
% LPCOEFF  Autocorrelation-method linear prediction analysis of one frame.
%
%   [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order)
%
%   Inputs:
%     speech_frame - frame of samples (vector)
%     model_order  - prediction order P
%
%   Outputs:
%     acorr    - autocorrelation lags 0..P as a row vector
%     refcoeff - reflection coefficients from the Levinson-Durbin recursion
%     lpparams - [1, -a(1), ..., -a(P)] where a holds the predictor
%                coefficients

% ----------------------------------------------------------
% (1) Autocorrelation lags 0 .. model_order
% ----------------------------------------------------------

nsamp = max(size(speech_frame));
for lag = 0:model_order
    leading  = speech_frame(1:nsamp-lag);
    trailing = speech_frame(lag+1:nsamp);
    R(lag+1) = sum(leading.*trailing);
end

% ----------------------------------------------------------
% (2) Levinson-Durbin recursion
% ----------------------------------------------------------

a        = ones(1,model_order);
pred_err = R(1);                     % prediction error of the order-0 model
for m = 1:model_order
    prev = a(1:m-1);                 % coefficients of the order m-1 model
    acc  = sum(prev.*R(m:-1:2));
    k    = (R(m+1) - acc) / pred_err;

    rcoeff(m) = k;
    a(m)      = k;
    a(1:m-1)  = prev - k.*prev(m-1:-1:1);

    pred_err  = (1-k*k)*pred_err;    % error after raising the order to m
end

acorr    = R;
refcoeff = rcoeff;
lpparams = [1 -a];
|
||||
|
||||
|
||||
% ----------------------------------------------------------------------
|
||||
|
||||
function [overall_snr, segmental_snr] = snr(clean_speech, processed_speech,sample_rate)
% SNR  Overall and frame-based segmental SNR of a processed signal.
%
%   [overall_snr, segmental_snr] = snr(clean_speech, processed_speech, sample_rate)
%
%   Inputs:
%     clean_speech     - reference signal (row or column vector)
%     processed_speech - signal under test, same length as clean_speech
%     sample_rate      - sampling frequency in Hz
%
%   Outputs:
%     overall_snr   - 10*log10 of clean energy over error energy, in dB,
%                     computed over the whole signal
%     segmental_snr - row vector with one SNR value (dB) per analysis
%                     frame, limited to the range [-10, 35] dB; empty when
%                     the signal is shorter than one frame
%
%   Frames are 30 ms long, advance by a quarter of a frame and are weighted
%   by a Hanning window.

% ----------------------------------------------------------------------
% Check the length of the clean and processed speech.  Must be the same.
% ----------------------------------------------------------------------

clean_length     = length(clean_speech);
processed_length = length(processed_speech);

if (clean_length ~= processed_length)
    % The previous disp+return left both outputs unassigned, which
    % surfaced as an unrelated "output argument not assigned" error.
    error('Error: Both Speech Files must be same length.');
end

% The analysis window below is a column vector.  Force both signals to
% columns so differences and element-wise products stay vectors whatever
% orientation the caller passed in.
clean_speech     = clean_speech(:);
processed_speech = processed_speech(:);

% ----------------------------------------------------------------------
% Overall SNR.  DC removal and level matching of the two signals are
% intentionally not applied here (they were disabled in the original).
% ----------------------------------------------------------------------

overall_snr = 10* log10( sum(clean_speech.^2)/sum((clean_speech-processed_speech).^2));

% ----------------------------------------------------------------------
% Analysis parameters
% ----------------------------------------------------------------------

winlength = round(30*sample_rate/1000);   % window length in samples
skiprate  = floor(winlength/4);           % window skip in samples
MIN_SNR   = -10;                          % minimum SNR in dB
MAX_SNR   = 35;                           % maximum SNR in dB

% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Segmental SNR
% ----------------------------------------------------------------------

% Whole number of frames only; never negative so preallocation is valid.
num_frames    = max(floor(clean_length/skiprate-(winlength/skiprate)), 0);
start         = 1;                        % starting sample
window        = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));
segmental_snr = zeros(1, num_frames);

for frame_count = 1: num_frames

    % (1) Get the frames for the test and reference speech and apply the
    %     Hanning window.
    frame_idx       = start:start+winlength-1;
    clean_frame     = clean_speech(frame_idx).*window;
    processed_frame = processed_speech(frame_idx).*window;

    % (2) Segmental SNR of this frame; eps keeps the ratio and the
    %     logarithm finite for silent or identical frames.
    signal_energy = sum(clean_frame.^2);
    noise_energy  = sum((clean_frame-processed_frame).^2);
    frame_snr     = 10*log10(signal_energy/(noise_energy+eps)+eps);

    % Limit to the [MIN_SNR, MAX_SNR] range.
    frame_snr = max(frame_snr,MIN_SNR);
    frame_snr = min(frame_snr,MAX_SNR);
    segmental_snr(frame_count) = frame_snr;

    start = start + skiprate;

end
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,84 @@
|
||||
function crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, ...
    deg_Nsamples, Utt_id)
% CRUDE_ALIGN  Coarse delay estimate from the cross-correlation of the
% reference and degraded log-VAD envelopes.
%
%   crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, deg_Nsamples, Utt_id)
%
%   Inputs:
%     ref_logVAD, deg_logVAD     - VAD envelopes, one value per Downsample
%                                  samples
%     ref_Nsamples, deg_Nsamples - signal lengths in samples
%     Utt_id                     - WHOLE_SIGNAL, MAXNUTTERANCES, or the
%                                  index of an utterance search window
%
%   Results are written to globals:
%     Utt_id == WHOLE_SIGNAL   -> Crude_DelayEst, Crude_DelayConf
%     Utt_id == MAXNUTTERANCES -> Utt_Delay(MAXNUTTERANCES)
%     otherwise                -> Utt_DelayEst(Utt_id)
%
%   All start positions are 1-based envelope indices.

global Downsample
global Nutterances Largest_uttsize Nsurf_samples Crude_DelayEst
global Crude_DelayConf UttSearch_Start UttSearch_End Utt_DelayEst
global Utt_Delay Utt_DelayConf Utt_Start Utt_End
global MAXNUTTERANCES WHOLE_SIGNAL
global pesq_mos subj_mos cond_nr

if (Utt_id== WHOLE_SIGNAL )
    % Correlate the complete envelopes.
    nr = floor( ref_Nsamples/ Downsample);
    nd = floor( deg_Nsamples/ Downsample);
    startr= 1;
    startd= 1;
elseif Utt_id== MAXNUTTERANCES
    startr= UttSearch_Start(MAXNUTTERANCES);
    startd= startr+ Utt_DelayEst(MAXNUTTERANCES)/ Downsample;
    % Indices are 1-based, so 0 is already out of range.  The former
    % "< 0" test let startd == 0 through, which then indexed deg_logVAD(0).
    if ( startd< 1 )
        startr= 1- Utt_DelayEst(MAXNUTTERANCES)/ Downsample;
        startd= 1;
    end

    nr= UttSearch_End(MAXNUTTERANCES)- startr;
    nd= nr;

    % NOTE(review): this limit differs by one from the generic branch
    % below; kept as found - confirm which one is intended.
    if( startd+ nd> floor( deg_Nsamples/ Downsample) )
        nd= floor( deg_Nsamples/ Downsample)- startd;
    end
    % fprintf( 'nr,nd is %d,%d\n', nr, nd);

else
    startr= UttSearch_Start(Utt_id);
    startd= startr+ Crude_DelayEst/ Downsample;

    % Same 1-based guard as above (was "< 0").
    if ( startd< 1 )
        startr= 1- Crude_DelayEst/ Downsample;
        startd= 1;
    end

    nr= UttSearch_End(Utt_id)- startr;
    nd = nr;
    if( startd+ nd> floor( deg_Nsamples/ Downsample)+ 1)
        nd = floor( deg_Nsamples/ Downsample)- startd+ 1;
    end
end

% Defaults correspond to "no additional delay": the peak index equals nr.
max_Y= 0.0;
I_max_Y= nr;
if( (nr> 1) && (nd> 1) )
    Y= FFTNXCorr( ref_logVAD, startr, nr, deg_logVAD, startd, nd);
    [max_Y, I_max_Y]= max( Y);
    if (max_Y<= 0)
        % No positive correlation peak: fall back to the defaults.
        max_Y= 0;
        I_max_Y= nr;
    end
end

% fprintf( 'max_Y, I_max_Y is %f, %d\n', max_Y, I_max_Y);

% Convert the peak offset (in envelope steps) to samples and store it.
if( Utt_id== WHOLE_SIGNAL )
    Crude_DelayEst= (I_max_Y- nr)* Downsample;
    Crude_DelayConf= 0.0;
    % fprintf( 1, 'I_max_Y, nr, Crude_DelayEst is %f, %f, %f\n', ...
    %     I_max_Y, nr, Crude_DelayEst);
elseif( Utt_id == MAXNUTTERANCES )
    Utt_Delay(MAXNUTTERANCES)= (I_max_Y- nr)* Downsample+ ...
        Utt_DelayEst(MAXNUTTERANCES);
    % fprintf( 'startr, startd, nr, nd, I_max, Utt_Delay[%d] is %d, %d, %d, %d, %d, %d\n', ...
    %     MAXNUTTERANCES, startr, startd, nr, nd, ...
    %     I_max_Y, Utt_Delay(MAXNUTTERANCES) );
else
    % fprintf( 'I_max_Y, nr is %d, %d\n', I_max_Y, nr);
    Utt_DelayEst(Utt_id)= (I_max_Y- nr)* Downsample+ ...
        Crude_DelayEst;
end
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,21 @@
|
||||
function mod_data= fix_power_level( data, data_Nsamples, maxNsamples)
% FIX_POWER_LEVEL  Level normalisation: scale data so that the power
% measured on a band-limited copy equals the preset TARGET_AVG_POWER.
%
%   mod_data= fix_power_level( data, data_Nsamples, maxNsamples)
%
%   data          - padded signal
%   data_Nsamples - sample count of this signal as tracked by the caller
%   maxNsamples   - larger of the reference/degraded sample counts, used
%                   in the third argument handed to pow_of
%   mod_data      - data multiplied by a single global gain
%
%   Side effect: sets the global TARGET_AVG_POWER to 1e7.

global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs
global TARGET_AVG_POWER
TARGET_AVG_POWER= 1e7;

% Filter table, one row per [frequency in Hz, gain in dB]:
% 0 dB from 350 Hz to 3250 Hz, -500 dB everywhere else.
band_hz = [0, 50, 100, 125, 160, 200, 250, 300, ...
    350, 400, 500, 600, 630, 800, 1000, 1250, 1600, 2000, 2500, 3000, 3250, ...
    3500, 4000, 5000, 6300, 8000];
band_dB = [-500* ones( 1, 8), zeros( 1, 13), -500* ones( 1, 5)];
align_filter_dB= [band_hz(:), band_dB(:)];

align_filtered= apply_filter( data, data_Nsamples, align_filter_dB);

% Range handed to pow_of: it starts after the leading search buffer; the
% trailing data padding is added to the stop position and to the count.
guard_len = SEARCHBUFFER* Downsample;
pad_len   = DATAPADDING_MSECS* (Fs/ 1000);
power_above_300Hz = pow_of( align_filtered, guard_len+ 1, ...
    data_Nsamples- guard_len+ pad_len, ...
    maxNsamples- 2* guard_len+ pad_len);

global_scale= sqrt( TARGET_AVG_POWER/ power_above_300Hz);
% fprintf( 1, '\tglobal_scale is %f\n', global_scale);
mod_data= data* global_scale;
|
||||
@@ -0,0 +1,68 @@
|
||||
function id_searchwindows( ref_VAD, ref_Nsamples, deg_VAD, deg_Nsamples)
% ID_SEARCHWINDOWS  Find utterance search windows in the reference VAD.
%
%   id_searchwindows( ref_VAD, ref_Nsamples, deg_VAD, deg_Nsamples)
%
%   Walks ref_VAD (one value per Downsample samples) and records, for
%   every stretch of active speech, a window widened by SEARCHBUFFER on
%   both sides.  A window is kept only when the stretch lasts at least
%   MINUTTLENGTH steps and passes the two delay-dependent limits below.
%   deg_VAD is accepted but not used.
%
%   Results are written to the globals UttSearch_Start, UttSearch_End
%   and Nutterances.

global MINUTTLENGTH Downsample SEARCHBUFFER
global Crude_DelayEst Nutterances UttSearch_Start UttSearch_End

n_steps   = floor( ref_Nsamples/ Downsample);
last_slot = n_steps- 1;

% Limits derived from the crude delay estimate.
earliest_end = MINUTTLENGTH- Crude_DelayEst/ Downsample;
latest_start = floor( (deg_Nsamples- Crude_DelayEst)/ Downsample)- ...
    MINUTTLENGTH;

utt       = 1;        % slot currently being filled
in_speech = false;

for k= 1: n_steps
    level= ref_VAD( k);

    % Speech onset: open a window, clamped at 0.
    if( ~in_speech && (level> 0) )
        in_speech= true;
        onset= k;
        UttSearch_Start( utt)= max( k- SEARCHBUFFER, 0);
    end

    % Speech offset (or step last_slot reached): close the window.
    if( in_speech && ((level== 0) || (k== last_slot)) )
        in_speech= false;
        UttSearch_End( utt)= min( k+ SEARCHBUFFER, last_slot);

        % Advance to the next slot only for an acceptable utterance;
        % otherwise this slot is overwritten by the next candidate.
        if( ((k- onset)>= MINUTTLENGTH) && ...
                (onset< latest_start) && ...
                (k> earliest_end) )
            utt= utt+ 1;
        end
    end
end

% utt points one past the last accepted window.
Nutterances = utt- 1;
|
||||
|
||||
% fprintf( 1, 'Nutterances is %d\n', Nutterances);
|
||||
|
||||
% fid= fopen( 'mat_utt.txt', 'wt');
|
||||
% fprintf( fid, '%d\n', UttSearch_Start( 1: Nutterances));
|
||||
% fprintf( fid, '\n');
|
||||
% fprintf( fid, '%d\n', UttSearch_End( 1: Nutterances));
|
||||
% fclose(fid);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,85 @@
|
||||
function id_utterances( ref_Nsamples, ref_VAD, deg_Nsamples)
% ID_UTTERANCES  Fix the final utterance boundaries.
%
%   id_utterances( ref_Nsamples, ref_VAD, deg_Nsamples)
%
%   Reads the globals Nutterances and Utt_Delay (per-utterance delay in
%   samples) and writes Utt_Start, Utt_End (1-based positions in steps of
%   Downsample samples) and Largest_uttsize.
%
%   Steps:
%     1. scan ref_VAD for stretches of active speech,
%     2. pin the first start and last end to the search-buffer margins
%        and place each inner boundary midway between its neighbours,
%     3. pull the outer boundaries in if, after applying the delay, they
%        fall outside the limits checked against the degraded signal,
%     4. re-split neighbouring utterances whose delayed positions overlap.

global Largest_uttsize MINUTTLENGTH Crude_DelayEst
global Downsample SEARCHBUFFER Nutterances Utt_Start
global Utt_End Utt_Delay

n_steps   = floor( ref_Nsamples / Downsample);
guard_len = SEARCHBUFFER * Downsample;
% fprintf( 1, 'VAD_length is %d\n', n_steps);

earliest_end = MINUTTLENGTH - Crude_DelayEst / Downsample;
latest_start = floor( (deg_Nsamples - Crude_DelayEst) / Downsample) ...
    - MINUTTLENGTH;

% ---- 1. raw utterance boundaries from the VAD ------------------------
utt       = 1;
in_speech = false;
for k = 1: n_steps
    level = ref_VAD( k);

    if( ~in_speech && (level > 0.0) )
        in_speech = true;
        onset = k;
        Utt_Start( utt) = k;
    end

    if( in_speech && ((level == 0) || (k == n_steps)) )
        in_speech = false;
        Utt_End( utt) = k;

        % Keep the slot only for an acceptable utterance.
        if( ((k - onset) >= MINUTTLENGTH) && ...
                (onset < latest_start) && ...
                (k > earliest_end) )
            utt = utt + 1;
        end
    end
end

% ---- 2. outer margins and midpoints between neighbours ---------------
Utt_Start( 1) = SEARCHBUFFER + 1;
Utt_End( Nutterances) = n_steps - SEARCHBUFFER + 1;

for utt = 2: Nutterances
    boundary = floor( ((Utt_Start( utt) - 1) + ...
        (Utt_End( utt - 1) - 1)) / 2) + 1;
    Utt_Start( utt) = boundary;
    Utt_End( utt - 1) = boundary;
end

% ---- 3. keep the delayed outer boundaries inside the limits ----------
first_deg = (Utt_Start( 1) - 1) * Downsample + Utt_Delay( 1);
if( first_deg < guard_len )
    Utt_Start( 1) = SEARCHBUFFER + floor( ...
        (Downsample - 1 - Utt_Delay( 1)) / Downsample) + 1;
end

final_deg = (Utt_End( Nutterances) - 1) * Downsample + 1 + ...
    Utt_Delay( Nutterances);
if( final_deg > (deg_Nsamples - guard_len + 1) )
    Utt_End( Nutterances) = floor( ...
        (deg_Nsamples - Utt_Delay( Nutterances)) / Downsample) ...
        - SEARCHBUFFER + 1;
end

% ---- 4. resolve overlaps between delayed neighbours ------------------
for utt = 2: Nutterances
    cur_begin   = (Utt_Start( utt) - 1) * Downsample + Utt_Delay( utt);
    prev_finish = (Utt_End( utt - 1) - 1) * Downsample + ...
        Utt_Delay( utt - 1);

    if( cur_begin < prev_finish )
        % Split at the middle of the overlap and map it back through
        % each utterance's own delay.
        split = floor( (cur_begin + prev_finish) / 2);
        Utt_Start( utt) = floor( (Downsample - 1 + split - ...
            Utt_Delay( utt)) / Downsample) + 1;
        Utt_End( utt - 1) = floor( (split - Utt_Delay( utt - 1)) ...
            / Downsample) + 1;
    end
end

Largest_uttsize= max( Utt_End - Utt_Start);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,9 @@
|
||||
function [mod_ref_data, mod_deg_data]= input_filter( ref_data, ref_Nsamples, ...
    deg_data, deg_Nsamples)
% INPUT_FILTER  Pre-process the reference and degraded signals.
%
%   [mod_ref_data, mod_deg_data]= input_filter( ref_data, ref_Nsamples, ...
%       deg_data, deg_Nsamples)
%
%   Each signal is passed through DC_block and then through apply_filters.
%   Both DC_block calls are made before either apply_filters call.

% Stage 1: DC_block on both signals.
ref_dc_free= DC_block( ref_data, ref_Nsamples);
deg_dc_free= DC_block( deg_data, deg_Nsamples);

% Stage 2: apply_filters on both signals.
mod_ref_data= apply_filters( ref_dc_free, ref_Nsamples);
mod_deg_data= apply_filters( deg_dc_free, deg_Nsamples);
|
||||
|
||||
@@ -0,0 +1,127 @@
|
||||
function [pesq_mos]= pesq(ref_wav, deg_wav)

% ----------------------------------------------------------------------
%          PESQ objective speech quality measure
%
%   This function implements the PESQ measure based on the ITU standard
%   P.862 [1].
%
%
%   Usage:  pval=pesq(cleanFile.wav, enhancedFile.wav)
%
%         cleanFile.wav - clean input file in .wav format
%         enhancedFile  - enhanced output file in .wav format
%         pval          - PESQ value
%
%    Note that the PESQ routine only supports sampling rates of 8 kHz and
%    16 kHz [1].  Both files must be single-channel and share the same
%    sampling rate; an error is raised otherwise.
%
%  Example call:  pval = pesq ('sp04.wav','enhanced.wav')
%
%
%  References:
%   [1] ITU (2000). Perceptual evaluation of speech quality (PESQ), and
%       objective method for end-to-end speech quality assessment of
%       narrowband telephone networks and speech codecs. ITU-T
%       Recommendation P. 862
%
%   Authors: Yi Hu and Philipos C. Loizou
%
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
% ----------------------------------------------------------------------

if nargin<2
    fprintf('Usage: [pesq_mos]=pesq(cleanfile.wav,enhanced.wav) \n');
    return;
end;

global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs WHOLE_SIGNAL
global Align_Nfft Window

% ---- read and validate both files ------------------------------------
[ref_data,sampling_rate]= audioread( ref_wav);
if sampling_rate~=8000 && sampling_rate~=16000
    error('Sampling frequency needs to be either 8000 or 16000 Hz');
end

% Everything below is configured from the reference file's rate only, so
% a degraded file at a different rate would be scored as if it had that
% rate.  Reject the mismatch instead of returning a meaningless value.
[deg_data, deg_sampling_rate]= audioread( deg_wav);
if deg_sampling_rate~= sampling_rate
    error('Reference and degraded files must have the same sampling frequency (%d Hz vs %d Hz)', ...
        sampling_rate, deg_sampling_rate);
end

% The padding below concatenates with row vectors of zeros, which only
% works for single-channel data.
if size( ref_data, 2)~= 1 || size( deg_data, 2)~= 1
    error('Reference and degraded files must be single-channel (mono)');
end

setup_global( sampling_rate);

% ---- Hanning window used by the alignment stage (global Window) ------
% Window= hann( Align_Nfft, 'periodic'); %Hanning window
% Window= Window';
TWOPI= 6.28318530717959;
%for count = 0: Align_Nfft- 1
%    Window(1+ count) = 0.5 * (1.0 - cos((TWOPI * count) / Align_Nfft));
%end

count=0:Align_Nfft- 1;
Window= 0.5 * (1.0 - cos((TWOPI * count) / Align_Nfft));

% ---- scale to 16-bit range and pad ------------------------------------
% Each signal becomes a row vector with SEARCHBUFFER* Downsample zeros in
% front and the same amount plus DATAPADDING_MSECS worth of zeros behind.
% The *_Nsamples counts include the two search buffers but not the data
% padding.
ref_data= ref_data';
ref_data= ref_data* 32768;
ref_Nsamples= length( ref_data)+ 2* SEARCHBUFFER* Downsample;
ref_data= [zeros( 1, SEARCHBUFFER* Downsample), ref_data, ...
    zeros( 1, DATAPADDING_MSECS* (Fs/ 1000)+ SEARCHBUFFER* Downsample)];

deg_data= deg_data';
deg_data= deg_data* 32768;
deg_Nsamples= length( deg_data)+ 2* SEARCHBUFFER* Downsample;
deg_data= [zeros( 1, SEARCHBUFFER* Downsample), deg_data, ...
    zeros( 1, DATAPADDING_MSECS* (Fs/ 1000)+ SEARCHBUFFER* Downsample)];

maxNsamples= max( ref_Nsamples, deg_Nsamples);

% ---- level alignment and IRS filtering --------------------------------
ref_data= fix_power_level( ref_data, ref_Nsamples, maxNsamples);
deg_data= fix_power_level( deg_data, deg_Nsamples, maxNsamples);

% Filter table, one row per [frequency in Hz, gain in dB].
standard_IRS_filter_dB= [0, -200; 50, -40; 100, -20; 125, -12; 160, -6; 200, 0;...
    250, 4; 300, 6; 350, 8; 400, 10; 500, 11; 600, 12; 700, 12; 800, 12;...
    1000, 12; 1300, 12; 1600, 12; 2000, 12; 2500, 12; 3000, 12; 3250, 12;...
    3500, 4; 4000, -200; 5000, -200; 6300, -200; 8000, -200];

ref_data= apply_filter( ref_data, ref_Nsamples, standard_IRS_filter_dB);
deg_data= apply_filter( deg_data, deg_Nsamples, standard_IRS_filter_dB);

% for later use in psychoacoustical model
model_ref= ref_data;
model_deg= deg_data;

% ---- time alignment ----------------------------------------------------
% The alignment stages work on separately filtered copies; their results
% are not returned here (crude_align stores its estimates in globals).
[ref_data, deg_data]= input_filter( ref_data, ref_Nsamples, deg_data, ...
    deg_Nsamples);

[ref_VAD, ref_logVAD]= apply_VAD( ref_data, ref_Nsamples);
[deg_VAD, deg_logVAD]= apply_VAD( deg_data, deg_Nsamples);

crude_align (ref_logVAD, ref_Nsamples, deg_logVAD, deg_Nsamples,...
    WHOLE_SIGNAL);

utterance_locate (ref_data, ref_Nsamples, ref_VAD, ref_logVAD,...
    deg_data, deg_Nsamples, deg_VAD, deg_logVAD);

% Back to the IRS-filtered signals saved before input_filter.
ref_data= model_ref;
deg_data= model_deg;

% make ref_data and deg_data equal length
% (assigning to index newlen zero-extends the shorter vector)
if (ref_Nsamples< deg_Nsamples)
    newlen= deg_Nsamples+ DATAPADDING_MSECS* (Fs/ 1000);
    ref_data( newlen)= 0;
elseif (ref_Nsamples> deg_Nsamples)
    newlen= ref_Nsamples+ DATAPADDING_MSECS* (Fs/ 1000);
    deg_data( newlen)= 0;
end

% ---- perceptual model --------------------------------------------------
pesq_mos= pesq_psychoacoustic_model (ref_data, ref_Nsamples, deg_data, ...
    deg_Nsamples );
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,920 @@
|
||||
function pesq_mos= pesq_psychoacoustic_model (ref_data, ref_Nsamples, deg_data, ...
|
||||
deg_Nsamples )
|
||||
|
||||
global CALIBRATE Nfmax Nb Sl Sp
|
||||
global nr_of_hz_bands_per_bark_band centre_of_band_bark
|
||||
global width_of_band_hz centre_of_band_hz width_of_band_bark
|
||||
global pow_dens_correction_factor abs_thresh_power
|
||||
global Downsample SEARCHBUFFER DATAPADDING_MSECS Fs Nutterances
|
||||
global Utt_Start Utt_End Utt_Delay NUMBER_OF_PSQM_FRAMES_PER_SYLLABE
|
||||
global Fs Plot_Frame
|
||||
|
||||
% Plot_Frame= 75; % this is the frame whose spectrum will be plotted
|
||||
|
||||
FALSE= 0;
|
||||
TRUE= 1;
|
||||
NUMBER_OF_PSQM_FRAMES_PER_SYLLABE= 20;
|
||||
|
||||
maxNsamples = max (ref_Nsamples, deg_Nsamples);
|
||||
Nf = Downsample * 8;
|
||||
MAX_NUMBER_OF_BAD_INTERVALS = 1000;
|
||||
|
||||
start_frame_of_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
|
||||
stop_frame_of_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
|
||||
start_sample_of_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
|
||||
stop_sample_of_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
|
||||
number_of_samples_in_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
|
||||
delay_in_samples_in_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
|
||||
number_of_bad_intervals= 0;
|
||||
there_is_a_bad_frame= FALSE;
|
||||
|
||||
Whanning= hann( Nf, 'periodic');
|
||||
Whanning= Whanning';
|
||||
|
||||
D_POW_F = 2;
|
||||
D_POW_S = 6;
|
||||
D_POW_T = 2;
|
||||
A_POW_F = 1;
|
||||
A_POW_S = 6;
|
||||
A_POW_T = 2;
|
||||
D_WEIGHT= 0.1;
|
||||
A_WEIGHT= 0.0309;
|
||||
|
||||
CRITERIUM_FOR_SILENCE_OF_5_SAMPLES = 500;
|
||||
samples_to_skip_at_start = 0;
|
||||
sum_of_5_samples= 0;
|
||||
while ((sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES) ...
|
||||
&& (samples_to_skip_at_start < maxNsamples / 2))
|
||||
sum_of_5_samples= sum( abs( ref_data( samples_to_skip_at_start...
|
||||
+ SEARCHBUFFER * Downsample + 1: samples_to_skip_at_start...
|
||||
+ SEARCHBUFFER * Downsample + 5)));
|
||||
|
||||
if (sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES)
|
||||
samples_to_skip_at_start = samples_to_skip_at_start+ 1;
|
||||
end
|
||||
end
|
||||
% fprintf( 'samples_to_skip_at_start is %d\n', samples_to_skip_at_start);
|
||||
|
||||
samples_to_skip_at_end = 0;
|
||||
sum_of_5_samples= 0;
|
||||
while ((sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES) ...
|
||||
&& (samples_to_skip_at_end < maxNsamples / 2))
|
||||
sum_of_5_samples= sum( abs( ref_data( maxNsamples - ...
|
||||
SEARCHBUFFER* Downsample + DATAPADDING_MSECS* (Fs/ 1000) ...
|
||||
- samples_to_skip_at_end - 4: maxNsamples - ...
|
||||
SEARCHBUFFER* Downsample + DATAPADDING_MSECS* (Fs/ 1000) ...
|
||||
- samples_to_skip_at_end)));
|
||||
if (sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES)
|
||||
samples_to_skip_at_end = samples_to_skip_at_end+ 1;
|
||||
end
|
||||
end
|
||||
% fprintf( 'samples_to_skip_at_end is %d\n', samples_to_skip_at_end);
|
||||
|
||||
start_frame = floor( samples_to_skip_at_start/ (Nf/ 2));
|
||||
stop_frame = floor( (maxNsamples- 2* SEARCHBUFFER* Downsample ...
|
||||
+ DATAPADDING_MSECS* (Fs/ 1000)- samples_to_skip_at_end) ...
|
||||
/ (Nf/ 2))- 1;
|
||||
% number of frames in speech data plus DATAPADDING_MSECS
|
||||
% fprintf( 'start/end frame is %d/%d\n', start_frame, stop_frame);
|
||||
|
||||
D_disturbance= zeros( stop_frame+ 1, Nb);
|
||||
DA_disturbance= zeros( stop_frame+ 1, Nb);
|
||||
|
||||
power_ref = pow_of (ref_data, SEARCHBUFFER* Downsample, ...
|
||||
maxNsamples- SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000),...
|
||||
maxNsamples- 2* SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000));
|
||||
power_deg = pow_of (deg_data, SEARCHBUFFER * Downsample, ...
|
||||
maxNsamples- SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000),...
|
||||
maxNsamples- 2* SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000));
|
||||
% fprintf( 'ref/deg power is %f/%f\n', power_ref, power_deg);
|
||||
|
||||
hz_spectrum_ref = zeros( 1, Nf/ 2);
|
||||
hz_spectrum_deg = zeros( 1, Nf/ 2);
|
||||
frame_is_bad = zeros( 1, stop_frame + 1);
|
||||
smeared_frame_is_bad = zeros( 1, stop_frame + 1);
|
||||
silent = zeros( 1, stop_frame + 1);
|
||||
|
||||
pitch_pow_dens_ref = zeros( stop_frame + 1, Nb);
|
||||
pitch_pow_dens_deg = zeros( stop_frame + 1, Nb);
|
||||
|
||||
frame_was_skipped = zeros( 1, stop_frame + 1);
|
||||
frame_disturbance = zeros( 1, stop_frame + 1);
|
||||
frame_disturbance_asym_add = zeros( 1, stop_frame + 1);
|
||||
|
||||
avg_pitch_pow_dens_ref = zeros( 1, Nb);
|
||||
avg_pitch_pow_dens_deg = zeros( 1, Nb);
|
||||
loudness_dens_ref = zeros( 1, Nb);
|
||||
loudness_dens_deg = zeros( 1, Nb);
|
||||
deadzone = zeros( 1, Nb);
|
||||
disturbance_dens = zeros( 1, Nb);
|
||||
disturbance_dens_asym_add = zeros( 1, Nb);
|
||||
|
||||
time_weight = zeros( 1, stop_frame + 1);
|
||||
total_power_ref = zeros( 1, stop_frame + 1);
|
||||
|
||||
% fid= fopen( 'tmp_mat.txt', 'wt');
|
||||
|
||||
for frame = 0: stop_frame
|
||||
start_sample_ref = 1+ SEARCHBUFFER * Downsample + frame* (Nf/ 2);
|
||||
hz_spectrum_ref= short_term_fft (Nf, ref_data, Whanning, ...
|
||||
start_sample_ref);
|
||||
|
||||
utt = Nutterances;
|
||||
while ((utt >= 1) && ((Utt_Start(utt)- 1)* Downsample+ 1 ...
|
||||
> start_sample_ref))
|
||||
utt= utt - 1;
|
||||
end
|
||||
|
||||
if (utt >= 1)
|
||||
delay = Utt_Delay(utt);
|
||||
else
|
||||
delay = Utt_Delay(1);
|
||||
end
|
||||
|
||||
start_sample_deg = start_sample_ref + delay;
|
||||
|
||||
if ((start_sample_deg > 0) && (start_sample_deg + Nf- 1 < ...
|
||||
maxNsamples+ DATAPADDING_MSECS* (Fs/ 1000)))
|
||||
hz_spectrum_deg= short_term_fft (Nf, deg_data, Whanning, ...
|
||||
start_sample_deg);
|
||||
else
|
||||
hz_spectrum_deg( 1: Nf/ 2)= 0;
|
||||
end
|
||||
|
||||
pitch_pow_dens_ref( frame+ 1, :)= freq_warping (...
|
||||
hz_spectrum_ref, Nb, frame);
|
||||
%peak = maximum_of (pitch_pow_dens_ref, 0, Nb);
|
||||
pitch_pow_dens_deg( frame+ 1, :)= freq_warping (...
|
||||
hz_spectrum_deg, Nb, frame);
|
||||
|
||||
total_audible_pow_ref = total_audible (frame, pitch_pow_dens_ref, 1E2);
|
||||
total_audible_pow_deg = total_audible (frame, pitch_pow_dens_deg, 1E2);
|
||||
silent(frame+ 1) = (total_audible_pow_ref < 1E7);
|
||||
|
||||
|
||||
end
|
||||
% fclose( fid);
|
||||
|
||||
avg_pitch_pow_dens_ref= time_avg_audible_of (stop_frame + 1, ...
|
||||
silent, pitch_pow_dens_ref, floor((maxNsamples- 2* SEARCHBUFFER* ...
|
||||
Downsample+ DATAPADDING_MSECS* (Fs/ 1000))/ (Nf / 2))- 1);
|
||||
avg_pitch_pow_dens_deg= time_avg_audible_of (stop_frame + 1, ...
|
||||
silent, pitch_pow_dens_deg, floor((maxNsamples- 2* SEARCHBUFFER* ...
|
||||
Downsample+ DATAPADDING_MSECS* (Fs/ 1000))/ (Nf/ 2))- 1);
|
||||
|
||||
% fid= fopen( 'tmp_mat.txt', 'wt');
|
||||
% fprintf( fid, '%f\n', avg_pitch_pow_dens_deg);
|
||||
% fclose( fid);
|
||||
|
||||
if (CALIBRATE== 0)
|
||||
pitch_pow_dens_ref= freq_resp_compensation (stop_frame + 1, ...
|
||||
pitch_pow_dens_ref, avg_pitch_pow_dens_ref, ...
|
||||
avg_pitch_pow_dens_deg, 1000);
|
||||
if (Plot_Frame>= 0) % plot pitch_pow_dens_ref
|
||||
figure;
|
||||
subplot( 1, 2, 1);
|
||||
plot( centre_of_band_hz, 10* log10( eps+ ...
|
||||
pitch_pow_dens_ref( Plot_Frame+ 1, :)));
|
||||
axis( [0 Fs/2 0 95]); %xlabel( 'Hz'); ylabel( 'Db');
|
||||
title( 'reference signal bark spectrum with frequency compensation');
|
||||
subplot( 1, 2, 2);
|
||||
plot( centre_of_band_hz, 10* log10( eps+ ...
|
||||
pitch_pow_dens_deg( Plot_Frame+ 1, :)));
|
||||
axis( [0 Fs/2 0 95]); %xlabel( 'Hz'); ylabel( 'Db');
|
||||
title( 'degraded signal bark spectrum');
|
||||
end
|
||||
|
||||
end
|
||||
% tmp1= pitch_pow_dens_ref';
|
||||
|
||||
|
||||
MAX_SCALE = 5.0;
|
||||
MIN_SCALE = 3e-4;
|
||||
oldScale = 1;
|
||||
THRESHOLD_BAD_FRAMES = 30;
|
||||
for frame = 0: stop_frame
|
||||
|
||||
total_audible_pow_ref = total_audible (frame, pitch_pow_dens_ref, 1);
|
||||
total_audible_pow_deg = total_audible (frame, pitch_pow_dens_deg, 1);
|
||||
total_power_ref (1+ frame) = total_audible_pow_ref;
|
||||
|
||||
scale = (total_audible_pow_ref + 5e3)/ (total_audible_pow_deg + 5e3);
|
||||
if (frame > 0)
|
||||
scale = 0.2 * oldScale + 0.8 * scale;
|
||||
end
|
||||
oldScale = scale;
|
||||
|
||||
if (scale > MAX_SCALE)
|
||||
scale = MAX_SCALE;
|
||||
elseif (scale < MIN_SCALE)
|
||||
scale = MIN_SCALE;
|
||||
end
|
||||
|
||||
pitch_pow_dens_deg( 1+ frame, :) = ...
|
||||
pitch_pow_dens_deg( 1+ frame, :) * scale;
|
||||
|
||||
if (frame== Plot_Frame)
|
||||
figure;
|
||||
subplot( 1, 2, 1);
|
||||
plot( centre_of_band_hz, 10* log10( eps+ ...
|
||||
pitch_pow_dens_ref( Plot_Frame+ 1, :)));
|
||||
axis( [0 Fs/2 0 95]); %xlabel( 'Hz'); ylabel( 'Db');
|
||||
subplot( 1, 2, 2);
|
||||
plot( centre_of_band_hz, 10* log10( eps+ ...
|
||||
pitch_pow_dens_deg( Plot_Frame+ 1, :)));
|
||||
axis( [0 Fs/2 0 95]); %xlabel( 'Hz'); ylabel( 'Db');
|
||||
end
|
||||
|
||||
loudness_dens_ref = intensity_warping_of (frame, pitch_pow_dens_ref);
|
||||
loudness_dens_deg = intensity_warping_of (frame, pitch_pow_dens_deg);
|
||||
disturbance_dens = loudness_dens_deg - loudness_dens_ref;
|
||||
|
||||
if (frame== Plot_Frame)
|
||||
figure;
|
||||
subplot( 1, 2, 1);
|
||||
plot( centre_of_band_hz, 10* log10( eps+ ...
|
||||
loudness_dens_ref));
|
||||
axis( [0 Fs/2 0 15]); %xlabel( 'Hz'); ylabel( 'Db');
|
||||
title( 'reference signal loudness density');
|
||||
subplot( 1, 2, 2);
|
||||
plot( centre_of_band_hz, 10* log10( eps+ ...
|
||||
loudness_dens_deg));
|
||||
axis( [0 Fs/2 0 15]); %xlabel( 'Hz'); ylabel( 'Db');
|
||||
title( 'degraded signal loudness density');
|
||||
end
|
||||
|
||||
for band =1: Nb
|
||||
deadzone (band) = 0.25* min (loudness_dens_deg (band), ...
|
||||
loudness_dens_ref (band));
|
||||
end
|
||||
|
||||
for band = 1: Nb
|
||||
d = disturbance_dens (band);
|
||||
m = deadzone (band);
|
||||
|
||||
if (d > m)
|
||||
disturbance_dens (band) = disturbance_dens (band)- m;
|
||||
% disturbance_dens (band) = d- m;
|
||||
else
|
||||
if (d < -m)
|
||||
disturbance_dens (band) = disturbance_dens (band)+ m;
|
||||
% disturbance_dens (band) = d+ m;
|
||||
else
|
||||
disturbance_dens (band) = 0;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
if (frame== Plot_Frame)
|
||||
figure;
|
||||
subplot( 1, 2, 1);
|
||||
plot( centre_of_band_hz, disturbance_dens);
|
||||
axis( [0 Fs/2 -1 50]); %xlabel( 'Hz'); ylabel( 'Db');
|
||||
title( 'disturbance');
|
||||
end
|
||||
D_disturbance( frame+ 1, :)= disturbance_dens;
|
||||
|
||||
frame_disturbance (1+ frame) = pseudo_Lp (disturbance_dens, D_POW_F);
|
||||
if (frame_disturbance (1+ frame) > THRESHOLD_BAD_FRAMES)
|
||||
there_is_a_bad_frame = TRUE;
|
||||
end
|
||||
|
||||
disturbance_dens= multiply_with_asymmetry_factor (...
|
||||
disturbance_dens, frame, pitch_pow_dens_ref, pitch_pow_dens_deg);
|
||||
|
||||
if (frame== Plot_Frame)
|
||||
subplot( 1, 2, 2);
|
||||
plot( centre_of_band_hz, disturbance_dens);
|
||||
axis( [0 Fs/2 -1 50]); %xlabel( 'Hz'); ylabel( 'Db');
|
||||
title( 'disturbance after asymmetry processing');
|
||||
end
|
||||
DA_disturbance( frame+ 1, :)= disturbance_dens;
|
||||
|
||||
|
||||
frame_disturbance_asym_add (1+ frame) = ...
|
||||
pseudo_Lp (disturbance_dens, A_POW_F);
|
||||
end
|
||||
% fid= fopen( 'tmp_mat.txt', 'wt');
|
||||
% fprintf( fid, '%f\n', frame_disturbance);
|
||||
% fclose( fid);
|
||||
|
||||
frame_was_skipped (1: 1+ stop_frame) = FALSE;
|
||||
|
||||
for utt = 2: Nutterances
|
||||
frame1 = floor (((Utt_Start(utt)- 1- SEARCHBUFFER )* Downsample+ 1+ ...
|
||||
Utt_Delay(utt))/ (Nf/ 2));
|
||||
j = floor( floor(((Utt_End(utt-1)- 1- SEARCHBUFFER)* Downsample+ 1+ ...
|
||||
Utt_Delay(utt-1)))/(Nf/ 2));
|
||||
delay_jump = Utt_Delay(utt) - Utt_Delay(utt-1);
|
||||
if (frame1 > j)
|
||||
frame1 = j;
|
||||
elseif (frame1 < 0)
|
||||
frame1 = 0;
|
||||
end
|
||||
% fprintf( 'frame1, j, delay_jump is %d, %d, %d\n', frame1, ...
|
||||
% j, delay_jump);
|
||||
|
||||
if (delay_jump < -(Nf/ 2))
|
||||
frame2 = floor (((Utt_Start(utt)- 1- SEARCHBUFFER)* Downsample+ 1 ...
|
||||
+ max (0, abs (delay_jump)))/ (Nf/ 2)) + 1;
|
||||
|
||||
for frame = frame1: frame2
|
||||
if (frame < stop_frame)
|
||||
frame_was_skipped (1+ frame) = TRUE;
|
||||
frame_disturbance (1+ frame) = 0;
|
||||
frame_disturbance_asym_add (1+ frame) = 0;
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
nn = DATAPADDING_MSECS* (Fs/ 1000) + maxNsamples;
|
||||
tweaked_deg = zeros( 1, nn);
|
||||
% fprintf( 'nn is %d\n', nn);
|
||||
|
||||
for i= SEARCHBUFFER* Downsample+ 1: nn- SEARCHBUFFER* Downsample
|
||||
utt = Nutterances;
|
||||
|
||||
while ((utt >= 1) && ((Utt_Start (utt)- 1)* Downsample> i))
|
||||
utt = utt- 1;
|
||||
end
|
||||
if (utt >= 1)
|
||||
delay = Utt_Delay (utt);
|
||||
else
|
||||
delay = Utt_Delay (1);
|
||||
end
|
||||
|
||||
j = i + delay;
|
||||
if (j < SEARCHBUFFER * Downsample+ 1)
|
||||
j = SEARCHBUFFER * Downsample+ 1;
|
||||
end
|
||||
if (j > nn - SEARCHBUFFER * Downsample)
|
||||
j = nn - SEARCHBUFFER * Downsample;
|
||||
end
|
||||
tweaked_deg (i) = deg_data (j);
|
||||
end
|
||||
|
||||
if (there_is_a_bad_frame)
|
||||
|
||||
for frame = 0: stop_frame
|
||||
frame_is_bad (1+ frame) = (frame_disturbance (1+ frame)...
|
||||
> THRESHOLD_BAD_FRAMES);
|
||||
smeared_frame_is_bad (1+ frame) = FALSE;
|
||||
end
|
||||
frame_is_bad (1) = FALSE;
|
||||
SMEAR_RANGE = 2;
|
||||
|
||||
for frame = SMEAR_RANGE: stop_frame- 1- SMEAR_RANGE
|
||||
max_itself_and_left = frame_is_bad (1+ frame);
|
||||
max_itself_and_right = frame_is_bad (1+ frame);
|
||||
|
||||
for i = -SMEAR_RANGE: 0
|
||||
if (max_itself_and_left < frame_is_bad (1+ frame+ i))
|
||||
max_itself_and_left = frame_is_bad (1+ frame+ i);
|
||||
end
|
||||
end
|
||||
|
||||
for i = 0: SMEAR_RANGE
|
||||
if (max_itself_and_right < frame_is_bad (1+ frame + i))
|
||||
max_itself_and_right = frame_is_bad (1+ frame + i);
|
||||
end
|
||||
end
|
||||
|
||||
mini = max_itself_and_left;
|
||||
if (mini > max_itself_and_right)
|
||||
mini = max_itself_and_right;
|
||||
end
|
||||
|
||||
smeared_frame_is_bad (1+ frame) = mini;
|
||||
end
|
||||
|
||||
MINIMUM_NUMBER_OF_BAD_FRAMES_IN_BAD_INTERVAL = 5;
|
||||
number_of_bad_intervals = 0;
|
||||
frame = 0;
|
||||
while (frame <= stop_frame)
|
||||
while ((frame <= stop_frame) && (~smeared_frame_is_bad (1+ frame)))
|
||||
frame= frame+ 1;
|
||||
end
|
||||
|
||||
if (frame <= stop_frame)
|
||||
start_frame_of_bad_interval(1+ number_of_bad_intervals)= ...
|
||||
1+ frame;
|
||||
|
||||
while ((frame <= stop_frame) && (...
|
||||
smeared_frame_is_bad (1+ frame)))
|
||||
frame= frame+ 1;
|
||||
end
|
||||
|
||||
if (frame <= stop_frame)
|
||||
stop_frame_of_bad_interval(1+ number_of_bad_intervals)= ...
|
||||
1+ frame;
|
||||
if (stop_frame_of_bad_interval(1+ number_of_bad_intervals)- ...
|
||||
start_frame_of_bad_interval(1+ number_of_bad_intervals)...
|
||||
>= MINIMUM_NUMBER_OF_BAD_FRAMES_IN_BAD_INTERVAL)
|
||||
number_of_bad_intervals= number_of_bad_intervals+ 1;
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
for bad_interval = 0: number_of_bad_intervals - 1
|
||||
start_sample_of_bad_interval(1+ bad_interval) = ...
|
||||
(start_frame_of_bad_interval(1+ bad_interval)- 1) * (Nf/ 2) ...
|
||||
+ SEARCHBUFFER * Downsample+ 1;
|
||||
stop_sample_of_bad_interval(1+ bad_interval) = ...
|
||||
(stop_frame_of_bad_interval(1+ bad_interval)- 1) * (Nf/ 2) ...
|
||||
+ Nf + SEARCHBUFFER* Downsample;
|
||||
if (stop_frame_of_bad_interval(1+ bad_interval) > stop_frame+ 1)
|
||||
stop_frame_of_bad_interval(1+ bad_interval) = stop_frame+ 1;
|
||||
end
|
||||
|
||||
number_of_samples_in_bad_interval(1+ bad_interval) = ...
|
||||
stop_sample_of_bad_interval(1+ bad_interval) - ...
|
||||
start_sample_of_bad_interval(1+ bad_interval)+ 1;
|
||||
end
|
||||
% fprintf( 'number of bad intervals %d\n', number_of_bad_intervals);
|
||||
% fprintf( '%d %d\n', number_of_samples_in_bad_interval(1), ...
|
||||
% number_of_samples_in_bad_interval(2));
|
||||
% fprintf( '%d %d\n', start_sample_of_bad_interval(1), ...
|
||||
% start_sample_of_bad_interval(2));
|
||||
|
||||
SEARCH_RANGE_IN_TRANSFORM_LENGTH = 4;
|
||||
search_range_in_samples= SEARCH_RANGE_IN_TRANSFORM_LENGTH * Nf;
|
||||
|
||||
for bad_interval= 0: number_of_bad_intervals- 1
|
||||
ref = zeros (1, 2 * search_range_in_samples + ...
|
||||
number_of_samples_in_bad_interval (1+ bad_interval));
|
||||
deg = zeros (1, 2 * search_range_in_samples + ...
|
||||
number_of_samples_in_bad_interval (1+ bad_interval));
|
||||
|
||||
ref(1: search_range_in_samples) = 0;
|
||||
|
||||
ref (search_range_in_samples+ 1: search_range_in_samples+ ...
|
||||
number_of_samples_in_bad_interval (1+ bad_interval)) = ...
|
||||
ref_data (start_sample_of_bad_interval( 1+ bad_interval) + 1: ...
|
||||
start_sample_of_bad_interval( 1+ bad_interval) + ...
|
||||
number_of_samples_in_bad_interval (1+ bad_interval));
|
||||
|
||||
ref (search_range_in_samples + ...
|
||||
number_of_samples_in_bad_interval (1+ bad_interval) + 1: ...
|
||||
search_range_in_samples + ...
|
||||
number_of_samples_in_bad_interval (1+ bad_interval) + ...
|
||||
search_range_in_samples) = 0;
|
||||
|
||||
for i = 0: 2 * search_range_in_samples + ...
|
||||
number_of_samples_in_bad_interval (1+ bad_interval) - 1
|
||||
j = start_sample_of_bad_interval (1+ bad_interval) - ...
|
||||
search_range_in_samples + i;
|
||||
nn = maxNsamples - SEARCHBUFFER * Downsample + ...
|
||||
DATAPADDING_MSECS * (Fs / 1000);
|
||||
if (j <= SEARCHBUFFER * Downsample)
|
||||
j = SEARCHBUFFER * Downsample+ 1;
|
||||
end
|
||||
if (j > nn)
|
||||
j = nn;
|
||||
end
|
||||
deg (1+ i) = tweaked_deg (j);
|
||||
end
|
||||
|
||||
[delay_in_samples, best_correlation]= compute_delay ...
|
||||
(1, 2 * search_range_in_samples + ...
|
||||
number_of_samples_in_bad_interval (1+ bad_interval), ...
|
||||
search_range_in_samples, ref, deg);
|
||||
delay_in_samples_in_bad_interval (1+ bad_interval) = ...
|
||||
delay_in_samples;
|
||||
% fprintf( 'delay_in_samples, best_correlation is \n\t%d, %f\n', ...
|
||||
% delay_in_samples, best_correlation);
|
||||
%
|
||||
if (best_correlation < 0.5)
|
||||
delay_in_samples_in_bad_interval (1+ bad_interval) = 0;
|
||||
end
|
||||
end
|
||||
|
||||
if (number_of_bad_intervals > 0)
|
||||
doubly_tweaked_deg = tweaked_deg( 1: maxNsamples + ...
|
||||
DATAPADDING_MSECS * (Fs / 1000));
|
||||
for bad_interval= 0: number_of_bad_intervals- 1
|
||||
delay = delay_in_samples_in_bad_interval (1+ bad_interval);
|
||||
|
||||
for i = start_sample_of_bad_interval (1+ bad_interval): ...
|
||||
stop_sample_of_bad_interval (1+ bad_interval)
|
||||
j = i + delay;
|
||||
if (j < 1)
|
||||
j = 1;
|
||||
end
|
||||
if (j > maxNsamples)
|
||||
j = maxNsamples;
|
||||
end
|
||||
h = tweaked_deg (j);
|
||||
doubly_tweaked_deg (i) = h;
|
||||
end
|
||||
end
|
||||
|
||||
untweaked_deg = deg_data;
|
||||
deg_data = doubly_tweaked_deg;
|
||||
|
||||
for bad_interval= 0: number_of_bad_intervals- 1
|
||||
for frame = start_frame_of_bad_interval (1+ bad_interval): ...
|
||||
stop_frame_of_bad_interval (1+ bad_interval)- 1
|
||||
frame= frame- 1;
|
||||
start_sample_ref = SEARCHBUFFER * Downsample + ...
|
||||
frame * Nf / 2+ 1;
|
||||
start_sample_deg = start_sample_ref;
|
||||
hz_spectrum_deg= short_term_fft (Nf, deg_data, ...
|
||||
Whanning, start_sample_deg);
|
||||
pitch_pow_dens_deg( 1+ frame, :)= freq_warping (...
|
||||
hz_spectrum_deg, Nb, frame);
|
||||
end
|
||||
|
||||
oldScale = 1;
|
||||
for frame = start_frame_of_bad_interval (1+ bad_interval): ...
|
||||
stop_frame_of_bad_interval (1+ bad_interval)- 1
|
||||
frame= frame- 1;
|
||||
% see implementation for detail why 1 needed to be
|
||||
% subtracted
|
||||
total_audible_pow_ref = total_audible (frame, ...
|
||||
pitch_pow_dens_ref, 1);
|
||||
total_audible_pow_deg = total_audible (frame, ...
|
||||
pitch_pow_dens_deg, 1);
|
||||
scale = (total_audible_pow_ref + 5e3) / ...
|
||||
(total_audible_pow_deg + 5e3);
|
||||
if (frame > 0)
|
||||
scale = 0.2 * oldScale + 0.8*scale;
|
||||
end
|
||||
oldScale = scale;
|
||||
if (scale > MAX_SCALE)
|
||||
scale = MAX_SCALE;
|
||||
end
|
||||
if (scale < MIN_SCALE)
|
||||
scale = MIN_SCALE;
|
||||
end
|
||||
|
||||
pitch_pow_dens_deg (1+ frame, :) = ...
|
||||
pitch_pow_dens_deg (1+ frame, :)* scale;
|
||||
loudness_dens_ref= intensity_warping_of (frame, ...
|
||||
pitch_pow_dens_ref);
|
||||
loudness_dens_deg= intensity_warping_of (frame, ...
|
||||
pitch_pow_dens_deg);
|
||||
disturbance_dens = loudness_dens_deg - loudness_dens_ref;
|
||||
|
||||
for band = 1: Nb
|
||||
deadzone(band) = min (loudness_dens_deg(band), ...
|
||||
loudness_dens_ref(band));
|
||||
deadzone(band) = deadzone(band)* 0.25;
|
||||
end
|
||||
|
||||
for band = 1: Nb
|
||||
d = disturbance_dens (band);
|
||||
m = deadzone (band);
|
||||
|
||||
if (d > m)
|
||||
disturbance_dens (band) = ...
|
||||
disturbance_dens (band)- m;
|
||||
else
|
||||
if (d < -m)
|
||||
disturbance_dens (band) = ...
|
||||
disturbance_dens (band)+ m;
|
||||
else
|
||||
disturbance_dens (band) = 0;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
frame_disturbance( 1+ frame) = min (...
|
||||
frame_disturbance( 1+ frame), pseudo_Lp(...
|
||||
disturbance_dens, D_POW_F));
|
||||
disturbance_dens= multiply_with_asymmetry_factor ...
|
||||
(disturbance_dens, frame, pitch_pow_dens_ref, ...
|
||||
pitch_pow_dens_deg);
|
||||
frame_disturbance_asym_add(1+ frame) = min (...
|
||||
frame_disturbance_asym_add(1+ frame), ...
|
||||
pseudo_Lp (disturbance_dens, A_POW_F));
|
||||
end
|
||||
end
|
||||
deg_data = untweaked_deg;
|
||||
end
|
||||
end
|
||||
|
||||
for frame = 0: stop_frame
|
||||
h = 1;
|
||||
if (stop_frame + 1 > 1000)
|
||||
n = floor( (maxNsamples - 2 * SEARCHBUFFER * Downsample)...
|
||||
/ (Nf / 2)) - 1;
|
||||
timeWeightFactor = (n - 1000) / 5500;
|
||||
if (timeWeightFactor > 0.5)
|
||||
timeWeightFactor = 0.5;
|
||||
end
|
||||
h = (1.0 - timeWeightFactor) + timeWeightFactor * frame / n;
|
||||
end
|
||||
|
||||
time_weight (1 +frame) = h;
|
||||
end
|
||||
|
||||
% fid= fopen( 'tmp_mat1.txt', 'at');
|
||||
% fprintf( '\n');
|
||||
for frame = 0: stop_frame
|
||||
h = ((total_power_ref (1+ frame) + 1e5) / 1e7)^ 0.04;
|
||||
% if (frame== 118)
|
||||
% fprintf( '%f\n', h);
|
||||
% fprintf( '%f\n', frame_disturbance( 1+ frame));
|
||||
% end
|
||||
frame_disturbance( 1+ frame) = frame_disturbance( 1+ frame)/ h;
|
||||
|
||||
% if (frame== 118)
|
||||
% fprintf( '%f\n', frame_disturbance( 1+ frame));
|
||||
% end
|
||||
%
|
||||
frame_disturbance_asym_add( 1+ frame) = ...
|
||||
frame_disturbance_asym_add( 1+ frame)/ h;
|
||||
if (frame_disturbance( 1+ frame) > 45)
|
||||
frame_disturbance( 1+ frame) = 45;
|
||||
end
|
||||
if (frame_disturbance_asym_add( 1+ frame)> 45)
|
||||
frame_disturbance_asym_add( 1+ frame) = 45;
|
||||
end
|
||||
end
|
||||
% fclose ( fid);
|
||||
|
||||
d_indicator = Lpq_weight (start_frame, stop_frame, ...
|
||||
D_POW_S, D_POW_T, frame_disturbance, time_weight);
|
||||
a_indicator = Lpq_weight (start_frame, stop_frame, ...
|
||||
A_POW_S, A_POW_T, frame_disturbance_asym_add, time_weight);
|
||||
|
||||
pesq_mos = 4.5 - D_WEIGHT * d_indicator - A_WEIGHT * a_indicator;
|
||||
|
||||
if (Plot_Frame> 0)
|
||||
figure;
|
||||
subplot( 1, 2, 1);
|
||||
mesh( 0: stop_frame, centre_of_band_hz, D_disturbance');
|
||||
title( 'disturbance');
|
||||
subplot( 1, 2, 2);
|
||||
mesh( 0: stop_frame, centre_of_band_hz, DA_disturbance');
|
||||
title( 'disturbance after asymmetry processing');
|
||||
end
|
||||
|
||||
% fid= fopen( 'tmp_mat.txt', 'wt');
|
||||
% fprintf( fid, 'time_weight\n');
|
||||
% fprintf( fid, '%f\n', time_weight);
|
||||
% fprintf( fid, 'frame_disturbance:\n');
|
||||
% fprintf( fid, '%f\n', frame_disturbance);
|
||||
% fprintf( fid, 'frame_disturbance_asym_add\n');
|
||||
% fprintf( fid, '%f\n', frame_disturbance_asym_add);
|
||||
% fclose( fid);
|
||||
|
||||
function result_time= Lpq_weight(start_frame, stop_frame, ...
    power_syllable, power_time, frame_disturbance, time_weight)
% LPQ_WEIGHT  Two-stage power-mean aggregation of per-frame disturbances.
%
%   Frames start_frame..stop_frame (0-based frame numbers; frame k is
%   stored at index 1+k of frame_disturbance) are grouped into windows of
%   NUMBER_OF_PSQM_FRAMES_PER_SYLLABE frames that advance by half a window.
%   Inside a window the disturbances are combined with exponent
%   power_syllable; the window results are then combined over time with
%   exponent power_time, each weighted by the time_weight entry of the
%   window's first frame (indexed relative to start_frame).
%
%   Slots of a window that lie beyond stop_frame contribute nothing to the
%   sum but are still counted in the divisor.

global NUMBER_OF_PSQM_FRAMES_PER_SYLLABE

syllable_len = NUMBER_OF_PSQM_FRAMES_PER_SYLLABE;
hop = syllable_len/ 2;

weighted_sum = 0;   % sum of (weight * syllable value)^power_time
weight_norm = 0;    % sum of weight^power_time

for first_frame = start_frame: hop: stop_frame
    acc = 0;
    slots = 0;

    for frame = first_frame: first_frame+ syllable_len- 1
        if (frame <= stop_frame)
            acc = acc+ frame_disturbance( 1+ frame)^ power_syllable;
        end
        slots = slots+ 1;
    end

    % power mean over the window
    syllable_value = (acc/ slots)^ (1/ power_syllable);

    w = time_weight( 1+ first_frame- start_frame);
    weighted_sum = weighted_sum+ (w* syllable_value)^ power_time;
    weight_norm = weight_norm+ w^ power_time;
end

result_time = (weighted_sum/ weight_norm)^ (1/ power_time);
|
||||
|
||||
|
||||
function [best_delay, max_correlation] = compute_delay (...
    start_sample, stop_sample, search_range, ...
    time_series1, time_series2)
% COMPUTE_DELAY  Estimate the lag between two series by FFT cross-correlation.
%
%   The absolute values of time_series1 and time_series2 over the
%   inclusive index range start_sample..stop_sample are zero-padded to a
%   power-of-two length and circularly cross-correlated through the FFT.
%   Lags -search_range .. search_range-1 are scanned for the largest
%   normalised correlation magnitude.
%
%   Outputs:
%     best_delay       lag of the strongest correlation (see note at the
%                      end about the -1 offset), or 0 when either segment
%                      has (almost) no power.
%     max_correlation  the corresponding normalised correlation, or 0
%                      when either segment has (almost) no power.

n = stop_sample - start_sample+ 1;
power_of_2 = 2^ (ceil( log2( 2 * n)));

power1 = pow_of (time_series1, start_sample, stop_sample, n)* ...
    n/ power_of_2;
power2 = pow_of (time_series2, start_sample, stop_sample, n)* ...
    n/ power_of_2;
normalization = sqrt (power1 * power2);

% Nothing to correlate against: report "no delay, no correlation" and stop
% here.  Without the return the code below would divide by a (near) zero
% normalization and overwrite these values.
if ((power1 <= 1e-6) || (power2 <= 1e-6))
    max_correlation = 0;
    best_delay= 0;
    return;
end

% zero-padded, rectified copies of the two segments
x1= zeros( 1, power_of_2);
x2= zeros( 1, power_of_2);
x1( 1: n)= abs( time_series1( start_sample: stop_sample));
x2( 1: n)= abs( time_series2( start_sample: stop_sample));

x1_fft= fft( x1, power_of_2)/ power_of_2;
x2_fft= fft( x2, power_of_2);
y= ifft( conj( x1_fft).* x2_fft, power_of_2);

best_delay = 0;
max_correlation = 0;

% negative lags live at the end of the circular correlation
for i = -search_range: -1
    h = abs (y (1+ i + power_of_2)) / normalization;
    if (h > max_correlation)
        max_correlation = h;
        best_delay= i;
    end
end

% non-negative lags
for i = 0: search_range- 1
    h = abs (y (1+ i)) / normalization;
    if (h > max_correlation)
        max_correlation = h;
        best_delay= i;
    end
end

% NOTE(review): offset kept exactly as in the original code; presumably it
% compensates for the 1-based sample indexing used by callers -- confirm.
best_delay= best_delay- 1;
|
||||
|
||||
function mod_disturbance_dens= multiply_with_asymmetry_factor (...
    disturbance_dens, frame, pitch_pow_dens_ref, pitch_pow_dens_deg)
% MULTIPLY_WITH_ASYMMETRY_FACTOR  Scale each band's disturbance by a
%   factor derived from the degraded/reference power ratio of that band.
%
%   frame is a 0-based frame number; row 1+frame of both pitch power
%   density matrices is used.  For every band 1..Nb the factor is
%       ((deg + 50) / (ref + 50)) ^ 1.2
%   which is set to 0 when below 3 and limited to 12 from above.

global Nb

row = 1+ frame;

for band = 1: Nb
    factor = ((pitch_pow_dens_deg( row, band)+ 50)...
        / (pitch_pow_dens_ref( row, band)+ 50))^ 1.2;

    if (factor < 3)
        factor = 0.0;
    elseif (factor > 12)
        factor = 12;
    end

    mod_disturbance_dens( band) = disturbance_dens( band)* factor;
end
|
||||
|
||||
|
||||
function loudness_dens = intensity_warping_of (...
    frame, pitch_pow_dens)
% INTENSITY_WARPING_OF  Map one frame of pitch power densities to
%   loudness densities, band by band.
%
%   frame is a 0-based frame number (row 1+frame of pitch_pow_dens is
%   read).  For each band 1..Nb the power is compared with the band's
%   entry in abs_thresh_power: at or below the threshold the loudness is
%   0, above it a power law with a band-dependent exponent is applied.
%   Every value is finally scaled by the global Sl.

global abs_thresh_power Sl Nb centre_of_band_bark

ZWICKER_POWER= 0.23;

for band = 1: Nb
    thr = abs_thresh_power( band);
    band_power = pitch_pow_dens( 1+ frame, band);
    bark = centre_of_band_bark( band);

    % exponent correction: 6/(bark+2) for bands below 4 Bark, capped at
    % 2, and 1 otherwise
    corr = 1;
    if (bark < 4)
        corr = 6/ (bark+ 2);
    end
    if (corr > 2)
        corr = 2;
    end
    exponent = ZWICKER_POWER* corr^ 0.15;

    warped = 0;
    if (band_power > thr)
        warped = ((thr/ 0.5)^ exponent)...
            * ((0.5+ 0.5* band_power/ thr)^ exponent- 1);
    end

    loudness_dens( band) = warped* Sl;
end
|
||||
|
||||
function result= pseudo_Lp (x, p)
% PSEUDO_LP  Band-width weighted Lp-style norm of x over bands 2..Nb.
%
%   Each |x(band)| is multiplied by width_of_band_bark(band) and raised
%   to the power p.  The sum is divided by the total width, the p-th root
%   is taken and the result is multiplied by the total width again.
%   Band 1 is not included.

global Nb width_of_band_bark

weight_total = 0;
power_sum = 0;

band = 2;
while (band <= Nb)
    w = width_of_band_bark( band);
    power_sum = power_sum+ (abs( x( band))* w)^ p;
    weight_total = weight_total+ w;
    band = band+ 1;
end

result = weight_total* (power_sum/ weight_total)^ (1/ p);
|
||||
|
||||
|
||||
function mod_pitch_pow_dens_ref= freq_resp_compensation (number_of_frames, ...
    pitch_pow_dens_ref, avg_pitch_pow_dens_ref, ...
    avg_pitch_pow_dens_deg, constant)
% FREQ_RESP_COMPENSATION  Apply a per-band gain to the reference pitch
%   power densities.
%
%   For each band 1..Nb the gain is
%       (avg_deg(band) + constant) / (avg_ref(band) + constant)
%   limited to the range [0.01, 100].  Rows 1..number_of_frames of
%   pitch_pow_dens_ref are multiplied by that gain, column by column.

global Nb

% pass 1: one bounded gain per band
for band = 1: Nb
    g = (avg_pitch_pow_dens_deg( band)+ constant)/ ...
        (avg_pitch_pow_dens_ref( band)+ constant);

    if (g < 0.01)
        g = 0.01;
    elseif (g > 100.0)
        g = 100.0;
    end

    gain( band) = g;
end

% pass 2: scale every frame of the reference
for frame = 1: number_of_frames
    for band = 1: Nb
        mod_pitch_pow_dens_ref( frame, band) = ...
            pitch_pow_dens_ref( frame, band)* gain( band);
    end
end
|
||||
|
||||
|
||||
|
||||
function avg_pitch_pow_dens= time_avg_audible_of(number_of_frames, ...
    silent, pitch_pow_dens, total_number_of_frames)
% TIME_AVG_AUDIBLE_OF  Per-band time average of the clearly audible power.
%
%   For each band 1..Nb, sums pitch_pow_dens(frame, band) over frames
%   1..number_of_frames that are not flagged in silent and whose power
%   exceeds 100 * abs_thresh_power(band), then divides the sum by
%   total_number_of_frames.
%
%   Returns a 1-by-Nb row vector.  With number_of_frames == 0 every entry
%   is 0 / total_number_of_frames (the original left the output
%   unassigned in that case because the assignment sat inside the frame
%   loop).

global Nb abs_thresh_power

avg_pitch_pow_dens = zeros( 1, Nb);

for band = 1: Nb
    % loop-invariant audibility threshold for this band
    audible_threshold = 100* abs_thresh_power( band);

    result = 0;
    for frame = 1: number_of_frames
        if (~silent( frame))
            h = pitch_pow_dens( frame, band);
            if (h > audible_threshold)
                result = result+ h;
            end
        end
    end

    % assigned once per band, after all frames have been accumulated
    avg_pitch_pow_dens( band) = result/ total_number_of_frames;
end
|
||||
|
||||
|
||||
|
||||
function hz_spectrum= short_term_fft (Nf, data, Whanning, start_sample)
% SHORT_TERM_FFT  Power spectrum of one windowed frame.
%
%   Takes Nf samples of data beginning at index start_sample, multiplies
%   them element-wise by Whanning, and returns the squared magnitude of
%   the first Nf/2 FFT bins.  The first bin is forced to 0.

last_sample = start_sample+ Nf- 1;
windowed = data( start_sample: last_sample).* Whanning;

spectrum = fft( windowed);
hz_spectrum = abs( spectrum( 1: Nf/ 2)).^ 2;

% discard the first (zero-frequency) bin
hz_spectrum( 1) = 0;
|
||||
|
||||
|
||||
function pitch_pow_dens= freq_warping( hz_spectrum, Nb, frame)
% FREQ_WARPING  Collapse a Hz power spectrum into Nb bands.
%
%   Consecutive bins of hz_spectrum are consumed band by band:
%   nr_of_hz_bands_per_bark_band(b) bins are summed for band b, and the
%   sum is multiplied by pow_dens_correction_factor(b) and then by the
%   global Sp.  The frame argument is accepted but not used here.

global nr_of_hz_bands_per_bark_band pow_dens_correction_factor
global Sp

next_bin = 1;

for bark_band = 1: Nb
    last_bin = next_bin+ nr_of_hz_bands_per_bark_band( bark_band)- 1;

    band_power = 0;
    for bin = next_bin: last_bin
        band_power = band_power+ hz_spectrum( bin);
    end
    next_bin = last_bin+ 1;

    pitch_pow_dens( bark_band) = ...
        band_power* pow_dens_correction_factor( bark_band)* Sp;
end
|
||||
|
||||
|
||||
function total_audible_pow = total_audible (frame, ...
    pitch_pow_dens, factor)
% TOTAL_AUDIBLE  Sum of the band powers of one frame that exceed a
%   scaled threshold.
%
%   frame is a 0-based frame number (row frame+1 of pitch_pow_dens is
%   read).  Bands 2..Nb are considered; a band contributes its power only
%   when it is strictly greater than factor * abs_thresh_power(band).

global Nb abs_thresh_power

row = frame+ 1;
total_audible_pow = 0;

for band= 2: Nb
    band_power = pitch_pow_dens( row, band);
    if (band_power > factor* abs_thresh_power( band))
        total_audible_pow = total_audible_pow+ band_power;
    end
end
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,3 @@
|
||||
function power= pow_of( data, start_point, end_point, divisor)
% POW_OF  Sum of squares of data(start_point:end_point) divided by divisor.

segment = data( start_point: end_point);
power = sum( segment.^ 2)/ divisor;
|
||||
@@ -0,0 +1,301 @@
|
||||
function setup_global( sampling_rate)
% SETUP_GLOBAL  Initialise the global configuration used by the PESQ code.
%
%   setup_global( sampling_rate) fills the global variables declared
%   below.  Rate-independent settings are always written.  The
%   rate-dependent ones (Downsample, InIIR_*, Align_Nfft, Fs, Nb, Sl, Sp
%   and the per-band tables) are copied from the 16 kHz or 8 kHz constant
%   sets when sampling_rate equals 16000 or 8000 respectively; for any
%   other value they are left untouched.

global Downsample InIIR_Hsos InIIR_Nsos Align_Nfft
global DATAPADDING_MSECS SEARCHBUFFER Fs MINSPEECHLGTH JOINSPEECHLGTH

global Nutterances Largest_uttsize Nsurf_samples Crude_DelayEst
global Crude_DelayConf UttSearch_Start UttSearch_End Utt_DelayEst
global Utt_Delay Utt_DelayConf Utt_Start Utt_End
global MAXNUTTERANCES WHOLE_SIGNAL
global pesq_mos subj_mos cond_nr MINUTTLENGTH
global CALIBRATE Nfmax Nb Sl Sp
global nr_of_hz_bands_per_bark_band centre_of_band_bark
global width_of_band_hz centre_of_band_hz width_of_band_bark
global pow_dens_correction_factor abs_thresh_power

% ---- rate-independent settings ----------------------------------------
CALIBRATE= 0;
Nfmax= 512;

MAXNUTTERANCES= 50;
MINUTTLENGTH= 50;
WHOLE_SIGNAL= -1;
% was misspelled "UttSearch_Star", which created a local variable and left
% the global UttSearch_Start uninitialised
UttSearch_Start= zeros( 1, MAXNUTTERANCES);
UttSearch_End= zeros( 1, MAXNUTTERANCES);
Utt_DelayEst= zeros( 1, MAXNUTTERANCES);
Utt_Delay= zeros( 1, MAXNUTTERANCES);
Utt_DelayConf= zeros( 1, MAXNUTTERANCES);
Utt_Start= zeros( 1, MAXNUTTERANCES);
Utt_End= zeros( 1, MAXNUTTERANCES);

DATAPADDING_MSECS= 320;
SEARCHBUFFER= 75;
MINSPEECHLGTH= 4;
JOINSPEECHLGTH= 50;

% ---- 16 kHz constants --------------------------------------------------
Sp_16k = 6.910853e-006;
Sl_16k = 1.866055e-001;
fs_16k= 16000;
Downsample_16k = 64;
Align_Nfft_16k = 1024;
InIIR_Nsos_16k = 12;
InIIR_Hsos_16k = [
    0.325631521, -0.086782860, -0.238848661, -1.079416490, 0.434583902;
    0.403961804, -0.556985881, 0.153024077, -0.415115835, 0.696590244;
    4.736162769, 3.287251046, 1.753289019, -1.859599046, 0.876284034;
    0.365373469, 0.000000000, 0.000000000, -0.634626531, 0.000000000;
    0.884811506, 0.000000000, 0.000000000, -0.256725271, 0.141536777;
    0.723593055, -1.447186099, 0.723593044, -1.129587469, 0.657232737;
    1.644910855, -1.817280902, 1.249658063, -1.778403899, 0.801724355;
    0.633692689, -0.284644314, -0.319789663, 0.000000000, 0.000000000;
    1.032763031, 0.268428979, 0.602913323, 0.000000000, 0.000000000;
    1.001616361, -0.823749013, 0.439731942, -0.885778255, 0.000000000;
    0.752472096, -0.375388990, 0.188977609, -0.077258216, 0.247230734;
    1.023700575, 0.001661628, 0.521284240, -0.183867259, 0.354324187
    ];

% ---- 8 kHz constants ---------------------------------------------------
Sp_8k = 2.764344e-5;
Sl_8k = 1.866055e-1;
fs_8k= 8000;
Downsample_8k = 32;
Align_Nfft_8k = 512;
InIIR_Nsos_8k = 8;
InIIR_Hsos_8k = [
    0.885535424, -0.885535424, 0.000000000, -0.771070709, 0.000000000;
    0.895092588, 1.292907193, 0.449260174, 1.268869037, 0.442025372;
    4.049527940, -7.865190042, 3.815662102, -1.746859852, 0.786305963;
    0.500002353, -0.500002353, 0.000000000, 0.000000000, 0.000000000;
    0.565002834, -0.241585934, -0.306009671, 0.259688659, 0.249979657;
    2.115237288, 0.919935084, 1.141240051, -1.587313419, 0.665935315;
    0.912224584, -0.224397719, -0.641121413, -0.246029464, -0.556720590;
    0.444617727, -0.307589321, 0.141638062, -0.996391149, 0.502251622
    ];

% ---- 8 kHz per-band tables (42 bands) ----------------------------------
nr_of_hz_bands_per_bark_band_8k = [
    1, 1, 1, 1, 1, 1, 1, 1, 2, 1, ...
    1, 1, 1, 1, 2, 1, 1, 2, 2, 2, ...
    2, 2, 2, 2, 2, 3, 3, 3, 3, 4, ...
    3, 4, 5, 4, 5, 6, 6, 7, 8, 9, ...
    9, 11
    ];

centre_of_band_bark_8k = [
    0.078672, 0.316341, 0.636559, 0.961246, 1.290450, ...
    1.624217, 1.962597, 2.305636, 2.653383, 3.005889, ...
    3.363201, 3.725371, 4.092449, 4.464486, 4.841533, ...
    5.223642, 5.610866, 6.003256, 6.400869, 6.803755, ...
    7.211971, 7.625571, 8.044611, 8.469146, 8.899232, ...
    9.334927, 9.776288, 10.223374, 10.676242, 11.134952,...
    11.599563, 12.070135, 12.546731, 13.029408, 13.518232,...
    14.013264, 14.514566, 15.022202, 15.536238, 16.056736,...
    16.583761, 17.117382
    ];

centre_of_band_hz_8k = [
    7.867213, 31.634144, 63.655895, 96.124611, 129.044968,...
    162.421738, 196.259659, 230.563568, 265.338348, 300.588867,...
    336.320129, 372.537140, 409.244934, 446.448578, 484.568604,...
    526.600586, 570.303833, 619.423340, 672.121643, 728.525696,...
    785.675964, 846.835693, 909.691650, 977.063293, 1049.861694,...
    1129.635986, 1217.257568, 1312.109497, 1412.501465, 1517.999390,...
    1628.894165, 1746.194336, 1871.568848, 2008.776123, 2158.979248,...
    2326.743164, 2513.787109, 2722.488770, 2952.586670, 3205.835449,...
    3492.679932, 3820.219238
    ];

width_of_band_bark_8k = [
    0.157344, 0.317994, 0.322441, 0.326934, 0.331474, ...
    0.336061, 0.340697, 0.345381, 0.350114, 0.354897, ...
    0.359729, 0.364611, 0.369544, 0.374529, 0.379565, ...
    0.384653, 0.389794, 0.394989, 0.400236, 0.405538, ...
    0.410894, 0.416306, 0.421773, 0.427297, 0.432877, ...
    0.438514, 0.444209, 0.449962, 0.455774, 0.461645, ...
    0.467577, 0.473569, 0.479621, 0.485736, 0.491912, ...
    0.498151, 0.504454, 0.510819, 0.517250, 0.523745, ...
    0.530308, 0.536934
    ];

width_of_band_hz_8k = [
    15.734426, 31.799433, 32.244064, 32.693359, 33.147385, ...
    33.606140, 34.069702, 34.538116, 35.011429, 35.489655, ...
    35.972870, 36.461121, 36.954407, 37.452911, 40.269653, ...
    42.311859, 45.992554, 51.348511, 55.040527, 56.775208, ...
    58.699402, 62.445862, 64.820923, 69.195374, 76.745667, ...
    84.016235, 90.825684, 97.931152, 103.348877, 107.801880, ...
    113.552246, 121.490601, 130.420410, 143.431763, 158.486816, ...
    176.872803, 198.314697, 219.549561, 240.600098, 268.702393, ...
    306.060059, 349.937012
    ];

pow_dens_correction_factor_8k = [
    100.000000, 99.999992, 100.000000, 100.000008, 100.000008,...
    100.000015, 99.999992, 99.999969, 50.000027, 100.000000,...
    99.999969, 100.000015, 99.999947, 100.000061, 53.047077, ...
    110.000046, 117.991989, 65.000000, 68.760147, 69.999931, ...
    71.428818, 75.000038, 76.843384, 80.968781, 88.646126, ...
    63.864388, 68.155350, 72.547775, 75.584831, 58.379192,...
    80.950836, 64.135651, 54.384785, 73.821884, 64.437073, ...
    59.176456, 65.521278, 61.399822, 58.144047, 57.004543,...
    64.126297, 59.248363
    ];

abs_thresh_power_8k = [
    51286152, 2454709.500, 70794.593750, ...
    4897.788574, 1174.897705, 389.045166, ...
    104.712860, 45.708820, 17.782795, ...
    9.772372, 4.897789, 3.090296, ...
    1.905461, 1.258925, 0.977237, ...
    0.724436, 0.562341, 0.457088, ...
    0.389045, 0.331131, 0.295121, ...
    0.269153, 0.257040, 0.251189, ...
    0.251189, 0.251189, 0.251189, ...
    0.263027, 0.288403, 0.309030, ...
    0.338844, 0.371535, 0.398107, ...
    0.436516, 0.467735, 0.489779, ...
    0.501187, 0.501187, 0.512861, ...
    0.524807, 0.524807, 0.524807
    ];

% ---- 16 kHz per-band tables (49 bands) ---------------------------------
nr_of_hz_bands_per_bark_band_16k = [
    1, 1, 1, 1, 1, 1, 1, 1, 2, 1, ...
    1, 1, 1, 1, 2, 1, 1, 2, 2, 2, ...
    2, 2, 2, 2, 2, 3, 3, 3, 3, 4, ...
    3, 4, 5, 4, 5, 6, 6, 7, 8, 9, ...
    9, 12, 12, 15, 16, 18, 21, 25, 20
    ];

centre_of_band_bark_16k = [
    0.078672, 0.316341, 0.636559, 0.961246, 1.290450, ...
    1.624217, 1.962597, 2.305636, 2.653383, 3.005889, ...
    3.363201, 3.725371, 4.092449, 4.464486, 4.841533, ...
    5.223642, 5.610866, 6.003256, 6.400869, 6.803755, ...
    7.211971, 7.625571, 8.044611, 8.469146, 8.899232, ...
    9.334927, 9.776288, 10.223374, 10.676242, 11.134952, ...
    11.599563, 12.070135, 12.546731, 13.029408, 13.518232, ...
    14.013264, 14.514566, 15.022202, 15.536238, 16.056736, ...
    16.583761, 17.117382, 17.657663, 18.204674, 18.758478, ...
    19.319147, 19.886751, 20.461355, 21.043034
    ];

centre_of_band_hz_16k = [
    7.867213, 31.634144, 63.655895, 96.124611, 129.044968,...
    162.421738, 196.259659, 230.563568, 265.338348, 300.588867,...
    336.320129, 372.537140, 409.244934, 446.448578, 484.568604,...
    526.600586, 570.303833, 619.423340, 672.121643, 728.525696,...
    785.675964, 846.835693, 909.691650, 977.063293, 1049.861694,...
    1129.635986, 1217.257568, 1312.109497, 1412.501465, 1517.999390,...
    1628.894165, 1746.194336, 1871.568848, 2008.776123, 2158.979248,...
    2326.743164, 2513.787109, 2722.488770, 2952.586670, 3205.835449,...
    3492.679932, 3820.219238, 4193.938477, 4619.846191, 5100.437012,...
    5636.199219, 6234.313477, 6946.734863, 7796.473633
    ];

width_of_band_bark_16k = [
    0.157344, 0.317994, 0.322441, 0.326934, 0.331474,...
    0.336061, 0.340697, 0.345381, 0.350114, 0.354897,...
    0.359729, 0.364611, 0.369544, 0.374529, 0.379565,...
    0.384653, 0.389794, 0.394989, 0.400236, 0.405538,...
    0.410894, 0.416306, 0.421773, 0.427297, 0.432877,...
    0.438514, 0.444209, 0.449962, 0.455774, 0.461645,...
    0.467577, 0.473569, 0.479621, 0.485736, 0.491912,...
    0.498151, 0.504454, 0.510819, 0.517250, 0.523745,...
    0.530308, 0.536934, 0.543629, 0.550390, 0.557220,...
    0.564119, 0.571085, 0.578125, 0.585232
    ];

width_of_band_hz_16k = [
    15.734426, 31.799433, 32.244064, 32.693359, ...
    33.147385, 33.606140, 34.069702, 34.538116, ...
    35.011429, 35.489655, 35.972870, 36.461121, ...
    36.954407, 37.452911, 40.269653, 42.311859, ...
    45.992554, 51.348511, 55.040527, 56.775208, ...
    58.699402, 62.445862, 64.820923, 69.195374, ...
    76.745667, 84.016235, 90.825684, 97.931152, ...
    103.348877, 107.801880, 113.552246, 121.490601, ...
    130.420410, 143.431763, 158.486816, 176.872803, ...
    198.314697, 219.549561, 240.600098, 268.702393, ...
    306.060059, 349.937012, 398.686279, 454.713867, ...
    506.841797, 564.863770, 637.261230, 794.717285, ...
    931.068359
    ];

pow_dens_correction_factor_16k = [
    100.000000, 99.999992, 100.000000, 100.000008,...
    100.000008, 100.000015, 99.999992, 99.999969, ...
    50.000027, 100.000000, 99.999969, 100.000015, ...
    99.999947, 100.000061, 53.047077, 110.000046, ...
    117.991989, 65.000000, 68.760147, 69.999931, ...
    71.428818, 75.000038, 76.843384, 80.968781, ...
    88.646126, 63.864388, 68.155350, 72.547775, ...
    75.584831, 58.379192, 80.950836, 64.135651, ...
    54.384785, 73.821884, 64.437073, 59.176456, ...
    65.521278, 61.399822, 58.144047, 57.004543, ...
    64.126297, 54.311001, 61.114979, 55.077751, ...
    56.849335, 55.628868, 53.137054, 54.985844, ...
    79.546974
    ];

abs_thresh_power_16k = [
    51286152.00, 2454709.500, 70794.593750, ...
    4897.788574, 1174.897705, 389.045166, ...
    104.712860, 45.708820, 17.782795, ...
    9.772372, 4.897789, 3.090296, ...
    1.905461, 1.258925, 0.977237, ...
    0.724436, 0.562341, 0.457088, ...
    0.389045, 0.331131, 0.295121, ...
    0.269153, 0.257040, 0.251189, ...
    0.251189, 0.251189, 0.251189, ...
    0.263027, 0.288403, 0.309030, ...
    0.338844, 0.371535, 0.398107, ...
    0.436516, 0.467735, 0.489779, ...
    0.501187, 0.501187, 0.512861, ...
    0.524807, 0.524807, 0.524807, ...
    0.512861, 0.478630, 0.426580, ...
    0.371535, 0.363078, 0.416869, ...
    0.537032
    ];

% ---- select the set matching the requested sampling rate ---------------
if (sampling_rate== fs_16k)
    Downsample = Downsample_16k;
    InIIR_Hsos = InIIR_Hsos_16k;
    InIIR_Nsos = InIIR_Nsos_16k;
    Align_Nfft = Align_Nfft_16k;
    Fs= fs_16k;

    Nb = 49;
    Sl = Sl_16k;
    Sp = Sp_16k;
    nr_of_hz_bands_per_bark_band = nr_of_hz_bands_per_bark_band_16k;
    centre_of_band_bark = centre_of_band_bark_16k;
    centre_of_band_hz = centre_of_band_hz_16k;
    width_of_band_bark = width_of_band_bark_16k;
    width_of_band_hz = width_of_band_hz_16k;
    pow_dens_correction_factor = pow_dens_correction_factor_16k;
    abs_thresh_power = abs_thresh_power_16k;

    return;
end

if (sampling_rate== fs_8k)
    Downsample = Downsample_8k;
    InIIR_Hsos = InIIR_Hsos_8k;
    InIIR_Nsos = InIIR_Nsos_8k;
    Align_Nfft = Align_Nfft_8k;
    Fs= fs_8k;

    Nb = 42;
    Sl = Sl_8k;
    Sp = Sp_8k;
    nr_of_hz_bands_per_bark_band = nr_of_hz_bands_per_bark_band_8k;
    centre_of_band_bark = centre_of_band_bark_8k;
    centre_of_band_hz = centre_of_band_hz_8k;
    width_of_band_bark = width_of_band_bark_8k;
    width_of_band_hz = width_of_band_hz_8k;
    pow_dens_correction_factor = pow_dens_correction_factor_8k;
    abs_thresh_power = abs_thresh_power_8k;
    return;
end
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,390 @@
|
||||
function split_align( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ...
    deg_data, deg_Nsamples, deg_VAD, deg_logVAD, ...
    Utt_Start_l, Utt_SpeechStart, Utt_SpeechEnd, Utt_End_l, ...
    Utt_DelayEst_l, Utt_DelayConf_l)
% SPLIT_ALIGN  Look for the best point at which to split one utterance
% into two parts that have different delays.
%
% Inputs
%   ref_data, deg_data       sample vectors; Align_Nfft-long slices of them
%                            are multiplied element-wise by the global
%                            Window, so they must have Window's orientation
%   ref_Nsamples             only forwarded to crude_align
%   deg_Nsamples             bounds the slices taken from deg_data
%   ref_logVAD, deg_logVAD   only forwarded to crude_align
%   ref_VAD, deg_VAD         not used by this function
%   Utt_Start_l, Utt_End_l   utterance limits (1-based; converted to sample
%                            indices via (x- 1)* Downsample+ 1)
%   Utt_SpeechStart, Utt_SpeechEnd
%                            limits between which break points are placed
%   Utt_DelayEst_l           crude delay estimate used to seed crude_align
%   Utt_DelayConf_l          confidence both halves must exceed
%
% Outputs (through globals)
%   Best_DC1 and Best_DC2 are always reset to 0.  Best_ED1, Best_D1,
%   Best_DC1, Best_ED2, Best_D2, Best_DC2 and Best_BP are overwritten only
%   when some break point passes the selection test at the bottom of this
%   function.  Slot MAXNUTTERANCES of Utt_DelayEst, UttSearch_Start and
%   UttSearch_End is used as scratch; Utt_Delay(MAXNUTTERANCES) is read
%   back after each crude_align call.

global MAXNUTTERANCES Align_Nfft Downsample Window
global Utt_DelayEst Utt_Delay UttSearch_Start UttSearch_End
global Best_ED1 Best_D1 Best_DC1 Best_ED2 Best_D2 Best_DC2 Best_BP

MAX_BPS = 41;      % capacity of the per-break-point tables
UNSET   = -2.0;    % marks a confidence that still has to be computed

Utt_BPs = zeros( 1, MAX_BPS);
Utt_ED1 = zeros( 1, MAX_BPS);
Utt_ED2 = zeros( 1, MAX_BPS);
Utt_D1  = zeros( 1, MAX_BPS);
Utt_D2  = zeros( 1, MAX_BPS);
Utt_DC1 = zeros( 1, MAX_BPS);
Utt_DC2 = zeros( 1, MAX_BPS);

Utt_Len  = Utt_SpeechEnd - Utt_SpeechStart;
Utt_Test = MAXNUTTERANCES;      % scratch slot in the global utterance arrays
Best_DC1 = 0.0;
Best_DC2 = 0.0;
kernel   = Align_Nfft / 64;
Delta    = Align_Nfft / (4 * Downsample);
Step     = floor( ((0.801 * Utt_Len + 40 * Delta - 1)/(40 * Delta)));
Step     = Step* Delta;

Pad = floor( Utt_Len / 10);
if( Pad < 75 )
    Pad = 75;
end

% ---- lay out candidate break points, Step apart, Pad away from the ends
Utt_BPs(1) = Utt_SpeechStart + Pad;
N_BPs = 1;
while( 1)
    N_BPs = N_BPs+ 1;
    Utt_BPs(N_BPs) = Utt_BPs(N_BPs- 1)+ Step;
    if (~((Utt_BPs(N_BPs) <= (Utt_SpeechEnd- Pad)) && (N_BPs <= MAX_BPS- 1) ))
        break;
    end
end

if( N_BPs <= 1 )
    return;
end

% ---- crude delay estimate of the part before (ED1) and after (ED2) each
% ---- break point, obtained by running crude_align on the scratch slot
for bp = 1: N_BPs- 1
    Utt_DelayEst(Utt_Test)    = Utt_DelayEst_l;
    UttSearch_Start(Utt_Test) = Utt_Start_l;
    UttSearch_End(Utt_Test)   = Utt_BPs(bp);

    crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, ...
        deg_Nsamples, MAXNUTTERANCES);
    Utt_ED1(bp) = Utt_Delay(Utt_Test);

    Utt_DelayEst(Utt_Test)    = Utt_DelayEst_l;
    UttSearch_Start(Utt_Test) = Utt_BPs(bp);
    UttSearch_End(Utt_Test)   = Utt_End_l;

    crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, ...
        deg_Nsamples, MAXNUTTERANCES);
    Utt_ED2(bp) = Utt_Delay(Utt_Test);
end

% ---- forward pass: fine delay/confidence of the part BEFORE each break
% ---- point.  Break points that share a crude delay share one histogram,
% ---- which keeps growing as the reference limit moves to later points.
Utt_DC1(1: N_BPs-1) = UNSET;
while( 1 )
    % first break point whose confidence is still unset
    bp = find( ~(Utt_DC1(1: N_BPs- 1) > UNSET), 1, 'first');
    if( isempty( bp) )
        break;
    end

    estdelay = Utt_ED1(bp);
    H    = zeros( 1, Align_Nfft);
    Hsum = 0.0;

    startr = (Utt_Start_l- 1) * Downsample+ 1;
    startd = startr + estdelay;
    % MATLAB indices start at 1, so startd == 0 must be clamped as well
    % (the previous test "startd < 0" let index 0 reach deg_data).
    if ( startd < 1 )
        startr = -estdelay+ 1;
        startd = 1;
    end

    for q = bp: N_BPs- 1
        if( (q > bp) && ...
                ~((Utt_ED1(q) == estdelay) && (Utt_DC1(q) <= UNSET)) )
            continue;   % different crude delay, or already computed
        end
        ref_limit = (Utt_BPs(q)- 1)* Downsample+ 1;
        [H, Hsum, startr, startd] = split_align_accumulate( H, Hsum, ...
            ref_data, deg_data, startr, startd, +1, ref_limit, ...
            deg_Nsamples, kernel);
        [Utt_D1(q), Utt_DC1(q)] = split_align_peak( H, Hsum, estdelay);
    end
end

% ---- backward pass is only needed where the forward confidence beats
% ---- the confidence of the unsplit utterance
for bp = 1: N_BPs- 1
    if( Utt_DC1(bp) > Utt_DelayConf_l )
        Utt_DC2(bp) = UNSET;
    else
        Utt_DC2(bp) = 0.0;
    end
end

% ---- backward pass: fine delay/confidence of the part AFTER each break
% ---- point, walking frames from the utterance end towards the break point
while( 1 )
    % last break point whose confidence is still unset
    bp = find( ~(Utt_DC2(1: N_BPs- 1) > UNSET), 1, 'last');
    if( isempty( bp) )
        break;
    end

    estdelay = Utt_ED2(bp);
    H    = zeros( 1, Align_Nfft);
    Hsum = 0.0;

    startr = (Utt_End_l- 1)* Downsample+ 1- Align_Nfft;
    startd = startr + estdelay;
    % keep the first (right-most) degraded frame inside deg_data
    if ( (startd + Align_Nfft) > deg_Nsamples+ 1 )
        startd = deg_Nsamples - Align_Nfft+ 1;
        startr = startd - estdelay;
    end

    for q = bp: -1: 1
        if( (q < bp) && ...
                ~((Utt_ED2(q) == estdelay) && (Utt_DC2(q) <= UNSET)) )
            continue;   % different crude delay, or already computed
        end
        ref_limit = (Utt_BPs(q)- 1)* Downsample+ 1;
        [H, Hsum, startr, startd] = split_align_accumulate( H, Hsum, ...
            ref_data, deg_data, startr, startd, -1, ref_limit, ...
            deg_Nsamples, kernel);
        [Utt_D2(q), Utt_DC2(q)] = split_align_peak( H, Hsum, estdelay);
    end
end

% ---- keep the break point with the highest summed confidence among those
% ---- whose two delays differ by at least Downsample samples and whose two
% ---- confidences both exceed the unsplit confidence
for bp = 1: N_BPs- 1
    if( (abs(Utt_D2(bp) - Utt_D1(bp)) >= Downsample) && ...
            ((Utt_DC1(bp)+ Utt_DC2(bp))> (Best_DC1 + Best_DC2)) &&...
            (Utt_DC1(bp) > Utt_DelayConf_l) && ...
            (Utt_DC2(bp) > Utt_DelayConf_l) )
        Best_ED1 = Utt_ED1(bp);
        Best_D1  = Utt_D1(bp);
        Best_DC1 = Utt_DC1(bp);
        Best_ED2 = Utt_ED2(bp);
        Best_D2  = Utt_D2(bp);
        Best_DC2 = Utt_DC2(bp);
        Best_BP  = Utt_BPs(bp);
    end
end

% Debug dump of the per-break-point tables (kept disabled):
% fid= fopen( 'matmat.txt', 'wt');
% fprintf( fid, 'N_BPs is %d\n', N_BPs);
% fprintf( fid, 'Utt_DelayConf is %f\n', Utt_DelayConf_l);
% fprintf( fid, 'ED2\t ED1\t D2\t D1\t DC2\t DC1\t BPs\n');
% for bp= 1: N_BPs- 1
%     fprintf( fid, '%d\t %d\t %d\t %d\t %f\t %f\t %d\n', Utt_ED2( bp), ...
%         Utt_ED1( bp), Utt_D2(bp), Utt_D1(bp), Utt_DC2(bp),...
%         Utt_DC1( bp), Utt_BPs( bp));
% end
% fclose( fid);


function [H, Hsum, startr, startd] = split_align_accumulate( H, Hsum, ...
    ref_data, deg_data, startr, startd, direction, ref_limit, ...
    deg_Nsamples, kernel)
% Walk Align_Nfft-long frames in steps of Align_Nfft/4 and add the peaks
% of each frame's windowed circular cross-correlation to histogram H.
%   direction > 0 : move forward while the frame ends within deg_data and
%                   at or before ref_limit in ref_data
%   otherwise     : move backward while startd >= 1 and startr >= ref_limit
% Returns the updated histogram, its normaliser Hsum and the frame start
% positions at which the walk stopped, so a later call can continue.

global Align_Nfft Window

hop = direction* (Align_Nfft / 4);

while( 1)
    if( direction > 0 )
        in_range = ((startd + Align_Nfft) <= 1+ deg_Nsamples) && ...
            ((startr + Align_Nfft) <= ref_limit);
    else
        in_range = (startd >= 1) && (startr >= ref_limit);
    end
    if( ~in_range )
        break;
    end

    X1 = ref_data( startr: startr+ Align_Nfft- 1).* Window;
    X2 = deg_data( startd: startd+ Align_Nfft- 1).* Window;

    % magnitude of the circular cross-correlation of the two frames
    xc = abs( ifft( conj( fft( X1, Align_Nfft)).* fft( X2, Align_Nfft), ...
        Align_Nfft));

    v_max = max( xc)* 0.99;
    n_max = (v_max^ 0.125)/ kernel;

    % every bin above 99% of the maximum adds a triangle of half-width
    % kernel to H, wrapped around circularly
    peaks = find( xc > v_max);
    for count = peaks(:).'- 1
        Hsum = Hsum+ n_max* kernel;
        for k = 1- kernel: kernel- 1
            idx = 1+ rem( count+ k+ Align_Nfft, Align_Nfft);
            H( idx) = H( idx)+ n_max* (kernel- abs( k));
        end
    end

    startr = startr+ hop;
    startd = startd+ hop;
end


function [delay, conf] = split_align_peak( H, Hsum, estdelay)
% Turn the histogram peak into a delay and a confidence.  A peak in the
% upper half of H is interpreted as a negative lag; the confidence is the
% peak height divided by Hsum (0 when nothing was accumulated).

global Align_Nfft

[v_max, I_max] = max( H);
if( I_max- 1 >= (Align_Nfft/2) )
    I_max = I_max- Align_Nfft;
end

delay = estdelay + I_max- 1;
if( Hsum > 0.0 )
    conf = v_max / Hsum;
else
    conf = 0.0;
end
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,76 @@
|
||||
function time_align(ref_data, ref_Nsamples, ...
    deg_data, deg_Nsamples, Utt_id)
% TIME_ALIGN  Refine the delay estimate of one search window / utterance.
%
% Starting from the crude estimate Utt_DelayEst(Utt_id), frames of
% Align_Nfft samples (hop Align_Nfft/4) are taken from ref_data and
% deg_data, windowed with the global Window and cross-correlated.  The
% correlation peaks are collected in a histogram, the histogram is smoothed
% with a triangular kernel and its maximum gives the result.
%
% Inputs
%   ref_data, deg_data   sample vectors (same orientation as Window)
%   ref_Nsamples         not used by this function
%   deg_Nsamples         bounds the frames taken from deg_data
%   Utt_id               index into the global per-utterance arrays
%
% Outputs (through globals)
%   Utt_Delay(Utt_id)      refined delay
%   Utt_DelayConf(Utt_id)  height of the normalised histogram peak

global Utt_DelayEst Utt_Delay Utt_DelayConf UttSearch_Start UttSearch_End
global Align_Nfft Downsample Window

estdelay = Utt_DelayEst(Utt_id);

H = zeros( 1, Align_Nfft);

startr = (UttSearch_Start(Utt_id)- 1)* Downsample+ 1;
startd = startr + estdelay;
% MATLAB indices start at 1, so startd == 0 must be clamped as well
% (the previous test "startd < 0" let index 0 reach deg_data).
if ( startd < 1 )
    startr = 1 -estdelay;
    startd = 1;
end

% NOTE(review): both bounds below are one sample stricter than the
% equivalent bounds in split_align ("<= 1+ deg_Nsamples" and
% "<= (x- 1)* Downsample+ 1").  Left unchanged because relaxing them
% changes which frames are analysed - confirm which is intended.
while( ((startd + Align_Nfft) <= deg_Nsamples) && ...
        ((startr + Align_Nfft) <= ((UttSearch_End(Utt_id)- 1) * Downsample)) )
    X1 = ref_data( startr: startr+ Align_Nfft- 1).* Window;
    X2 = deg_data( startd: startd+ Align_Nfft- 1).* Window;

    % magnitude of the circular cross-correlation between X1 and X2
    X1 = abs( ifft( conj( fft( X1, Align_Nfft )).* ...
        fft( X2, Align_Nfft ), Align_Nfft ));

    v_max = max( X1)* 0.99;

    % every bin above 99% of the maximum votes with weight v_max^0.125
    X1_greater_vmax = find( X1 > v_max );
    H( X1_greater_vmax ) = H( X1_greater_vmax )+ v_max^ 0.125;

    startr = startr+ Align_Nfft/ 4;
    startd = startd+ Align_Nfft/ 4;
end

Hsum = sum( H);

% symmetric triangular smoothing kernel, centred on lag 0 (index 1) and
% wrapped around the end of the Align_Nfft-long buffer
kernel = Align_Nfft / 64;
X2 = 0;
X2(1) = 1.0;
for count = 2: kernel
    X2( count) = 1- (count- 1)/ kernel;
    X2( Align_Nfft- count+ 2) = 1- (count- 1)/ kernel;
end

% circular convolution of the histogram with the kernel
X1 = ifft( fft( H, Align_Nfft ).* fft( X2, Align_Nfft ), Align_Nfft );

if (Hsum> 0)
    H = abs( X1)/ Hsum;
else
    H = 0;      % nothing accumulated: zero delay offset, zero confidence
end

[v_max, I_max] = max( H);
% peaks in the upper half of the buffer stand for negative lags
if( I_max- 1 >= (Align_Nfft/2) )
    I_max = I_max- Align_Nfft;
end

Utt_Delay(Utt_id) = estdelay + I_max- 1;
Utt_DelayConf(Utt_id) = v_max; % confidence
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,26 @@
|
||||
function utterance_locate (ref_data, ref_Nsamples, ref_VAD, ref_logVAD,...
    deg_data, deg_Nsamples, deg_VAD, deg_logVAD);
% UTTERANCE_LOCATE  Drive the utterance alignment stages in order:
%   1. id_searchwindows
%   2. crude_align followed by time_align, once per index 1..Nutterances
%   3. id_utterances
%   4. utterance_split
% All results are exchanged through global variables; nothing is returned.

global Nutterances Utt_Delay Utt_DelayConf Utt_Start Utt_End Utt_DelayEst

% stage 1
id_searchwindows( ref_VAD, ref_Nsamples, deg_VAD, deg_Nsamples);

% stage 2 - the count is read once, before the first iteration
n_windows = Nutterances;
for w = 1: n_windows
    crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, deg_Nsamples, w);
    time_align( ref_data, ref_Nsamples, deg_data, deg_Nsamples, w);
end

% stage 3
id_utterances( ref_Nsamples, ref_VAD, deg_Nsamples);

% stage 4
utterance_split( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ...
    deg_data, deg_Nsamples, deg_VAD, deg_logVAD);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,122 @@
|
||||
function utterance_split( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ...
    deg_data, deg_Nsamples, deg_VAD, deg_logVAD)
% UTTERANCE_SPLIT  Repeatedly try to split utterances into two parts with
% different delays.
%
% For each utterance the span between the first and last frame with
% ref_VAD > 0 is measured.  Spans of at least 200 frames are handed to
% split_align; when both halves it reports are more confident than the
% unsplit utterance, the utterance tables are shifted up by one and the
% utterance is replaced by the two halves.  The first half is then examined
% again (Utt_id is not advanced after a split).
%
% Inputs are forwarded to split_align unchanged; only ref_VAD and
% deg_Nsamples are read directly here.
%
% Outputs (through globals): Nutterances, Utt_DelayEst, Utt_Delay,
% Utt_DelayConf, Utt_Start, Utt_End, UttSearch_Start, UttSearch_End and
% Largest_uttsize.

global Nutterances MAXNUTTERANCES Downsample SEARCHBUFFER
global Utt_DelayEst Utt_Delay Utt_DelayConf UttSearch_Start
global Utt_Start Utt_End Largest_uttsize UttSearch_End
global Best_ED1 Best_D1 Best_DC1 Best_ED2 Best_D2 Best_DC2 Best_BP

Utt_id = 1;
% Strictly fewer than MAXNUTTERANCES utterances are required: split_align
% uses slot MAXNUTTERANCES of the global tables as scratch space, and a
% split adds one entry.  (The previous "<=" allowed a split when the table
% was already full.)
while( (Utt_id <= Nutterances) && (Nutterances < MAXNUTTERANCES) )
    Utt_DelayEst_l  = Utt_DelayEst(Utt_id);
    Utt_Delay_l     = Utt_Delay(Utt_id);
    Utt_DelayConf_l = Utt_DelayConf(Utt_id);
    Utt_Start_l     = Utt_Start(Utt_id);
    Utt_End_l       = Utt_End(Utt_id);

    % first frame of the utterance with ref_VAD > 0
    Utt_SpeechStart = Utt_Start_l;
    while( (Utt_SpeechStart < Utt_End_l) && ...
            (ref_VAD(Utt_SpeechStart)<= 0.0) )
        Utt_SpeechStart = Utt_SpeechStart + 1;
    end

    % one past the last frame of the utterance with ref_VAD > 0
    Utt_SpeechEnd = Utt_End_l;
    while( (Utt_SpeechEnd > Utt_Start_l) && ...
            (ref_VAD(Utt_SpeechEnd) <= 0))
        Utt_SpeechEnd = Utt_SpeechEnd- 1;
    end
    Utt_SpeechEnd = Utt_SpeechEnd+ 1;

    Utt_Len = Utt_SpeechEnd - Utt_SpeechStart;

    % only long utterances are candidates for a split
    was_split = false;
    if( Utt_Len >= 200 )
        split_align( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ...
            deg_data, deg_Nsamples, deg_VAD, deg_logVAD, ...
            Utt_Start_l, Utt_SpeechStart, Utt_SpeechEnd, Utt_End_l, ...
            Utt_DelayEst_l, Utt_DelayConf_l);

        if( (Best_DC1 > Utt_DelayConf_l) && (Best_DC2 > Utt_DelayConf_l) )
            was_split = true;

            % make room: move every later utterance up by one slot
            for step = Nutterances: -1: Utt_id+ 1
                Utt_DelayEst(step+ 1)    = Utt_DelayEst(step);
                Utt_Delay(step+ 1)       = Utt_Delay(step);
                Utt_DelayConf(step+ 1)   = Utt_DelayConf(step);
                Utt_Start(step+ 1)       = Utt_Start(step);
                Utt_End(step+ 1)         = Utt_End(step);
                UttSearch_Start(step+ 1) = Utt_Start( step);
                UttSearch_End(step+ 1)   = Utt_End( step);
            end

            Nutterances = Nutterances+ 1;

            % first half
            Utt_DelayEst(Utt_id)  = Best_ED1;
            Utt_Delay(Utt_id)     = Best_D1;
            Utt_DelayConf(Utt_id) = Best_DC1;

            % second half
            Utt_DelayEst(Utt_id +1)  = Best_ED2;
            Utt_Delay(Utt_id +1)     = Best_D2;
            Utt_DelayConf(Utt_id +1) = Best_DC2;

            UttSearch_Start(Utt_id +1) = UttSearch_Start(Utt_id);
            UttSearch_End(Utt_id +1)   = UttSearch_End( Utt_id);

            if( Best_D2 < Best_D1 )
                % the two halves meet exactly at the break point
                Utt_Start(Utt_id)    = Utt_Start_l;
                Utt_End(Utt_id)      = Best_BP;
                Utt_Start(Utt_id +1) = Best_BP;
                Utt_End(Utt_id +1)   = Utt_End_l;
            else
                % the halves overlap around the break point by the delay
                % difference expressed in frames
                Utt_Start( Utt_id)    = Utt_Start_l;
                Utt_End( Utt_id)      = Best_BP + ...
                    floor( (Best_D2- Best_D1)/ (2 * Downsample));
                Utt_Start( Utt_id +1) = Best_BP - ...
                    floor( (Best_D2- Best_D1)/ (2 * Downsample));
                Utt_End( Utt_id +1)   = Utt_End_l;
            end

            % move the start of the first half forward when, shifted by
            % Best_D1, it would begin before the degraded signal
            if( (Utt_Start(Utt_id)- SEARCHBUFFER- 1)* Downsample+ 1+ ...
                    Best_D1 < 0 )
                Utt_Start(Utt_id) = SEARCHBUFFER+ 1+ ...
                    floor( (Downsample - 1 - Best_D1) / Downsample);
            end

            % pull the end of the second half back when, shifted by
            % Best_D2, it would run past the end of the degraded signal
            if( ((Utt_End( Utt_id +1)- 1)* Downsample+ 1 + Best_D2) >...
                    (deg_Nsamples - SEARCHBUFFER * Downsample) )
                Utt_End( Utt_id +1) = floor( (deg_Nsamples - Best_D2)...
                    / Downsample)- SEARCHBUFFER+ 1;
            end
        end
    end

    % after a split the first half is re-examined, so only advance when
    % nothing was split
    if( ~was_split )
        Utt_id = Utt_id+ 1;
    end
end

% Only the first Nutterances slots hold live utterances; entries beyond
% them may be stale and must not influence the result.  The leading 0
% gives 0 when there are no utterances.
utt_sizes = Utt_End( 1: Nutterances)- Utt_Start( 1: Nutterances);
Largest_uttsize = max( [0; utt_sizes(:)]);
|
||||
|
||||
% fid= fopen( 'uttinfo_mat.txt', 'wt');
|
||||
% fprintf( fid, 'Number of Utterances is:\n');
|
||||
% fprintf( fid, '%d\n', Nutterances);
|
||||
% fprintf( fid, 'Utterance Delay Estimation:\n');
|
||||
% fprintf( fid, '%d\n', Utt_DelayEst( 1: Nutterances) );
|
||||
% fprintf( fid, 'Utterance Delay:\n');
|
||||
% fprintf( fid, '%d\n', Utt_Delay( 1: Nutterances));
|
||||
% fprintf( fid, 'Utterance Delay Confidence:\n');
|
||||
% fprintf( fid, '%f\n', Utt_DelayConf( 1: Nutterances));
|
||||
% fprintf( fid, 'Utterance Start:\n');
|
||||
% fprintf( fid, '%d\n', Utt_Start( 1: Nutterances));
|
||||
% fprintf( fid, 'Utterance End:\n');
|
||||
% fprintf( fid, '%d\n', Utt_End( 1: Nutterances));
|
||||
% fprintf( fid, 'Largest utterance length:\n');
|
||||
% fprintf( fid, '%d\n', Largest_uttsize);
|
||||
% fclose( fid);
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user