Started code for DSP final project

This commit is contained in:
Aidan Sharpe 2024-04-25 18:38:09 -04:00
parent 824a46b1fd
commit 50a5e57e18
96 changed files with 2160495 additions and 28 deletions

View File

@ -0,0 +1,46 @@
%*********************ADD FROM NOISEX DATABASE *******************
% function [t] = addnoisex(sclean,snoise,snr,outfile)
% add noise from a file in noisex database to signal
% t - noisy signal - written in output wave file
% sclean - clean signal - read either as dat or wave file
% snoise - name of noise file, example: white for 'white.dat'
% - read in as dat or wave file
% snr - desired snr in db
% outfile - The output file is written as a wav file
% Example [t]=addnoisex('s.wav','street.dat',30,'s_noisy_snr30.wav')
%****************************************************
%
function [t] = addnoisex(sclean,snoise,snr,outfile)
% Read input clean signal and noise file
[s]=load_or_audioread(sclean);
[nfile]=load_or_audioread(snoise);
% Record length of speech signal and noise file
nspeech=length(s);
nns=length(nfile);
% Randomly select starting sample of noise file and
% read same number of samples as speech signal
start=ceil(rand()*(nns-nspeech+1));
finish=start+nspeech-1;
noise=nfile(start:finish);
% Calculate noise power and signal power
powernoise=norm(noise,2);
powersignal=norm(s,2);
% Adjust noise level for desired SNR
u=10^(snr/20);
powerdesirednoise=powersignal/u;
ratio=powerdesirednoise/powernoise;
noise=ratio*noise;
% Add the noise
t=s+noise;
% Display the achieved SNR
signaltonoise=20.0*log10(powersignal/norm(noise));
fprintf('Achieved SNR = %.2f dB\n', signaltonoise);
% Write as wave file
audiowrite(outfile,t,8000,'BitsPerSample',16);
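The scaling above works on L2 norms rather than powers: norm(x,2)^2 is the energy of x, so 20*log10 of a norm ratio equals the SNR in dB, which is why the desired noise level is powersignal/10^(snr/20). A minimal round-trip check (a sketch only; it assumes s.wav and white.dat are on the MATLAB path, and the file names are just placeholders taken from the header comments):

% hypothetical round-trip check of addnoisex; file names are assumptions
addnoisex('s.wav', 'white.dat', 10, 's_noisy_snr10.wav');
[s, fs]  = audioread('s.wav');             % clean reference
[y, fsn] = audioread('s_noisy_snr10.wav'); % noisy output (written at 8 kHz)
n = y(1:length(s)) - s;                    % recover the added noise
achieved = 20*log10(norm(s)/norm(n));      % should be close to 10 dB, up to 16-bit quantization
fprintf('achieved SNR = %.2f dB\n', achieved);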

View File

@ -0,0 +1,19 @@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Either loads an ASCII .dat file or
% uses audioread to read a wave file
% function [speechData] = load_or_audioread(speechFile)
% speechfile in quotes
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function [speechData] = load_or_audioread(speechFile)
% The speech file is loaded
if (ischar(speechFile))
if(strcmpi(speechFile(end-3:end),'.dat'))
speechData = load(speechFile);
elseif(strcmpi(speechFile(end-3:end),'.wav'))
speechData = audioread(speechFile);
else
error('load_or_audioread: unsupported file extension in %s', speechFile);
end
elseif (isnumeric(speechFile))
speechData = speechFile;
else
error('load_or_audioread: input must be a file name or a numeric vector');
end

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,26 @@
function mod_data= DC_block( data, Nsamples)
global Downsample DATAPADDING_MSECS SEARCHBUFFER
ofs= SEARCHBUFFER* Downsample;
mod_data= data;
%compute dc component; note it divides by Nsamples rather than by the number of summed samples
facc= sum( data( ofs+ 1: Nsamples- ofs))/ Nsamples;
mod_data( ofs+ 1: Nsamples- ofs)= data( ofs+ 1: Nsamples- ofs)- facc;
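% taper the first and last Downsample samples of the DC-blocked region with a
% linear ramp (0.5/Downsample ... (Downsample-0.5)/Downsample) to avoid edge discontinuities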
mod_data( ofs+ 1: ofs+ Downsample)= mod_data( ofs+ 1: ofs+ Downsample).* ...
( 0.5+ (0: Downsample- 1))/ Downsample;
mod_data( Nsamples- ofs: -1: Nsamples- ofs-Downsample+ 1)= ...
mod_data( Nsamples- ofs: -1: Nsamples- ofs-Downsample+ 1).* ...
( 0.5+ (0: Downsample- 1))/ Downsample;

View File

@ -0,0 +1,53 @@
function Y= FFTNXCorr( ref_VAD, startr, nr, deg_VAD, startd, nd)
% this function has other simple implementations, current implementation is
% consistent with the C version
% one way to do this (in time domain) =====
x1= ref_VAD( startr: startr+ nr- 1);
x2= deg_VAD( startd: startd+ nd- 1);
x1= fliplr( x1);
Y= conv( x2, x1);
% done =====
% % the other way to do this (in freq domain)===
% Nx= 2^ (ceil( log2( max( nr, nd))));
% x1= zeros( 1, 2* Nx);
% x2= zeros( 1, 2* Nx);
% x1( 1: nr)= fliplr( ref_VAD( startr: startr+ nr- 1));
% x2( 1: nd)= deg_VAD( startd: startd+ nd- 1);
%
% if (nr== 491)
% fid= fopen( 'mat_debug.txt', 'wt');
% fprintf( fid, '%f\n', x1);
% fclose( fid);
% end
%
% x1_fft= fft( x1, 2* Nx);
% x2_fft= fft( x2, 2* Nx);
%
% tmp1= ifft( x1_fft.* x2_fft, 2* Nx);
%
% Ny= nr+ nd- 1;
% Y= tmp1( 1: Ny);
% % done ===========

View File

@ -0,0 +1,162 @@
function [VAD, logVAD]= apply_VAD( data, Nsamples)
global Downsample MINSPEECHLGTH JOINSPEECHLGTH
Nwindows= floor( Nsamples/ Downsample);
%number of 4ms windows
VAD= zeros( 1, Nwindows);
for count= 1: Nwindows
VAD( count)= sum( data( (count-1)* Downsample+ 1: ...
count* Downsample).^ 2)/ Downsample;
end
%VAD is the power of each 4ms window
LevelThresh = sum( VAD)/ Nwindows;
%LevelThresh is set to mean value of VAD
LevelMin= max( VAD);
if( LevelMin > 0 )
LevelMin= LevelMin* 1.0e-4;
else
LevelMin = 1.0;
end
%fprintf( 1, 'LevelMin is %f\n', LevelMin);
VAD( find( VAD< LevelMin))= LevelMin;
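% iteratively refine the speech/noise threshold: set it to 1.001*(mean + 2*std)
% of the sub-threshold window energies, repeated 12 times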
for iteration= 1: 12
LevelNoise= 0;
len= 0;
StDNoise= 0;
VAD_lessthan_LevelThresh= VAD( find( VAD<= LevelThresh));
len= length( VAD_lessthan_LevelThresh);
LevelNoise= sum( VAD_lessthan_LevelThresh);
if (len> 0)
LevelNoise= LevelNoise/ len;
StDNoise= sqrt( sum( ...
(VAD_lessthan_LevelThresh- LevelNoise).^ 2)/ len);
end
LevelThresh= 1.001* (LevelNoise+ 2* StDNoise);
end
%fprintf( 1, 'LevelThresh is %f\n', LevelThresh);
LevelNoise= 0;
LevelSig= 0;
len= 0;
VAD_greaterthan_LevelThresh= VAD( find( VAD> LevelThresh));
len= length( VAD_greaterthan_LevelThresh);
LevelSig= sum( VAD_greaterthan_LevelThresh);
VAD_lessorequal_LevelThresh= VAD( find( VAD<= LevelThresh));
LevelNoise= sum( VAD_lessorequal_LevelThresh);
if (len> 0)
LevelSig= LevelSig/ len;
else
LevelThresh= -1;
end
%fprintf( 1, 'LevelSig is %f\n', LevelSig);
if (len< Nwindows)
LevelNoise= LevelNoise/( Nwindows- len);
else
LevelNoise= 1;
end
%fprintf( 1, 'LevelNoise is %f\n', LevelNoise);
VAD( find( VAD<= LevelThresh))= -VAD( find( VAD<= LevelThresh));
VAD(1)= -LevelMin;
VAD(Nwindows)= -LevelMin;
start= 0;
finish= 0;
for count= 2: Nwindows
if( (VAD(count) > 0.0) && (VAD(count-1) <= 0.0) )
start = count;
end
if( (VAD(count) <= 0.0) && (VAD(count-1) > 0.0) )
finish = count;
if( (finish - start)<= MINSPEECHLGTH )
VAD( start: finish- 1)= -VAD( start: finish- 1);
end
end
end
%the loop above removes speech sections no longer than MINSPEECHLGTH (4) windows
if( LevelSig >= (LevelNoise* 1000) )
for count= 2: Nwindows
if( (VAD(count)> 0) && (VAD(count-1)<= 0) )
start= count;
end
if( (VAD(count)<= 0) && (VAD(count-1)> 0) )
finish = count;
g = sum( VAD( start: finish- 1));
if( g< 3.0* LevelThresh* (finish - start) )
VAD( start: finish- 1)= -VAD( start: finish- 1);
end
end
end
end
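% join speech sections separated by gaps of at most JOINSPEECHLGTH windows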
start = 0;
finish = 0;
for count= 2: Nwindows
if( (VAD(count) > 0.0) && (VAD(count-1) <= 0.0) )
start = count;
if( (finish > 0) && ((start - finish) <= JOINSPEECHLGTH) )
VAD( finish: start- 1)= LevelMin;
end
end
if( (VAD(count) <= 0.0) && (VAD(count-1) > 0.0) )
finish = count;
end
end
start= 0;
for count= 2: Nwindows
if( (VAD(count)> 0) && (VAD(count-1)<= 0) )
start= count;
end
end
if( start== 0 )
VAD= abs(VAD);
VAD(1) = -LevelMin;
VAD(Nwindows) = -LevelMin;
end
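% add a short fade-in/fade-out around each speech section (0.3 and 0.1 of the
% energy at the section boundary)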
count = 4;
while( count< (Nwindows-1) )
if( (VAD(count)> 0) && (VAD(count-2) <= 0) )
VAD(count-2)= VAD(count)* 0.1;
VAD(count-1)= VAD(count)* 0.3;
count= count+ 1;
end
if( (VAD(count)<= 0) && (VAD(count-1)> 0) )
VAD(count)= VAD(count-1)* 0.3;
VAD(count+ 1)= VAD(count-1)* 0.1;
count= count+ 3;
end
count= count+ 1;
end
VAD( find( VAD< 0))= 0;
% fid= fopen( 'mat_vad.txt', 'wt');
% fprintf( fid, '%f\n', VAD);
% fclose( fid);
if( LevelThresh<= 0 )
LevelThresh= LevelMin;
end
logVAD( find( VAD<= LevelThresh))= 0;
VAD_greaterthan_LevelThresh= find( VAD> LevelThresh);
logVAD( VAD_greaterthan_LevelThresh)= log( VAD( ...
VAD_greaterthan_LevelThresh)/ LevelThresh);

View File

@ -0,0 +1,40 @@
function align_filtered= apply_filter( data, data_Nsamples, align_filter_dB)
global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs
align_filtered= data;
n= data_Nsamples- 2* SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000);
% now find the next power of 2 which is greater or equal to n
pow_of_2= 2^ (ceil( log2( n)));
[number_of_points, trivial]= size( align_filter_dB);
overallGainFilter= interp1( align_filter_dB( :, 1), align_filter_dB( :, 2), ...
1000);
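% gain of the alignment filter at 1000 Hz; subtracted below so the filter is
% normalized to 0 dB at 1 kHz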
x= zeros( 1, pow_of_2);
x( 1: n)= data( SEARCHBUFFER* Downsample+ 1: SEARCHBUFFER* Downsample+ n);
x_fft= fft( x, pow_of_2);
freq_resolution= Fs/ pow_of_2;
factorDb( 1: pow_of_2/2+ 1)= interp1( align_filter_dB( :, 1), ...
align_filter_dB( :, 2), (0: pow_of_2/2)* freq_resolution)- ...
overallGainFilter;
factor= 10.^ (factorDb/ 20);
factor= [factor, fliplr( factor( 2: pow_of_2/2))];
x_fft= x_fft.* factor;
y= ifft( x_fft, pow_of_2);
align_filtered( SEARCHBUFFER* Downsample+ 1: SEARCHBUFFER* Downsample+ n)...
= y( 1: n);
% fid= fopen( 'log_mat.txt', 'wt');
% fprintf( fid, '%f\n', y( 1: n));
% fclose( fid);

View File

@ -0,0 +1,26 @@
function mod_data= apply_filters( data, Nsamples)
%IIRFilt( InIIR_Hsos, InIIR_Nsos, data, data_Nsamples);
global InIIR_Hsos InIIR_Nsos DATAPADDING_MSECS Fs
% data_Nsamples= Nsamples+ DATAPADDING_MSECS* (Fs/ 1000);
% now we construct the second order section matrix
sosMatrix= zeros( InIIR_Nsos, 6);
sosMatrix( :, 4)= 1; %set a(1) to 1
% each row of sosMatrix holds [b(1*3) a(1*3)] for each section
sosMatrix( :, 1: 3)= InIIR_Hsos( :, 1: 3);
sosMatrix( :, 5: 6)= InIIR_Hsos( :, 4: 5);
%sosMatrix
% now we construct second order section direct form II filter
iirdf2= dfilt.df2sos( sosMatrix);
mod_data= filter( iirdf2, data);

View File

@ -0,0 +1,200 @@
function cep_mean= comp_cep(cleanFile, enhdFile);
% ----------------------------------------------------------------------
% Cepstrum Distance Objective Speech Quality Measure
%
% This function implements the cepstrum distance measure used
% in [1]
%
% Usage: CEP=comp_cep(cleanFile.wav, enhancedFile.wav)
%
% cleanFile.wav - clean input file in .wav format
% enhancedFile - enhanced output file in .wav format
% CEP - computed cepstrum distance measure
%
% Note that the cepstrum measure is limited in the range [0, 10].
%
% Example call: CEP =comp_cep('sp04.wav','enhanced.wav')
%
%
% References:
%
% [1] Kitawaki, N., Nagabuchi, H., and Itoh, K. (1988). Objective quality
% evaluation for low bit-rate speech coding systems. IEEE J. Select.
% Areas in Comm., 6(2), 262-273.
%
% Author: Philipos C. Loizou
% (LPC routines were written by Bryan Pellom & John Hansen)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
% ----------------------------------------------------------------------
if nargin~=2
fprintf('USAGE: CEP=comp_cep(cleanFile.wav, enhancedFile.wav)\n');
fprintf('For more help, type: help comp_cep\n\n');
return;
end
alpha=0.95;
[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhdFile);
if ( Srate1~= Srate2) | ( Nbits1~= Nbits2)
error( 'The two files do not match!\n');
end
len= min( length( data1), length( data2));
data1= data1( 1: len)+eps;
data2= data2( 1: len)+eps;
IS_dist= cepstrum( data1, data2,Srate1);
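% average only the smallest alpha = 95% of the frame distances, discarding the
% largest 5% as outliers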
IS_len= round( length( IS_dist)* alpha);
IS= sort( IS_dist);
cep_mean= mean( IS( 1: IS_len));
function distortion = cepstrum(clean_speech, processed_speech,sample_rate)
% ----------------------------------------------------------------------
% Check the length of the clean and processed speech. Must be the same.
% ----------------------------------------------------------------------
clean_length = length(clean_speech);
processed_length = length(processed_speech);
if (clean_length ~= processed_length)
disp('Error: Both Speech Files must be same length.');
return
end
% ----------------------------------------------------------------------
% Scale both clean speech and processed speech to have same dynamic
% range. Also remove DC component from each signal
% ----------------------------------------------------------------------
%clean_speech = clean_speech - mean(clean_speech);
%processed_speech = processed_speech - mean(processed_speech);
%processed_speech = processed_speech.*(max(abs(clean_speech))/ max(abs(processed_speech)));
% ----------------------------------------------------------------------
% Global Variables
% ----------------------------------------------------------------------
winlength = round(30*sample_rate/1000); %240; % window length in samples
skiprate = floor(winlength/4); % window skip in samples
if sample_rate<10000
P = 10; % LPC Analysis Order
else
P=16; % this could vary depending on sampling frequency.
end
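% 10*sqrt(2)/ln(10) maps the L2 cepstral distance to the dB-scaled cepstrum
% distance d = (10/ln10)*sqrt(2*sum((c1-c2).^2)) of Kitawaki et al.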
C=10*sqrt(2)/log(10);
% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Itakura-Saito Measure
% ----------------------------------------------------------------------
num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames
start = 1; % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));
for frame_count = 1:num_frames
% ----------------------------------------------------------
% (1) Get the Frames for the test and reference speech.
% Multiply by Hanning Window.
% ----------------------------------------------------------
clean_frame = clean_speech(start:start+winlength-1);
processed_frame = processed_speech(start:start+winlength-1);
clean_frame = clean_frame.*window;
processed_frame = processed_frame.*window;
% ----------------------------------------------------------
% (2) Get the autocorrelation lags and LPC parameters used
% to compute the IS measure.
% ----------------------------------------------------------
[R_clean, Ref_clean, A_clean] = ...
lpcoeff(clean_frame, P);
[R_processed, Ref_processed, A_processed] = ...
lpcoeff(processed_frame, P);
C_clean=lpc2cep(A_clean);
C_processed=lpc2cep(A_processed);
% ----------------------------------------------------------
% (3) Compute the cepstrum-distance measure
% ----------------------------------------------------------
distortion(frame_count) = min(10,C*norm(C_clean-C_processed,2));
start = start + skiprate;
end
function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order)
% ----------------------------------------------------------
% (1) Compute Autocorrelation Lags
% ----------------------------------------------------------
winlength = max(size(speech_frame));
for k=1:model_order+1
R(k) = sum(speech_frame(1:winlength-k+1) ...
.*speech_frame(k:winlength));
end
% ----------------------------------------------------------
% (2) Levinson-Durbin
% ----------------------------------------------------------
a = ones(1,model_order);
E(1)=R(1);
for i=1:model_order
a_past(1:i-1) = a(1:i-1);
sum_term = sum(a_past(1:i-1).*R(i:-1:2));
rcoeff(i)=(R(i+1) - sum_term) / E(i);
a(i)=rcoeff(i);
a(1:i-1) = a_past(1:i-1) - rcoeff(i).*a_past(i-1:-1:1);
E(i+1)=(1-rcoeff(i)*rcoeff(i))*E(i);
end
acorr = R;
refcoeff = rcoeff;
lpparams = [1 -a];
%----------------------------------------------
function [cep]=lpc2cep(a)
%
% converts prediction to cepstrum coefficients
%
% Author: Philipos C. Loizou
M=length(a);
cep=zeros(1,M-1);
cep(1)=-a(2);
for k=2:M-1
ix=1:k-1;
vec1=cep(ix).*a(k-1+1:-1:2).*ix;
cep(k)=-(a(k+1)+sum(vec1)/k);
end

View File

@ -0,0 +1,259 @@
function fwseg_dist= comp_fwseg(cleanFile, enhancedFile);
% ----------------------------------------------------------------------
% Frequency weighted SNRseg Objective Speech Quality Measure
%
% This function implements the frequency-weighted SNRseg measure [1]
% using a different weighting function, the clean spectrum.
%
% Usage: fwSNRseg=comp_fwseg(cleanFile.wav, enhancedFile.wav)
%
% cleanFile.wav - clean input file in .wav format
% enhancedFile - enhanced output file in .wav format
% fwSNRseg - computed frequency weighted SNRseg in dB
%
% Note that larger fwSNRseg values indicate better speech quality.
%
% Example call: fwSNRseg =comp_fwseg('sp04.wav','enhanced.wav')
%
%
% References:
% [1] Tribolet, J., Noll, P., McDermott, B., and Crochiere, R. E. (1978).
% A study of complexity and quality of speech waveform coders. Proc.
% IEEE Int. Conf. Acoust. , Speech, Signal Processing, 586-590.
%
% Author: Philipos C. Loizou
% (critical-band filtering routines were written by Bryan Pellom & John Hansen)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
% ----------------------------------------------------------------------
if nargin~=2
fprintf('USAGE: fwSNRseg=comp_fwseg(cleanFile.wav, enhancedFile.wav)\n');
fprintf('For more help, type: help comp_fwseg\n\n');
return;
end
[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhancedFile);
if ( Srate1~= Srate2) | ( Nbits1~= Nbits2)
error( 'The two files do not match!\n');
end
len= min( length( data1), length( data2));
data1= data1( 1: len)+eps;
data2= data2( 1: len)+eps;
wss_dist_vec= fwseg( data1, data2,Srate1);
fwseg_dist=mean(wss_dist_vec);
% ----------------------------------------------------------------------
function distortion = fwseg(clean_speech, processed_speech,sample_rate)
% ----------------------------------------------------------------------
% Check the length of the clean and processed speech. Must be the same.
% ----------------------------------------------------------------------
clean_length = length(clean_speech);
processed_length = length(processed_speech);
if (clean_length ~= processed_length)
disp('Error: Files must have same length.');
return
end
% ----------------------------------------------------------------------
% Global Variables
% ----------------------------------------------------------------------
winlength = round(30*sample_rate/1000); % window length in samples
skiprate = floor(winlength/4); % window skip in samples
max_freq = sample_rate/2; % maximum bandwidth
num_crit = 25; % number of critical bands
USE_25=1;
n_fft = 2^nextpow2(2*winlength);
n_fftby2 = n_fft/2; % FFT size/2
gamma=0.2; % power exponent
% ----------------------------------------------------------------------
% Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz)
% ----------------------------------------------------------------------
cent_freq(1) = 50.0000; bandwidth(1) = 70.0000;
cent_freq(2) = 120.000; bandwidth(2) = 70.0000;
cent_freq(3) = 190.000; bandwidth(3) = 70.0000;
cent_freq(4) = 260.000; bandwidth(4) = 70.0000;
cent_freq(5) = 330.000; bandwidth(5) = 70.0000;
cent_freq(6) = 400.000; bandwidth(6) = 70.0000;
cent_freq(7) = 470.000; bandwidth(7) = 70.0000;
cent_freq(8) = 540.000; bandwidth(8) = 77.3724;
cent_freq(9) = 617.372; bandwidth(9) = 86.0056;
cent_freq(10) = 703.378; bandwidth(10) = 95.3398;
cent_freq(11) = 798.717; bandwidth(11) = 105.411;
cent_freq(12) = 904.128; bandwidth(12) = 116.256;
cent_freq(13) = 1020.38; bandwidth(13) = 127.914;
cent_freq(14) = 1148.30; bandwidth(14) = 140.423;
cent_freq(15) = 1288.72; bandwidth(15) = 153.823;
cent_freq(16) = 1442.54; bandwidth(16) = 168.154;
cent_freq(17) = 1610.70; bandwidth(17) = 183.457;
cent_freq(18) = 1794.16; bandwidth(18) = 199.776;
cent_freq(19) = 1993.93; bandwidth(19) = 217.153;
cent_freq(20) = 2211.08; bandwidth(20) = 235.631;
cent_freq(21) = 2446.71; bandwidth(21) = 255.255;
cent_freq(22) = 2701.97; bandwidth(22) = 276.072;
cent_freq(23) = 2978.04; bandwidth(23) = 298.126;
cent_freq(24) = 3276.17; bandwidth(24) = 321.465;
cent_freq(25) = 3597.63; bandwidth(25) = 346.136;
W=[ % articulation index weights
0.003
0.003
0.003
0.007
0.010
0.016
0.016
0.017
0.017
0.022
0.027
0.028
0.030
0.032
0.034
0.035
0.037
0.036
0.036
0.033
0.030
0.029
0.027
0.026
0.026];
W=W';
if USE_25==0 % use 13 bands
% ----- lump adjacent filters together ----------------
k=2;
cent_freq2(1)=cent_freq(1);
bandwidth2(1)=bandwidth(1)+bandwidth(2);
W2(1)=W(1);
for i=2:13
cent_freq2(i)=cent_freq2(i-1)+bandwidth2(i-1);
bandwidth2(i)=bandwidth(k)+bandwidth(k+1);
W2(i)=0.5*(W(k)+W(k+1));
k=k+2;
end
sumW=sum(W2);
bw_min = bandwidth2 (1); % minimum critical bandwidth
else
sumW=sum(W);
bw_min=bandwidth(1);
end
% ----------------------------------------------------------------------
% Set up the critical band filters. Note here that Gaussianly shaped
% filters are used. Also, the sum of the filter weights is the same
% for each critical band filter. Filter values below the -30 dB point
% are set to zero.
% ----------------------------------------------------------------------
min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter
if USE_25==0
num_crit=length(cent_freq2);
for i = 1:num_crit
f0 = (cent_freq2 (i) / max_freq) * (n_fftby2);
all_f0(i) = floor(f0);
bw = (bandwidth2 (i) / max_freq) * (n_fftby2);
norm_factor = log(bw_min) - log(bandwidth2(i));
j = 0:1:n_fftby2-1;
crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor);
crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor);
end
else
for i = 1:num_crit
f0 = (cent_freq (i) / max_freq) * (n_fftby2);
all_f0(i) = floor(f0);
bw = (bandwidth (i) / max_freq) * (n_fftby2);
norm_factor = log(bw_min) - log(bandwidth(i));
j = 0:1:n_fftby2-1;
crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor);
crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor);
end
end
num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames
start = 1; % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));
for frame_count = 1:num_frames
% ----------------------------------------------------------
% (1) Get the Frames for the test and reference speech.
% Multiply by Hanning Window.
% ----------------------------------------------------------
clean_frame = clean_speech(start:start+winlength-1);
processed_frame = processed_speech(start:start+winlength-1);
clean_frame = clean_frame.*window;
processed_frame = processed_frame.*window;
% ----------------------------------------------------------
% (2) Compute the magnitude Spectrum of Clean and Processed
% ----------------------------------------------------------
clean_spec = abs(fft(clean_frame,n_fft));
processed_spec = abs(fft(processed_frame,n_fft));
% normalize spectra to have area of one
%
clean_spec=clean_spec/sum(clean_spec(1:n_fftby2));
processed_spec=processed_spec/sum(processed_spec(1:n_fftby2));
% ----------------------------------------------------------
% (3) Compute Filterbank Output Energies
% ----------------------------------------------------------
clean_energy=zeros(1,num_crit);
processed_energy=zeros(1,num_crit);
error_energy=zeros(1,num_crit);
W_freq=zeros(1,num_crit);
for i = 1:num_crit
clean_energy(i) = sum(clean_spec(1:n_fftby2) ...
.*crit_filter(i,:)');
processed_energy(i) = sum(processed_spec(1:n_fftby2) ...
.*crit_filter(i,:)');
error_energy(i)=max((clean_energy(i)-processed_energy(i))^2,eps);
W_freq(i)=(clean_energy(i))^gamma;
end
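% frame-level fwSNRseg: per-band SNRs weighted by W_freq = (clean band energy)^gamma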
SNRlog=10*log10((clean_energy.^2)./error_energy);
fwSNR=sum(W_freq.*SNRlog)/sum(W_freq);
distortion(frame_count)=min(max(fwSNR,-10),35);
start = start + skiprate;
end

View File

@ -0,0 +1,493 @@
function [SIG,BAK,OVL]= comp_fwseg_mars(cleanFile, enhancedFile);
% ----------------------------------------------------------------------
% MARS Frequency-variant fwSNRseg objective speech quality measure
%
% This function implements the frequency-variant fwSNRseg measure based
% on MARS analysis (see Chap. 10, Sec. 10.5.4)
%
%
% Usage: [sig,bak,ovl]=comp_fwseg_mars(cleanFile.wav, enhancedFile.wav)
%
% cleanFile.wav - clean input file in .wav format
% enhancedFile - enhanced output file in .wav format
% sig - predicted rating [1-5] of speech distortion
% bak - predicted rating [1-5] of noise distortion
% ovl - predicted rating [1-5] of overall quality
%
%
% Example call: [s,b,o] =comp_fwseg_mars('sp04.wav','enhanced.wav')
%
%
% References:
% [1] Chapter 10, Sec 10.5.4,
% [2] Chapter 11
%
% Authors: Yi Hu and Philipos C. Loizou
% (critical-band filtering routines were written by Bryan Pellom & John Hansen)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
% ----------------------------------------------------------------------
if nargin~=2
fprintf('USAGE: [sig,bak,ovl]=comp_fwseg_mars(cleanFile.wav, enhancedFile.wav)\n');
fprintf('For more help, type: help comp_fwseg_mars\n\n');
return;
end
[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhancedFile);
if ( Srate1~= Srate2) | ( Nbits1~= Nbits2)
error( 'The two files do not match!\n');
end
len= min( length( data1), length( data2));
data1= data1( 1: len)+eps;
data2= data2( 1: len)+eps;
wss_dist_matrix= fwseg( data1, data2,Srate1);
wss_dist=mean(wss_dist_matrix);
SIG= sig_mars( wss_dist( 1), wss_dist( 2), wss_dist( 3), wss_dist( 4), ...
wss_dist( 5), wss_dist( 6), wss_dist( 7), wss_dist( 8), ...
wss_dist( 9), wss_dist( 10), wss_dist( 11), wss_dist( 12), ...
wss_dist( 13), wss_dist( 14), wss_dist( 15), wss_dist( 16), ...
wss_dist( 17), wss_dist( 18), wss_dist( 19), wss_dist( 20), ...
wss_dist( 21), wss_dist( 22), wss_dist( 23), wss_dist( 24), ...
wss_dist( 25));
SIG=max(1,SIG); SIG=min(5, SIG); % limit values to [1, 5]
BAK= bak_mars( wss_dist( 1), wss_dist( 2), wss_dist( 3), wss_dist( 4), ...
wss_dist( 5), wss_dist( 6), wss_dist( 7), wss_dist( 8), ...
wss_dist( 9), wss_dist( 10), wss_dist( 11), wss_dist( 12), ...
wss_dist( 13), wss_dist( 14), wss_dist( 15), wss_dist( 16), ...
wss_dist( 17), wss_dist( 18), wss_dist( 19), wss_dist( 20), ...
wss_dist( 21), wss_dist( 22), wss_dist( 23), wss_dist( 24), ...
wss_dist( 25));
BAK=max(1,BAK); BAK=min(5, BAK); % limit values to [1, 5]
OVL= ovl_mars( wss_dist( 1), wss_dist( 2), wss_dist( 3), wss_dist( 4), ...
wss_dist( 5), wss_dist( 6), wss_dist( 7), wss_dist( 8), ...
wss_dist( 9), wss_dist( 10), wss_dist( 11), wss_dist( 12), ...
wss_dist( 13), wss_dist( 14), wss_dist( 15), wss_dist( 16), ...
wss_dist( 17), wss_dist( 18), wss_dist( 19), wss_dist( 20), ...
wss_dist( 21), wss_dist( 22), wss_dist( 23), wss_dist( 24), ...
wss_dist( 25));
OVL=max(1,OVL); OVL=min(5, OVL); % limit values to [1, 5]
%-------------------------------------------------
function Y= bak_mars( FWSEG_VA, V5, V6, V7, V8, V9, V10, V11, V12, ...
V13, V14, V15, V16, V17, V18, V19, V20, ...
V21, V22, V23, V24, V25, V26, V27, V28)
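% the BFn terms below are MARS hinge basis functions max(0, x - t) and products
% of them; Y is the fitted linear combination predicting the BAK rating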
BF1 = max(0, V21 - 0.282);
BF2 = max(0, FWSEG_VA + 9.094);
BF3 = max(0, - 9.094 - FWSEG_VA );
BF5 = max(0, 10.089 - V11 );
BF7 = max(0, 3.624 - V26 ) * BF3;
BF8 = max(0, V24 - 5.584) * BF5;
BF9 = max(0, 5.584 - V24 ) * BF5;
BF10 = max(0, V19 - 8.030) * BF1;
BF11 = max(0, 8.030 - V19 ) * BF1;
BF12 = max(0, V27 - 4.858) * BF1;
BF13 = max(0, 4.858 - V27 ) * BF1;
BF14 = max(0, FWSEG_VA + 7.282) * BF1;
BF15 = max(0, - 7.282 - FWSEG_VA ) * BF1;
BF17 = max(0, 9.458 - V16 ) * BF10;
BF18 = max(0, V27 - 10.431) * BF11;
BF19 = max(0, 10.431 - V27 ) * BF11;
BF21 = max(0, 11.059 - V22 ) * BF1;
BF22 = max(0, V26 - 8.675) * BF1;
BF23 = max(0, 8.675 - V26 ) * BF1;
BF25 = max(0, 11.195 - V6 ) * BF10;
BF26 = max(0, V8 - 7.138) * BF1;
BF27 = max(0, 7.138 - V8 ) * BF1;
BF29 = max(0, 9.006 - V10 ) * BF26;
BF30 = max(0, V14 - 8.210) * BF15;
BF35 = max(0, 7.026 - V19 ) * BF15;
BF36 = max(0, V11 - 3.424) * BF27;
BF39 = max(0, 5.418 - V17 ) * BF23;
BF40 = max(0, V28 - 6.813);
BF41 = max(0, 6.813 - V28 );
BF42 = max(0, V26 - 5.998) * BF14;
BF43 = max(0, 5.998 - V26 ) * BF14;
BF44 = max(0, V5 + 0.206) * BF41;
BF45 = max(0, - 0.206 - V5 ) * BF41;
BF46 = max(0, V22 - 7.901) * BF45;
BF49 = max(0, 7.496 - V8 ) * BF44;
BF51 = max(0, 7.904 - V11 ) * BF45;
BF52 = max(0, V26 - 10.938) * BF27;
BF54 = max(0, V9 - 4.507) * BF26;
BF56 = max(0, V28 - 0.549) * BF15;
BF57 = max(0, 0.549 - V28 ) * BF15;
BF58 = max(0, V25 - 3.252) * BF41;
BF59 = max(0, 3.252 - V25 ) * BF41;
BF60 = max(0, V23 - 7.650) * BF58;
BF61 = max(0, 7.650 - V23 ) * BF58;
BF62 = max(0, V25 - 9.931) * BF44;
BF63 = max(0, 9.931 - V25 ) * BF44;
BF64 = max(0, V25 - 4.923) * BF21;
BF65 = max(0, 4.923 - V25 ) * BF21;
BF67 = max(0, 3.746 - V28 ) * BF10;
BF68 = max(0, V11 - 5.346) * BF41;
BF69 = max(0, 5.346 - V11 ) * BF41;
BF70 = max(0, V12 - 9.026) * BF68;
BF71 = max(0, 9.026 - V12 ) * BF68;
BF73 = max(0, - 2.668 - V28 ) * BF21;
BF74 = max(0, V24 - 7.028) * BF41;
BF75 = max(0, 7.028 - V24 ) * BF41;
BF77 = max(0, - 0.224 - V6 ) * BF74;
BF78 = max(0, V5 - 3.884);
BF79 = max(0, 3.884 - V5 );
BF80 = max(0, V15 - 5.019) * BF78;
BF83 = max(0, - 1.880 - V28 ) * BF13;
BF84 = max(0, V7 - 3.067) * BF12;
BF85 = max(0, 3.067 - V7 ) * BF12;
BF87 = max(0, 5.353 - V6 );
BF88 = max(0, V13 - 3.405) * BF9;
BF89 = max(0, 3.405 - V13 ) * BF9;
BF91 = max(0, 5.599 - V13 ) * BF45;
BF92 = max(0, V15 - 9.821) * BF8;
BF94 = max(0, V14 + 2.594) * BF79;
BF97 = max(0, 8.635 - V23 ) * BF94;
BF99 = max(0, 1.332 - V24 ) * BF45;
BF100 = max(0, V7 - 0.209) * BF1;
Y = 2.751 + 0.135 * BF1 - 0.037 * BF2 + 0.328 * BF3 - 0.098 * BF5 ...
+ 0.988 * BF7 + 0.014 * BF8 - 0.034 * BF11 - 0.011 * BF12 ...
- 0.013 * BF13 - 0.002 * BF17 + 0.014 * BF18 ...
+ 0.004 * BF19 - 0.007 * BF21 - 0.017 * BF22 ...
- .895791E-03 * BF25 + 0.011 * BF26 - 0.009 * BF27 ...
- 0.007 * BF29 + 0.052 * BF30 + 0.022 * BF35 ...
- 0.002 * BF36 - 0.005 * BF39 - 0.059 * BF40 ...
- 0.050 * BF41 + 0.001 * BF42 + .743730E-03 * BF43 ...
+ 0.011 * BF44 + 0.022 * BF45 + 0.009 * BF46 ...
+ 0.004 * BF49 - 0.005 * BF51 + 0.010 * BF52 ...
- 0.001 * BF54 - 0.005 * BF56 - 0.015 * BF57 ...
- 0.032 * BF59 + 0.009 * BF60 - 0.002 * BF61 ...
- 0.009 * BF62 - 0.001 * BF63 + .819374E-03 * BF64 ...
+ 0.002 * BF65 + 0.003 * BF67 + 0.024 * BF69 ...
- 0.011 * BF70 - 0.004 * BF71 + 0.013 * BF73 ...
- 0.026 * BF74 + 0.005 * BF75 + 0.253 * BF77 ...
- 0.065 * BF78 + 0.014 * BF80 - 0.010 * BF83 ...
+ 0.001 * BF84 + 0.018 * BF85 - 0.050 * BF87 ...
- 0.002 * BF88 - 0.020 * BF89 + 0.003 * BF91 ...
- 0.043 * BF92 + .707581E-03 * BF97 - 0.015 * BF99 ...
- 0.005 * BF100;
function Y= sig_mars( FWSEG_VA, V5, V6, V7, V8, V9, V10, V11, V12, ...
V13, V14, V15, V16, V17, V18, V19, V20, ...
V21, V22, V23, V24, V25, V26, V27, V28)
BF1 = max(0, V7 - 9.535);
BF2 = max(0, 9.535 - V7 );
BF3 = max(0, V27 - 1.578);
BF5 = max(0, V6 - 5.422);
BF6 = max(0, 5.422 - V6 );
BF8 = max(0, 11.333 - V19 );
BF10 = max(0, - 6.774 - FWSEG_VA );
BF11 = max(0, V10 - 6.255) * BF8;
BF12 = max(0, 6.255 - V10 ) * BF8;
BF13 = max(0, V24 - 3.894);
BF15 = max(0, V5 - 3.884);
BF16 = max(0, 3.884 - V5 );
BF17 = max(0, V28 - 7.918);
BF18 = max(0, 7.918 - V28 );
BF19 = max(0, V13 - 6.077) * BF18;
BF20 = max(0, 6.077 - V13 ) * BF18;
BF22 = max(0, 6.614 - V20 ) * BF10;
BF23 = max(0, FWSEG_VA + 0.936) * BF8;
BF25 = max(0, V23 - 5.039);
BF26 = max(0, 5.039 - V23 );
BF28 = max(0, 9.007 - V20 ) * BF25;
BF29 = max(0, V25 - 7.582);
BF30 = max(0, 7.582 - V25 );
BF31 = max(0, V11 + 3.336) * BF16;
BF32 = max(0, V26 - 1.877);
BF35 = max(0, - 5.749 - FWSEG_VA ) * BF6;
BF36 = max(0, V7 - 4.451) * BF29;
BF37 = max(0, 4.451 - V7 ) * BF29;
BF38 = max(0, V14 - 10.158);
BF39 = max(0, 10.158 - V14 );
BF41 = max(0, 7.172 - V17 ) * BF39;
BF43 = max(0, 7.810 - V24 ) * BF26;
BF44 = max(0, V8 + 1.636) * BF3;
BF45 = max(0, FWSEG_VA - 10.068) * BF39;
BF47 = max(0, V23 - 4.721) * BF30;
BF48 = max(0, 4.721 - V23 ) * BF30;
BF50 = max(0, - 2.397 - V24 ) * BF16;
BF51 = max(0, V14 - 1.428) * BF17;
BF53 = max(0, V16 + 1.940) * BF18;
BF54 = max(0, V10 - 9.442) * BF18;
BF56 = max(0, V10 + 2.144) * BF16;
BF58 = max(0, 1.969 - V26 ) * BF2;
BF59 = max(0, V19 - 6.089) * BF16;
BF62 = max(0, 8.952 - V21 ) * BF15;
BF63 = max(0, V24 - 7.371) * BF3;
BF65 = max(0, V22 - 8.908) * BF6;
BF66 = max(0, 8.908 - V22 ) * BF6;
BF67 = max(0, V27 - 9.485) * BF30;
BF69 = max(0, V18 - 8.608) * BF10;
BF71 = max(0, V13 - 3.374) * BF25;
BF73 = max(0, V14 - 3.616) * BF13;
BF75 = max(0, V18 - 10.321) * BF32;
BF76 = max(0, 10.321 - V18 ) * BF32;
BF78 = max(0, 3.972 - V15 ) * BF26;
BF79 = max(0, V14 - 7.105) * BF26;
BF80 = max(0, 7.105 - V14 ) * BF26;
Y = 2.638 - 0.089 * BF1 + 0.083 * BF5 - 0.162 * BF6 - 0.037 * BF8 ...
- 0.241 * BF10 + 0.018 * BF11 - 0.008 * BF12 ...
+ 0.059 * BF13 - 0.144 * BF17 - 0.116 * BF18 ...
+ 0.010 * BF19 - 0.012 * BF20 + 0.085 * BF22 ...
+ 0.011 * BF23 + 0.049 * BF25 - 0.159 * BF26 ...
- 0.016 * BF28 - 0.138 * BF29 + 0.010 * BF31 ...
+ 0.016 * BF35 + 0.018 * BF36 + 0.246 * BF37 ...
- 0.417 * BF38 + 0.052 * BF39 - 0.005 * BF41 ...
+ 0.021 * BF43 + 0.006 * BF44 - 0.047 * BF45 ...
- 0.051 * BF47 - 0.014 * BF48 - 0.113 * BF50 ...
+ 0.019 * BF51 + 0.007 * BF53 + 0.017 * BF54 ...
- 0.007 * BF56 - 0.098 * BF58 + 0.011 * BF59 ...
- 0.016 * BF62 - 0.012 * BF63 + 0.113 * BF65 ...
+ 0.016 * BF66 + 0.040 * BF67 - 0.065 * BF69 ...
- 0.018 * BF71 + 0.014 * BF73 - 0.009 * BF75 ...
- 0.008 * BF76 - 0.032 * BF78 + 0.032 * BF79 ...
+ 0.011 * BF80;
function Y= ovl_mars( FWSEG_VA, V5, V6, V7, V8, V9, V10, V11, V12, ...
V13, V14, V15, V16, V17, V18, V19, V20, ...
V21, V22, V23, V24, V25, V26, V27, V28)
BF1 = max(0, V21 - 4.671);
BF3 = max(0, V6 - 5.396);
BF4 = max(0, 5.396 - V6 );
BF7 = max(0, V11 - 7.884);
BF8 = max(0, 7.884 - V11 );
BF9 = max(0, FWSEG_VA + 7.229) * BF1;
BF10 = max(0, - 7.229 - FWSEG_VA ) * BF1;
BF11 = max(0, V19 - 8.128) * BF1;
BF12 = max(0, 8.128 - V19 ) * BF1;
BF13 = max(0, V28 - 7.918);
BF14 = max(0, 7.918 - V28 );
BF15 = max(0, V5 + 2.888) * BF14;
BF16 = max(0, - 2.888 - V5 ) * BF14;
BF17 = max(0, V24 - 2.924) * BF8;
BF18 = max(0, 2.924 - V24 ) * BF8;
BF20 = max(0, 9.071 - V16 ) * BF15;
BF21 = max(0, V10 - 6.286) * BF14;
BF22 = max(0, 6.286 - V10 ) * BF14;
BF24 = max(0, V23 - 5.173);
BF25 = max(0, 5.173 - V23 );
BF26 = max(0, V26 - 8.987);
BF29 = max(0, 12.216 - V27 ) * BF3;
BF30 = max(0, V8 - 4.306) * BF16;
BF34 = max(0, V23 - 7.630) * BF21;
BF35 = max(0, 7.630 - V23 ) * BF21;
BF37 = max(0, 3.638 - V7 ) * BF1;
BF39 = max(0, 8.337 - V21 ) * BF17;
BF41 = max(0, 1.590 - V5 ) * BF11;
BF43 = max(0, 13.993 - V8 ) * BF11;
BF44 = max(0, V14 - 5.993) * BF25;
BF45 = max(0, 5.993 - V14 ) * BF25;
BF46 = max(0, V24 - 1.035);
BF47 = max(0, 1.035 - V24 );
BF49 = max(0, 8.915 - V23 ) * BF12;
BF51 = max(0, - 0.004 - FWSEG_VA );
BF52 = max(0, V27 - 6.520) * BF24;
BF53 = max(0, 6.520 - V27 ) * BF24;
BF54 = max(0, V7 - 11.484) * BF8;
BF55 = max(0, 11.484 - V7 ) * BF8;
BF57 = max(0, 5.742 - V17 ) * BF25;
BF58 = max(0, V12 - 6.949) * BF12;
BF59 = max(0, 6.949 - V12 ) * BF12;
BF60 = max(0, V25 - 9.203) * BF45;
BF63 = max(0, 1.887 - V13 ) * BF7;
BF65 = max(0, 9.498 - V26 ) * BF15;
BF66 = max(0, V5 - 6.566) * BF22;
BF71 = max(0, 13.239 - V19 ) * BF46;
BF72 = max(0, V19 - 9.925) * BF55;
BF77 = max(0, 3.430 - V22 ) * BF18;
BF78 = max(0, V27 - 6.513) * BF45;
BF79 = max(0, 6.513 - V27 ) * BF45;
BF81 = max(0, 12.511 - V18 );
BF82 = max(0, V11 - 6.777) * BF81;
BF83 = max(0, 6.777 - V11 ) * BF81;
BF85 = max(0, 3.433 - V5 ) * BF47;
BF87 = max(0, - 3.524 - FWSEG_VA ) * BF47;
BF88 = max(0, V27 - 11.604) * BF9;
BF91 = max(0, 8.845 - V26 ) * BF52;
BF92 = max(0, V14 - 5.931) * BF82;
BF93 = max(0, 5.931 - V14 ) * BF82;
BF94 = max(0, V21 - 7.245) * BF25;
BF95 = max(0, 7.245 - V21 ) * BF25;
BF96 = max(0, V14 - 5.323) * BF7;
BF98 = max(0, V10 - 6.248) * BF71;
BF100 = max(0, V18 - 0.602) * BF95;
Y = 2.936 + 0.047 * BF1 + 0.061 * BF3 - 0.084 * BF4 - 0.139 * BF8 ...
- 0.064 * BF10 - 0.030 * BF12 - 0.103 * BF13 ...
- 0.039 * BF14 + 0.020 * BF17 - 0.002 * BF20 ...
- 0.005 * BF22 - 0.114 * BF25 - 0.090 * BF26 ...
- 0.011 * BF29 + 0.010 * BF30 + 0.009 * BF34 ...
+ 0.002 * BF35 + 0.079 * BF37 - 0.006 * BF39 ...
+ 0.007 * BF41 - 0.003 * BF43 + 0.017 * BF44 ...
+ 0.076 * BF47 + 0.009 * BF49 + 0.016 * BF51 ...
- 0.042 * BF53 - 0.079 * BF54 - 0.030 * BF57 ...
- 0.018 * BF58 - 0.009 * BF59 - 0.119 * BF60 ...
- 0.210 * BF63 - .456802E-03 * BF65 + 0.028 * BF66 ...
+ 0.020 * BF72 + 0.011 * BF77 + 0.005 * BF78 ...
+ 0.003 * BF79 - 0.049 * BF81 + 0.012 * BF83 ...
- 0.030 * BF85 + 0.070 * BF87 + 0.008 * BF88 ...
- 0.008 * BF91 + 0.010 * BF92 + 0.003 * BF93 ...
+ 0.022 * BF94 - 0.038 * BF96 + .933766E-03 * BF98 ...
+ 0.002 * BF100;
function distortion = fwseg(clean_speech, processed_speech,sample_rate)
% ----------------------------------------------------------------------
% Check the length of the clean and processed speech. Must be the same.
% ----------------------------------------------------------------------
clean_length = length(clean_speech);
processed_length = length(processed_speech);
if (clean_length ~= processed_length)
disp('Error: Files must have same length.');
return
end
% ----------------------------------------------------------------------
% Global Variables
% ----------------------------------------------------------------------
winlength = round(30*sample_rate/1000); % window length in samples
skiprate = floor(winlength/4); % window skip in samples
max_freq = sample_rate/2; % maximum bandwidth
num_crit = 25; % number of critical bands
n_fft = 2^nextpow2(2*winlength);
n_fftby2 = n_fft/2; % FFT size/2
% ----------------------------------------------------------------------
% Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz)
% ----------------------------------------------------------------------
cent_freq(1) = 50.0000; bandwidth(1) = 70.0000;
cent_freq(2) = 120.000; bandwidth(2) = 70.0000;
cent_freq(3) = 190.000; bandwidth(3) = 70.0000;
cent_freq(4) = 260.000; bandwidth(4) = 70.0000;
cent_freq(5) = 330.000; bandwidth(5) = 70.0000;
cent_freq(6) = 400.000; bandwidth(6) = 70.0000;
cent_freq(7) = 470.000; bandwidth(7) = 70.0000;
cent_freq(8) = 540.000; bandwidth(8) = 77.3724;
cent_freq(9) = 617.372; bandwidth(9) = 86.0056;
cent_freq(10) = 703.378; bandwidth(10) = 95.3398;
cent_freq(11) = 798.717; bandwidth(11) = 105.411;
cent_freq(12) = 904.128; bandwidth(12) = 116.256;
cent_freq(13) = 1020.38; bandwidth(13) = 127.914;
cent_freq(14) = 1148.30; bandwidth(14) = 140.423;
cent_freq(15) = 1288.72; bandwidth(15) = 153.823;
cent_freq(16) = 1442.54; bandwidth(16) = 168.154;
cent_freq(17) = 1610.70; bandwidth(17) = 183.457;
cent_freq(18) = 1794.16; bandwidth(18) = 199.776;
cent_freq(19) = 1993.93; bandwidth(19) = 217.153;
cent_freq(20) = 2211.08; bandwidth(20) = 235.631;
cent_freq(21) = 2446.71; bandwidth(21) = 255.255;
cent_freq(22) = 2701.97; bandwidth(22) = 276.072;
cent_freq(23) = 2978.04; bandwidth(23) = 298.126;
cent_freq(24) = 3276.17; bandwidth(24) = 321.465;
cent_freq(25) = 3597.63; bandwidth(25) = 346.136;
bw_min = bandwidth (1); % minimum critical bandwidth
% ----------------------------------------------------------------------
% Set up the critical band filters. Note here that Gaussianly shaped
% filters are used. Also, the sum of the filter weights is the same
% for each critical band filter. Filter values below the -30 dB point
% are set to zero.
% ----------------------------------------------------------------------
min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter
for i = 1:num_crit
f0 = (cent_freq (i) / max_freq) * (n_fftby2);
all_f0(i) = floor(f0);
bw = (bandwidth (i) / max_freq) * (n_fftby2);
norm_factor = log(bw_min) - log(bandwidth(i));
j = 0:1:n_fftby2-1;
crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor);
crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor);
end
% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Weighted Spectral
% Slope Measure
% ----------------------------------------------------------------------
num_frames = floor(clean_length/skiprate-(winlength/skiprate)); % number of frames
start = 1; % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));
distortion=zeros(num_frames,num_crit);
for frame_count = 1:num_frames
% ----------------------------------------------------------
% (1) Get the Frames for the test and reference speech.
% Multiply by Hanning Window.
% ----------------------------------------------------------
clean_frame = clean_speech(start:start+winlength-1);
processed_frame = processed_speech(start:start+winlength-1);
clean_frame = clean_frame.*window;
processed_frame = processed_frame.*window;
% ----------------------------------------------------------
% (2) Compute the magnitude Spectrum of Clean and Processed
% ----------------------------------------------------------
clean_spec = abs(fft(clean_frame,n_fft));
processed_spec = abs(fft(processed_frame,n_fft));
% normalize so that spectra have unit area ----
clean_spec=clean_spec/sum(clean_spec(1:n_fftby2));
processed_spec=processed_spec/sum(processed_spec(1:n_fftby2));
% ----------------------------------------------------------
% (3) Compute Filterbank Output Energies
% ----------------------------------------------------------
clean_energy=zeros(1,num_crit);
processed_energy=zeros(1,num_crit);
error_energy=zeros(1,num_crit);
for i = 1:num_crit
clean_energy(i) = sum(clean_spec(1:n_fftby2) ...
.*crit_filter(i,:)');
processed_energy(i) = sum(processed_spec(1:n_fftby2) ...
.*crit_filter(i,:)');
error_energy(i)=max((clean_energy(i)-processed_energy(i))^2,eps);
end
SNRlog=10*log10((clean_energy.^2)./error_energy);
distortion(frame_count,:)=min(max(SNRlog,-10),35);
start = start + skiprate;
end

View File

@ -0,0 +1,221 @@
function [SIG,BAK,OVL]= comp_fwseg_variant(cleanFile, enhancedFile);
% ----------------------------------------------------------------------
% Frequency-variant fwSNRseg Objective Speech Quality Measure
%
% This function implements the frequency-variant fwSNRseg measure [1]
% (see also Chap. 10, Eq. 10.24)
%
%
% Usage: [sig,bak,ovl]=comp_fwseg_variant(cleanFile.wav, enhancedFile.wav)
%
% cleanFile.wav - clean input file in .wav format
% enhancedFile - enhanced output file in .wav format
% sig - predicted rating [1-5] of speech distortion
% bak - predicted rating [1-5] of noise distortion
% ovl - predicted rating [1-5] of overall quality
%
%
% Example call: [s,b,o] =comp_fwseg_variant('sp04.wav','enhanced.wav')
%
%
% References:
% [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements,
% Objective Measures of Speech Quality. Prentice Hall
% Advanced Reference Series, Englewood Cliffs, NJ, 1988,
% ISBN: 0-13-629056-6.
%
% Author: Philipos C. Loizou
% (critical-band filtering routines were written by Bryan Pellom & John Hansen)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
% ----------------------------------------------------------------------
if nargin~=2
fprintf('USAGE: [sig,bak,ovl]=comp_fwseg_variant(cleanFile.wav, enhancedFile.wav)\n');
fprintf('For more help, type: help comp_fwseg_variant\n\n');
return;
end
[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhancedFile);
if ( Srate1~= Srate2) | ( Nbits1~= Nbits2)
error( 'The two files do not match!\n');
end
len= min( length( data1), length( data2));
data1= data1( 1: len)+eps;
data2= data2( 1: len)+eps;
wss_dist_matrix= fwseg( data1, data2,Srate1);
wss_dist=mean(wss_dist_matrix);
% initialize coefficients obtained from multiple linear
% regression analysis
%
b_sig=[0.021,-0.028,0.088,-0.031,0.048,-0.049,0.065,0.009,0.011,0.033,...
-0.040,-0.002,0.041,-0.007,0.033,0.018,-0.007,0.044,-0.001,0.021,...
-0.002,0.017,-0.03,0.073,0.043];
b_ovl=[-0.003,-0.026,0.066,-0.036,0.038,-0.023,0.037,0.022,0.014,0.009,...
-0.03,0.004,0.044,-0.005,0.017,0.018,-0.001,0.051,0.009,0.011,...
0.011,-0.002,-0.021,0.043,0.031];
b_bak=[-0.03,-0.022,0.03,-0.048,0.034,0.002,0.006,0.037,0.017,-0.016,-0.008,...
0.019,0.024,-0.002,0.01,0.03,-0.018,0.046,0.022,0.005,0.03,-0.028,...
-0.028,0.019,0.005];
SIG=0.567+sum(b_sig.*wss_dist);
SIG=max(1,SIG); SIG=min(5, SIG); % limit values to [1, 5]
BAK=1.013+sum(b_bak.*wss_dist);
BAK=max(1,BAK); BAK=min(5, BAK); % limit values to [1, 5]
OVL=0.446+sum(b_ovl.*wss_dist);
OVL=max(1,OVL); OVL=min(5, OVL); % limit values to [1, 5]
% ----------------------------------------------------------------------
function distortion = fwseg(clean_speech, processed_speech,sample_rate)
% ----------------------------------------------------------------------
% Check the length of the clean and processed speech. Must be the same.
% ----------------------------------------------------------------------
clean_length = length(clean_speech);
processed_length = length(processed_speech);
if (clean_length ~= processed_length)
disp('Error: Files must have same length.');
return
end
% ----------------------------------------------------------------------
% Global Variables
% ----------------------------------------------------------------------
winlength = round(30*sample_rate/1000); % window length in samples
skiprate = floor(winlength/4); % window skip in samples
max_freq = sample_rate/2; % maximum bandwidth
num_crit = 25; % number of critical bands
n_fft = 2^nextpow2(2*winlength);
n_fftby2 = n_fft/2; % FFT size/2
% ----------------------------------------------------------------------
% Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz)
% ----------------------------------------------------------------------
cent_freq(1) = 50.0000; bandwidth(1) = 70.0000;
cent_freq(2) = 120.000; bandwidth(2) = 70.0000;
cent_freq(3) = 190.000; bandwidth(3) = 70.0000;
cent_freq(4) = 260.000; bandwidth(4) = 70.0000;
cent_freq(5) = 330.000; bandwidth(5) = 70.0000;
cent_freq(6) = 400.000; bandwidth(6) = 70.0000;
cent_freq(7) = 470.000; bandwidth(7) = 70.0000;
cent_freq(8) = 540.000; bandwidth(8) = 77.3724;
cent_freq(9) = 617.372; bandwidth(9) = 86.0056;
cent_freq(10) = 703.378; bandwidth(10) = 95.3398;
cent_freq(11) = 798.717; bandwidth(11) = 105.411;
cent_freq(12) = 904.128; bandwidth(12) = 116.256;
cent_freq(13) = 1020.38; bandwidth(13) = 127.914;
cent_freq(14) = 1148.30; bandwidth(14) = 140.423;
cent_freq(15) = 1288.72; bandwidth(15) = 153.823;
cent_freq(16) = 1442.54; bandwidth(16) = 168.154;
cent_freq(17) = 1610.70; bandwidth(17) = 183.457;
cent_freq(18) = 1794.16; bandwidth(18) = 199.776;
cent_freq(19) = 1993.93; bandwidth(19) = 217.153;
cent_freq(20) = 2211.08; bandwidth(20) = 235.631;
cent_freq(21) = 2446.71; bandwidth(21) = 255.255;
cent_freq(22) = 2701.97; bandwidth(22) = 276.072;
cent_freq(23) = 2978.04; bandwidth(23) = 298.126;
cent_freq(24) = 3276.17; bandwidth(24) = 321.465;
cent_freq(25) = 3597.63; bandwidth(25) = 346.136;
bw_min = bandwidth (1); % minimum critical bandwidth
% ----------------------------------------------------------------------
% Set up the critical band filters. Note here that Gaussianly shaped
% filters are used. Also, the sum of the filter weights is the same
% for each critical band filter. Filter values below the -30 dB point
% are set to zero.
% ----------------------------------------------------------------------
min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter
for i = 1:num_crit
f0 = (cent_freq (i) / max_freq) * (n_fftby2);
all_f0(i) = floor(f0);
bw = (bandwidth (i) / max_freq) * (n_fftby2);
norm_factor = log(bw_min) - log(bandwidth(i));
j = 0:1:n_fftby2-1;
crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor);
crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor);
end
% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Weighted Spectral
% Slope Measure
% ----------------------------------------------------------------------
num_frames = floor(clean_length/skiprate-(winlength/skiprate)); % number of frames
start = 1; % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));
distortion=zeros(num_frames,num_crit);
for frame_count = 1:num_frames
% ----------------------------------------------------------
% (1) Get the Frames for the test and reference speech.
% Multiply by Hanning Window.
% ----------------------------------------------------------
clean_frame = clean_speech(start:start+winlength-1);
processed_frame = processed_speech(start:start+winlength-1);
clean_frame = clean_frame.*window;
processed_frame = processed_frame.*window;
% ----------------------------------------------------------
% (2) Compute the magnitude Spectrum of Clean and Processed
% ----------------------------------------------------------
clean_spec = abs(fft(clean_frame,n_fft));
processed_spec = abs(fft(processed_frame,n_fft));
% normalize so that spectra have unit area ----
clean_spec=clean_spec/sum(clean_spec(1:n_fftby2));
processed_spec=processed_spec/sum(processed_spec(1:n_fftby2));
% ----------------------------------------------------------
% (3) Compute Filterbank Output Energies (in dB scale)
% ----------------------------------------------------------
clean_energy=zeros(1,num_crit);
processed_energy=zeros(1,num_crit);
error_energy=zeros(1,num_crit);
for i = 1:num_crit
clean_energy(i) = sum(clean_spec(1:n_fftby2) ...
.*crit_filter(i,:)');
processed_energy(i) = sum(processed_spec(1:n_fftby2) ...
.*crit_filter(i,:)');
error_energy(i)=max((clean_energy(i)-processed_energy(i))^2,eps);
end
SNRlog=10*log10((clean_energy.^2)./error_energy);
distortion(frame_count,:)=min(max(SNRlog,-10),35);
start = start + skiprate;
end

View File

@ -0,0 +1,188 @@
function is_mean= comp_is(cleanFile, enhdFile);
% ----------------------------------------------------------------------
% Itakura-Saito (IS) Objective Speech Quality Measure
%
% This function implements the Itakura-Saito distance measure
% defined on page 50 of [1] (see Equation 2.26). See also
% Equation 12 (page 1480) of [2].
%
% Usage: IS=comp_is(cleanFile.wav, enhancedFile.wav)
%
% cleanFile.wav - clean input file in .wav format
% enhancedFile - enhanced output file in .wav format
% IS - computed Itakura Saito measure
%
% Note that the IS measure is limited in the range [0, 100].
%
% Example call: IS =comp_is('sp04.wav','enhanced.wav')
%
%
% References:
%
% [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements,
% Objective Measures of Speech Quality. Prentice Hall
% Advanced Reference Series, Englewood Cliffs, NJ, 1988,
% ISBN: 0-13-629056-6.
%
% [2] B.-H. Juang, "On Using the Itakura-Saito Measures for
% Speech Coder Performance Evaluation", AT&T Bell
% Laboratories Technical Journal, Vol. 63, No. 8,
% October 1984, pp. 1477-1498.
%
% Authors: Bryan L. Pellom and John H. L. Hansen (July 1998)
% Modified by: Philipos C. Loizou (Oct 2006) - limited IS to be in [0,100]
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
% ----------------------------------------------------------------------
if nargin~=2
fprintf('USAGE: IS=comp_is(cleanFile.wav, enhancedFile.wav)\n');
fprintf('For more help, type: help comp_is\n\n');
return;
end
alpha=0.95;
[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhdFile);
if ( Srate1~= Srate2) | ( Nbits1~= Nbits2)
error( 'The two files do not match!\n');
end
len= min( length( data1), length( data2));
data1= data1( 1: len)+eps;
data2= data2( 1: len)+eps;
IS_dist= is( data1, data2,Srate1);
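% average only the smallest alpha = 95% of the frame distances, discarding the
% largest 5% as outliers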
IS_len= round( length( IS_dist)* alpha);
IS= sort( IS_dist);
is_mean= mean( IS( 1: IS_len));
function distortion = is(clean_speech, processed_speech,sample_rate)
% ----------------------------------------------------------------------
% Check the length of the clean and processed speech. Must be the same.
% ----------------------------------------------------------------------
clean_length = length(clean_speech);
processed_length = length(processed_speech);
if (clean_length ~= processed_length)
disp('Error: Both Speech Files must be same length.');
return
end
% ----------------------------------------------------------------------
% Scale both clean speech and processed speech to have same dynamic
% range. Also remove DC component from each signal
% ----------------------------------------------------------------------
%clean_speech = clean_speech - mean(clean_speech);
%processed_speech = processed_speech - mean(processed_speech);
%processed_speech = processed_speech.*(max(abs(clean_speech))/ max(abs(processed_speech)));
% ----------------------------------------------------------------------
% Global Variables
% ----------------------------------------------------------------------
%sample_rate = 8000; % default sample rate
winlength = round(30*sample_rate/1000); %240; % window length in samples
skiprate = floor(winlength/4); % window skip in samples
if sample_rate<10000
P = 10; % LPC Analysis Order
else
P=16; % this could vary depending on sampling frequency.
end
% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Itakura-Saito Measure
% ----------------------------------------------------------------------
num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames
start = 1; % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));
for frame_count = 1:num_frames
% ----------------------------------------------------------
% (1) Get the Frames for the test and reference speech.
% Multiply by Hanning Window.
% ----------------------------------------------------------
clean_frame = clean_speech(start:start+winlength-1);
processed_frame = processed_speech(start:start+winlength-1);
clean_frame = clean_frame.*window;
processed_frame = processed_frame.*window;
% ----------------------------------------------------------
% (2) Get the autocorrelation lags and LPC parameters used
% to compute the IS measure.
% ----------------------------------------------------------
[R_clean, Ref_clean, A_clean] = ...
lpcoeff(clean_frame, P);
[R_processed, Ref_processed, A_processed] = ...
lpcoeff(processed_frame, P);
% ----------------------------------------------------------
% (3) Compute the IS measure
% ----------------------------------------------------------
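% IS = (sigma_c^2/sigma_p^2)*(a_p*R_c*a_p')/(a_c*R_c*a_c') + log(sigma_p^2/sigma_c^2) - 1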
numerator = A_processed*toeplitz(R_clean)*A_processed';
denominator = max(A_clean*toeplitz(R_clean)*A_clean',eps);
gain_clean = max(R_clean*A_clean',eps); % this is gain
gain_processed = max(R_processed*A_processed',eps); % squared (sigma^2)
ISvalue=(gain_clean/gain_processed)*(numerator/denominator) + ...
log(gain_processed/gain_clean)-1;
distortion(frame_count) = min(ISvalue,100);
start = start + skiprate;
end
function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order)
% ----------------------------------------------------------
% (1) Compute Autocorrelation Lags
% ----------------------------------------------------------
winlength = max(size(speech_frame));
for k=1:model_order+1
R(k) = sum(speech_frame(1:winlength-k+1) ...
.*speech_frame(k:winlength));
end
% ----------------------------------------------------------
% (2) Levinson-Durbin
% ----------------------------------------------------------
a = ones(1,model_order);
E(1)=R(1);
for i=1:model_order
a_past(1:i-1) = a(1:i-1);
sum_term = sum(a_past(1:i-1).*R(i:-1:2));
rcoeff(i)=(R(i+1) - sum_term) / E(i);
a(i)=rcoeff(i);
a(1:i-1) = a_past(1:i-1) - rcoeff(i).*a_past(i-1:-1:1);
E(i+1)=(1-rcoeff(i)*rcoeff(i))*E(i);
end
acorr = R;
refcoeff = rcoeff;
lpparams = [1 -a];

View File

@ -0,0 +1,162 @@
function llr_mean= comp_llr(cleanFile, enhancedFile);
% ----------------------------------------------------------------------
%
% Log Likelihood Ratio (LLR) Objective Speech Quality Measure
%
%
% This function implements the Log Likelihood Ratio Measure
% defined on page 48 of [1] (see Equation 2.18).
%
% Usage: llr=comp_llr(cleanFile.wav, enhancedFile.wav)
%
% cleanFile.wav - clean input file in .wav format
% enhancedFile - enhanced output file in .wav format
% llr - computed likelihood ratio
%
% Note that the LLR measure is limited in the range [0, 2].
%
% Example call: llr =comp_llr('sp04.wav','enhanced.wav')
%
%
% References:
%
% [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements,
% Objective Measures of Speech Quality. Prentice Hall
% Advanced Reference Series, Englewood Cliffs, NJ, 1988,
% ISBN: 0-13-629056-6.
%
% Authors: Bryan L. Pellom and John H. L. Hansen (July 1998)
% Modified by: Philipos C. Loizou (Oct 2006) - limited LLR to be in [0,2]
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
% ----------------------------------------------------------------------
if nargin~=2
fprintf('USAGE: LLR=comp_llr(cleanFile.wav, enhancedFile.wav)\n');
fprintf('For more help, type: help comp_llr\n\n');
return;
end
alpha=0.95;
[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhancedFile);
if ( Srate1~= Srate2) | ( Nbits1~= Nbits2)
error( 'The two files do not match!\n');
end
len= min( length( data1), length( data2));
data1= data1( 1: len)+eps;
data2= data2( 1: len)+eps;
IS_dist= llr( data1, data2,Srate1);
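% average only the smallest alpha = 95% of the frame distances, discarding the
% largest 5% as outliers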
IS_len= round( length( IS_dist)* alpha);
IS= sort( IS_dist);
llr_mean= mean( IS( 1: IS_len));
function distortion = llr(clean_speech, processed_speech,sample_rate)
% ----------------------------------------------------------------------
% Check the length of the clean and processed speech. Must be the same.
% ----------------------------------------------------------------------
clean_length = length(clean_speech);
processed_length = length(processed_speech);
if (clean_length ~= processed_length)
disp('Error: Both Speech Files must be same length.');
return
end
% ----------------------------------------------------------------------
% Global Variables
% ----------------------------------------------------------------------
winlength = round(30*sample_rate/1000); %240; % window length in samples
skiprate = floor(winlength/4); % window skip in samples
if sample_rate<10000
P = 10; % LPC Analysis Order
else
P=16; % this could vary depending on sampling frequency.
end
% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Log Likelihood Ratio
% ----------------------------------------------------------------------
num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames
start = 1; % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));
for frame_count = 1:num_frames
% ----------------------------------------------------------
% (1) Get the Frames for the test and reference speech.
% Multiply by Hanning Window.
% ----------------------------------------------------------
clean_frame = clean_speech(start:start+winlength-1);
processed_frame = processed_speech(start:start+winlength-1);
clean_frame = clean_frame.*window;
processed_frame = processed_frame.*window;
% ----------------------------------------------------------
% (2) Get the autocorrelation lags and LPC parameters used
% to compute the LLR measure.
% ----------------------------------------------------------
[R_clean, Ref_clean, A_clean] = ...
lpcoeff(clean_frame, P);
[R_processed, Ref_processed, A_processed] = ...
lpcoeff(processed_frame, P);
% ----------------------------------------------------------
% (3) Compute the LLR measure
% ----------------------------------------------------------
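% Per-frame LLR: log( (a_d * R_c * a_d') / (a_c * R_c * a_c') ), where a_c
% and a_d are the LPC coefficient vectors of the clean and processed frames
% and toeplitz(R_clean) is the clean-frame autocorrelation matrix; the value
% is capped at 2 as noted in the header.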
numerator = A_processed*toeplitz(R_clean)*A_processed';
denominator = A_clean*toeplitz(R_clean)*A_clean';
distortion(frame_count) = min(2,log(numerator/denominator));
start = start + skiprate;
end
function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order)
% ----------------------------------------------------------
% (1) Compute Autocorrelation Lags
% ----------------------------------------------------------
winlength = max(size(speech_frame));
for k=1:model_order+1
R(k) = sum(speech_frame(1:winlength-k+1) ...
.*speech_frame(k:winlength));
end
% ----------------------------------------------------------
% (2) Levinson-Durbin
% ----------------------------------------------------------
a = ones(1,model_order);
E(1)=R(1);
for i=1:model_order
a_past(1:i-1) = a(1:i-1);
sum_term = sum(a_past(1:i-1).*R(i:-1:2));
rcoeff(i)=(R(i+1) - sum_term) / E(i);
a(i)=rcoeff(i);
a(1:i-1) = a_past(1:i-1) - rcoeff(i).*a_past(i-1:-1:1);
E(i+1)=(1-rcoeff(i)*rcoeff(i))*E(i);
end
acorr = R;
refcoeff = rcoeff;
lpparams = [1 -a];

View File

@ -0,0 +1,132 @@
function [snr_mean, segsnr_mean]= comp_SNR(cleanFile, enhdFile);
%
% Segmental Signal-to-Noise Ratio Objective Speech Quality Measure
%
% This function implements the segmental signal-to-noise ratio
% as defined in [1, p. 45] (see Equation 2.12).
%
% Usage: [SNRovl, SNRseg]=comp_snr(cleanFile.wav, enhancedFile.wav)
%
% cleanFile.wav - clean input file in .wav format
% enhancedFile - enhanced output file in .wav format
% SNRovl - overall SNR (dB)
% SNRseg - segmental SNR (dB)
%
% This function returns two values. The first is the
% overall SNR for the two speech signals. The second is the
% segmental signal-to-noise ratio (one seg-SNR value per
% frame of input). The segmental SNR is clamped to the range
% between -10 dB and 35 dB (see suggestions in [2]).
%
% Example call: [SNRovl,SNRseg]=comp_SNR('sp04.wav','enhanced.wav')
%
% References:
%
% [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements,
% Objective Measures of Speech Quality. Prentice Hall
% Advanced Reference Series, Englewood Cliffs, NJ, 1988,
% ISBN: 0-13-629056-6.
%
% [2] P. E. Papamichalis, Practical Approaches to Speech
% Coding, Prentice-Hall, Englewood Cliffs, NJ, 1987.
% ISBN: 0-13-689019-9. (see pages 179-181).
%
% Authors: Bryan L. Pellom and John H. L. Hansen (July 1998)
% Modified by: Philipos C. Loizou (Oct 2006)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
%-------------------------------------------------------------------------
if nargin ~=2
fprintf('USAGE: [snr_mean, segsnr_mean]= comp_SNR(cleanFile, enhdFile) \n');
return;
end
[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhdFile);
if (( Srate1~= Srate2) | ( Nbits1~= Nbits2))
error( 'The two files do not match!\n');
end
len= min( length( data1), length( data2));
data1= data1( 1: len);
data2= data2( 1: len);
[snr_dist, segsnr_dist]= snr( data1, data2,Srate1);
snr_mean= snr_dist;
segsnr_mean= mean( segsnr_dist);
% =========================================================================
function [overall_snr, segmental_snr] = snr(clean_speech, processed_speech,sample_rate)
% ----------------------------------------------------------------------
% Check the length of the clean and processed speech. Must be the same.
% ----------------------------------------------------------------------
clean_length = length(clean_speech);
processed_length = length(processed_speech);
if (clean_length ~= processed_length)
disp('Error: Both Speech Files must be same length.');
return
end
% ----------------------------------------------------------------------
% Scale both clean speech and processed speech to have same dynamic
% range. Also remove DC component from each signal
% ----------------------------------------------------------------------
%clean_speech = clean_speech - mean(clean_speech);
%processed_speech = processed_speech - mean(processed_speech);
%processed_speech = processed_speech.*(max(abs(clean_speech))/ max(abs(processed_speech)));
overall_snr = 10* log10( sum(clean_speech.^2)/sum((clean_speech-processed_speech).^2));
% ----------------------------------------------------------------------
% Global Variables
% ----------------------------------------------------------------------
winlength = round(30*sample_rate/1000); %240; % window length in samples for 30-msecs
skiprate = floor(winlength/4); %60; % window skip in samples
MIN_SNR = -10; % minimum SNR in dB
MAX_SNR = 35; % maximum SNR in dB
% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Segmental SNR
% ----------------------------------------------------------------------
num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames
start = 1; % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));
for frame_count = 1: num_frames
% ----------------------------------------------------------
% (1) Get the Frames for the test and reference speech.
% Multiply by Hanning Window.
% ----------------------------------------------------------
clean_frame = clean_speech(start:start+winlength-1);
processed_frame = processed_speech(start:start+winlength-1);
clean_frame = clean_frame.*window;
processed_frame = processed_frame.*window;
% ----------------------------------------------------------
% (2) Compute the Segmental SNR
% ----------------------------------------------------------
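% Frame SNR: 10*log10( sum(clean_frame.^2) / sum((clean_frame -
% processed_frame).^2) ), guarded with eps and clamped to the
% [MIN_SNR, MAX_SNR] dB range before averaging across frames.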
signal_energy = sum(clean_frame.^2);
noise_energy = sum((clean_frame-processed_frame).^2);
segmental_snr(frame_count) = 10*log10(signal_energy/(noise_energy+eps)+eps);
segmental_snr(frame_count) = max(segmental_snr(frame_count),MIN_SNR);
segmental_snr(frame_count) = min(segmental_snr(frame_count),MAX_SNR);
start = start + skiprate;
end

View File

@ -0,0 +1,299 @@
function wss_dist= comp_wss(cleanFile, enhancedFile);
% ----------------------------------------------------------------------
%
% Weighted Spectral Slope (WSS) Objective Speech Quality Measure
%
% This function implements the Weighted Spectral Slope (WSS)
% distance measure originally proposed in [1]. The algorithm
% works by first decomposing the speech signal into a set of
% frequency bands (this is done for both the test and reference
% frame). The intensities within each critical band are
% measured. Then, a weighted distance between the measured
% slopes of the log-critical band spectra is computed.
% This measure is also described in Section 2.2.9 (pages 56-58)
% of [2].
%
% Whereas Klatt's original measure used 36 critical-band
% filters to estimate the smoothed short-time spectrum, this
% implementation considers a bank of 25 filters spanning
% the 4 kHz bandwidth.
%
% Usage: wss_dist=comp_wss(cleanFile.wav, enhancedFile.wav)
%
% cleanFile.wav - clean input file in .wav format
% enhancedFile - enhanced output file in .wav format
% wss_dist - computed spectral slope distance
%
% Example call: ws =comp_wss('sp04.wav','enhanced.wav')
%
% References:
%
% [1] D. H. Klatt, "Prediction of Perceived Phonetic Distance
% from Critical-Band Spectra: A First Step", Proc. IEEE
% ICASSP'82, Volume 2, pp. 1278-1281, May, 1982.
%
% [2] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements,
% Objective Measures of Speech Quality. Prentice Hall
% Advanced Reference Series, Englewood Cliffs, NJ, 1988,
% ISBN: 0-13-629056-6.
%
% Authors: Bryan L. Pellom and John H. L. Hansen (July 1998)
% Modified by: Philipos C. Loizou (Oct 2006)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
%
% ----------------------------------------------------------------------
if nargin~=2
fprintf('USAGE: WSS=comp_wss(cleanFile.wav, enhancedFile.wav)\n');
fprintf('For more help, type: help comp_wss\n\n');
return;
end
alpha= 0.95;
[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhancedFile);
if ( Srate1~= Srate2) | ( Nbits1~= Nbits2)
error( 'The two files do not match!\n');
end
len= min( length( data1), length( data2));
data1= data1( 1: len)+eps;
data2= data2( 1: len)+eps;
wss_dist_vec= wss( data1, data2,Srate1);
wss_dist_vec= sort( wss_dist_vec);
wss_dist= mean( wss_dist_vec( 1: round( length( wss_dist_vec)*alpha)));
function distortion = wss(clean_speech, processed_speech,sample_rate)
% ----------------------------------------------------------------------
% Check the length of the clean and processed speech. Must be the same.
% ----------------------------------------------------------------------
clean_length = length(clean_speech);
processed_length = length(processed_speech);
if (clean_length ~= processed_length)
disp('Error: Files must have same length.');
return
end
% ----------------------------------------------------------------------
% Global Variables
% ----------------------------------------------------------------------
winlength = round(30*sample_rate/1000); % window length in samples
skiprate = floor(winlength/4); % window skip in samples
max_freq = sample_rate/2; % maximum bandwidth
num_crit = 25; % number of critical bands
USE_FFT_SPECTRUM = 1; % 1 = use FFT power spectrum; 0 = use 10th-order LP spectrum
n_fft = 2^nextpow2(2*winlength);
n_fftby2 = n_fft/2; % FFT size/2
Kmax = 20; % value suggested by Klatt, pg 1280
Klocmax = 1; % value suggested by Klatt, pg 1280
% ----------------------------------------------------------------------
% Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz)
% ----------------------------------------------------------------------
cent_freq(1) = 50.0000; bandwidth(1) = 70.0000;
cent_freq(2) = 120.000; bandwidth(2) = 70.0000;
cent_freq(3) = 190.000; bandwidth(3) = 70.0000;
cent_freq(4) = 260.000; bandwidth(4) = 70.0000;
cent_freq(5) = 330.000; bandwidth(5) = 70.0000;
cent_freq(6) = 400.000; bandwidth(6) = 70.0000;
cent_freq(7) = 470.000; bandwidth(7) = 70.0000;
cent_freq(8) = 540.000; bandwidth(8) = 77.3724;
cent_freq(9) = 617.372; bandwidth(9) = 86.0056;
cent_freq(10) = 703.378; bandwidth(10) = 95.3398;
cent_freq(11) = 798.717; bandwidth(11) = 105.411;
cent_freq(12) = 904.128; bandwidth(12) = 116.256;
cent_freq(13) = 1020.38; bandwidth(13) = 127.914;
cent_freq(14) = 1148.30; bandwidth(14) = 140.423;
cent_freq(15) = 1288.72; bandwidth(15) = 153.823;
cent_freq(16) = 1442.54; bandwidth(16) = 168.154;
cent_freq(17) = 1610.70; bandwidth(17) = 183.457;
cent_freq(18) = 1794.16; bandwidth(18) = 199.776;
cent_freq(19) = 1993.93; bandwidth(19) = 217.153;
cent_freq(20) = 2211.08; bandwidth(20) = 235.631;
cent_freq(21) = 2446.71; bandwidth(21) = 255.255;
cent_freq(22) = 2701.97; bandwidth(22) = 276.072;
cent_freq(23) = 2978.04; bandwidth(23) = 298.126;
cent_freq(24) = 3276.17; bandwidth(24) = 321.465;
cent_freq(25) = 3597.63; bandwidth(25) = 346.136;
bw_min = bandwidth (1); % minimum critical bandwidth
% ----------------------------------------------------------------------
% Set up the critical band filters. Note here that Gaussianly shaped
% filters are used. Also, the sum of the filter weights are equivalent
% for each critical band filter. Filter less than -30 dB and set to
% zero.
% ----------------------------------------------------------------------
min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter
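% Each critical-band filter is a Gaussian in FFT-bin index, centered at f0
% with width proportional to bw; norm_factor scales every band relative to
% the narrowest band, and weights below the -30 dB point (min_factor) are
% zeroed out.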
for i = 1:num_crit
f0 = (cent_freq (i) / max_freq) * (n_fftby2);
all_f0(i) = floor(f0);
bw = (bandwidth (i) / max_freq) * (n_fftby2);
norm_factor = log(bw_min) - log(bandwidth(i));
j = 0:1:n_fftby2-1;
crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor);
crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor);
end
% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Weighted Spectral
% Slope Measure
% ----------------------------------------------------------------------
num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames
start = 1; % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));
for frame_count = 1:num_frames
% ----------------------------------------------------------
% (1) Get the Frames for the test and reference speech.
% Multiply by Hanning Window.
% ----------------------------------------------------------
clean_frame = clean_speech(start:start+winlength-1);
processed_frame = processed_speech(start:start+winlength-1);
clean_frame = clean_frame.*window;
processed_frame = processed_frame.*window;
% ----------------------------------------------------------
% (2) Compute the Power Spectrum of Clean and Processed
% ----------------------------------------------------------
if (USE_FFT_SPECTRUM)
clean_spec = (abs(fft(clean_frame,n_fft)).^2);
processed_spec = (abs(fft(processed_frame,n_fft)).^2);
else
a_vec = zeros(1,n_fft);
a_vec(1:11) = lpc(clean_frame,10);
clean_spec = 1.0/(abs(fft(a_vec,n_fft)).^2)';
a_vec = zeros(1,n_fft);
a_vec(1:11) = lpc(processed_frame,10);
processed_spec = 1.0/(abs(fft(a_vec,n_fft)).^2)';
end
% ----------------------------------------------------------
% (3) Compute Filterbank Output Energies (in dB scale)
% ----------------------------------------------------------
for i = 1:num_crit
clean_energy(i) = sum(clean_spec(1:n_fftby2) ...
.*crit_filter(i,:)');
processed_energy(i) = sum(processed_spec(1:n_fftby2) ...
.*crit_filter(i,:)');
end
clean_energy = 10*log10(max(clean_energy,1E-10));
processed_energy = 10*log10(max(processed_energy,1E-10));
% ----------------------------------------------------------
% (4) Compute Spectral Slope (dB[i+1]-dB[i])
% ----------------------------------------------------------
clean_slope = clean_energy(2:num_crit) - ...
clean_energy(1:num_crit-1);
processed_slope = processed_energy(2:num_crit) - ...
processed_energy(1:num_crit-1);
% ----------------------------------------------------------
% (5) Find the nearest peak locations in the spectra to
% each critical band. If the slope is negative, we
% search to the left. If positive, we search to the
% right.
% ----------------------------------------------------------
for i = 1:num_crit-1
% find the peaks in the clean speech signal
if (clean_slope(i)>0) % search to the right
n = i;
while ((n<num_crit) & (clean_slope(n) > 0))
n = n+1;
end
clean_loc_peak(i) = clean_energy(n-1);
else % search to the left
n = i;
while ((n>0) & (clean_slope(n) <= 0))
n = n-1;
end
clean_loc_peak(i) = clean_energy(n+1);
end
% find the peaks in the processed speech signal
if (processed_slope(i)>0) % search to the right
n = i;
while ((n<num_crit) & (processed_slope(n) > 0))
n = n+1;
end
processed_loc_peak(i) = processed_energy(n-1);
else % search to the left
n = i;
while ((n>0) & (processed_slope(n) <= 0))
n = n-1;
end
processed_loc_peak(i) = processed_energy(n+1);
end
end
% ----------------------------------------------------------
% (6) Compute the WSS Measure for this frame. This
% includes determination of the weighting function.
% ----------------------------------------------------------
dBMax_clean = max(clean_energy);
dBMax_processed = max(processed_energy);
% The weights are calculated by averaging individual
% weighting factors from the clean and processed frame.
% These weights W_clean and W_processed should range
% from 0 to 1 and place more emphasis on spectral
% peaks and less emphasis on slope differences in spectral
% valleys. This procedure is described on page 1280 of
% Klatt's 1982 ICASSP paper.
Wmax_clean = Kmax ./ (Kmax + dBMax_clean - ...
clean_energy(1:num_crit-1));
Wlocmax_clean = Klocmax ./ ( Klocmax + clean_loc_peak - ...
clean_energy(1:num_crit-1));
W_clean = Wmax_clean .* Wlocmax_clean;
Wmax_processed = Kmax ./ (Kmax + dBMax_processed - ...
processed_energy(1:num_crit-1));
Wlocmax_processed = Klocmax ./ ( Klocmax + processed_loc_peak - ...
processed_energy(1:num_crit-1));
W_processed = Wmax_processed .* Wlocmax_processed;
W = (W_clean + W_processed)./2.0;
distortion(frame_count) = sum(W.*(clean_slope(1:num_crit-1) - ...
processed_slope(1:num_crit-1)).^2);
% this normalization is not part of Klatt's paper, but helps
% to normalize the measure. Here we scale the measure by the
% sum of the weights.
distortion(frame_count) = distortion(frame_count)/sum(W);
start = start + skiprate;
end

View File

@ -0,0 +1,496 @@
function [Csig,Cbak,Covl]= composite(cleanFile, enhancedFile);
% ----------------------------------------------------------------------
% Composite Objective Speech Quality Measure
%
% This function implements the composite objective measure proposed in
% [1].
%
% Usage: [sig,bak,ovl]=composite(cleanFile.wav, enhancedFile.wav)
%
% cleanFile.wav - clean input file in .wav format
% enhancedFile - enhanced output file in .wav format
% sig - predicted rating [1-5] of speech distortion
% bak - predicted rating [1-5] of noise distortion
% ovl - predicted rating [1-5] of overall quality
%
% In addition to the above ratings (sig, bak, & ovl), it prints
% the individual values of the LLR, SNRseg, WSS, and PESQ measures.
%
% Example call: [sig,bak,ovl] =composite('sp04.wav','enhanced.wav')
%
%
% References:
%
% [1] Hu, Y. and Loizou, P. (2006). Evaluation of objective measures
% for speech enhancement. Proc. Interspeech, Pittsburg, PA.
%
% Authors: Yi Hu and Philipos C. Loizou
% (the LLR, SNRseg and WSS measures were based on Bryan Pellom and John
% Hansen's implementations)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
% ----------------------------------------------------------------------
if nargin~=2
fprintf('USAGE: [sig,bak,ovl]=composite(cleanFile.wav, enhancedFile.wav)\n');
fprintf('For more help, type: help composite\n\n');
return;
end
alpha= 0.95;
[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhancedFile);
if ( Srate1~= Srate2) | ( Nbits1~= Nbits2)
error( 'The two files do not match!\n');
end
len= min( length( data1), length( data2));
data1= data1( 1: len)+eps;
data2= data2( 1: len)+eps;
% -- compute the WSS measure ---
%
wss_dist_vec= wss( data1, data2,Srate1);
wss_dist_vec= sort( wss_dist_vec);
wss_dist= mean( wss_dist_vec( 1: round( length( wss_dist_vec)*alpha)));
% --- compute the LLR measure ---------
%
LLR_dist= llr( data1, data2,Srate1);
LLRs= sort(LLR_dist);
LLR_len= round( length(LLR_dist)* alpha);
llr_mean= mean( LLRs( 1: LLR_len));
% --- compute the SNRseg ----------------
%
[snr_dist, segsnr_dist]= snr( data1, data2,Srate1);
snr_mean= snr_dist;
segSNR= mean( segsnr_dist);
% -- compute the pesq ----
[pesq_mos]= pesq(cleanFile, enhancedFile);
% --- now compute the composite measures ------------------
%
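% Linear regression mappings from the basic measures (LLR, WSS, segSNR,
% PESQ) onto predicted MOS ratings, using the coefficients proposed in [1];
% each prediction is then clipped to the [1, 5] MOS scale.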
Csig = 3.093 - 1.029*llr_mean + 0.603*pesq_mos-0.009*wss_dist;
Csig = max(1,Csig); Csig=min(5, Csig); % limit values to [1, 5]
Cbak = 1.634 + 0.478 *pesq_mos - 0.007*wss_dist + 0.063*segSNR;
Cbak = max(1, Cbak); Cbak=min(5,Cbak); % limit values to [1, 5]
Covl = 1.594 + 0.805*pesq_mos - 0.512*llr_mean - 0.007*wss_dist;
Covl = max(1, Covl); Covl=min(5, Covl); % limit values to [1, 5]
fprintf('\n LLR=%f SNRseg=%f WSS=%f PESQ=%f\n',llr_mean,segSNR,wss_dist,pesq_mos);
return; %=================================================================
function distortion = wss(clean_speech, processed_speech,sample_rate)
% ----------------------------------------------------------------------
% Check the length of the clean and processed speech. Must be the same.
% ----------------------------------------------------------------------
clean_length = length(clean_speech);
processed_length = length(processed_speech);
if (clean_length ~= processed_length)
disp('Error: Files must have same length.');
return
end
% ----------------------------------------------------------------------
% Global Variables
% ----------------------------------------------------------------------
winlength = round(30*sample_rate/1000); %240; % window length in samples
skiprate = floor(winlength/4); % window skip in samples
max_freq = sample_rate/2; % maximum bandwidth
num_crit = 25; % number of critical bands
USE_FFT_SPECTRUM = 1; % 1 = use FFT power spectrum; 0 = use 10th-order LP spectrum
n_fft = 2^nextpow2(2*winlength);
n_fftby2 = n_fft/2; % FFT size/2
Kmax = 20; % value suggested by Klatt, pg 1280
Klocmax = 1; % value suggested by Klatt, pg 1280
% ----------------------------------------------------------------------
% Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz)
% ----------------------------------------------------------------------
cent_freq(1) = 50.0000; bandwidth(1) = 70.0000;
cent_freq(2) = 120.000; bandwidth(2) = 70.0000;
cent_freq(3) = 190.000; bandwidth(3) = 70.0000;
cent_freq(4) = 260.000; bandwidth(4) = 70.0000;
cent_freq(5) = 330.000; bandwidth(5) = 70.0000;
cent_freq(6) = 400.000; bandwidth(6) = 70.0000;
cent_freq(7) = 470.000; bandwidth(7) = 70.0000;
cent_freq(8) = 540.000; bandwidth(8) = 77.3724;
cent_freq(9) = 617.372; bandwidth(9) = 86.0056;
cent_freq(10) = 703.378; bandwidth(10) = 95.3398;
cent_freq(11) = 798.717; bandwidth(11) = 105.411;
cent_freq(12) = 904.128; bandwidth(12) = 116.256;
cent_freq(13) = 1020.38; bandwidth(13) = 127.914;
cent_freq(14) = 1148.30; bandwidth(14) = 140.423;
cent_freq(15) = 1288.72; bandwidth(15) = 153.823;
cent_freq(16) = 1442.54; bandwidth(16) = 168.154;
cent_freq(17) = 1610.70; bandwidth(17) = 183.457;
cent_freq(18) = 1794.16; bandwidth(18) = 199.776;
cent_freq(19) = 1993.93; bandwidth(19) = 217.153;
cent_freq(20) = 2211.08; bandwidth(20) = 235.631;
cent_freq(21) = 2446.71; bandwidth(21) = 255.255;
cent_freq(22) = 2701.97; bandwidth(22) = 276.072;
cent_freq(23) = 2978.04; bandwidth(23) = 298.126;
cent_freq(24) = 3276.17; bandwidth(24) = 321.465;
cent_freq(25) = 3597.63; bandwidth(25) = 346.136;
bw_min = bandwidth (1); % minimum critical bandwidth
% ----------------------------------------------------------------------
% Set up the critical band filters. Note here that Gaussianly shaped
% filters are used. Also, the sum of the filter weights are equivalent
% for each critical band filter. Filter less than -30 dB and set to
% zero.
% ----------------------------------------------------------------------
min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter
for i = 1:num_crit
f0 = (cent_freq (i) / max_freq) * (n_fftby2);
all_f0(i) = floor(f0);
bw = (bandwidth (i) / max_freq) * (n_fftby2);
norm_factor = log(bw_min) - log(bandwidth(i));
j = 0:1:n_fftby2-1;
crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor);
crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor);
end
% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Weighted Spectral
% Slope Measure
% ----------------------------------------------------------------------
num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames
start = 1; % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));
for frame_count = 1:num_frames
% ----------------------------------------------------------
% (1) Get the Frames for the test and reference speech.
% Multiply by Hanning Window.
% ----------------------------------------------------------
clean_frame = clean_speech(start:start+winlength-1);
processed_frame = processed_speech(start:start+winlength-1);
clean_frame = clean_frame.*window;
processed_frame = processed_frame.*window;
% ----------------------------------------------------------
% (2) Compute the Power Spectrum of Clean and Processed
% ----------------------------------------------------------
if (USE_FFT_SPECTRUM)
clean_spec = (abs(fft(clean_frame,n_fft)).^2);
processed_spec = (abs(fft(processed_frame,n_fft)).^2);
else
a_vec = zeros(1,n_fft);
a_vec(1:11) = lpc(clean_frame,10);
clean_spec = 1.0/(abs(fft(a_vec,n_fft)).^2)';
a_vec = zeros(1,n_fft);
a_vec(1:11) = lpc(processed_frame,10);
processed_spec = 1.0/(abs(fft(a_vec,n_fft)).^2)';
end
% ----------------------------------------------------------
% (3) Compute Filterbank Output Energies (in dB scale)
% ----------------------------------------------------------
for i = 1:num_crit
clean_energy(i) = sum(clean_spec(1:n_fftby2) ...
.*crit_filter(i,:)');
processed_energy(i) = sum(processed_spec(1:n_fftby2) ...
.*crit_filter(i,:)');
end
clean_energy = 10*log10(max(clean_energy,1E-10));
processed_energy = 10*log10(max(processed_energy,1E-10));
% ----------------------------------------------------------
% (4) Compute Spectral Slope (dB[i+1]-dB[i])
% ----------------------------------------------------------
clean_slope = clean_energy(2:num_crit) - ...
clean_energy(1:num_crit-1);
processed_slope = processed_energy(2:num_crit) - ...
processed_energy(1:num_crit-1);
% ----------------------------------------------------------
% (5) Find the nearest peak locations in the spectra to
% each critical band. If the slope is negative, we
% search to the left. If positive, we search to the
% right.
% ----------------------------------------------------------
for i = 1:num_crit-1
% find the peaks in the clean speech signal
if (clean_slope(i)>0) % search to the right
n = i;
while ((n<num_crit) & (clean_slope(n) > 0))
n = n+1;
end
clean_loc_peak(i) = clean_energy(n-1);
else % search to the left
n = i;
while ((n>0) & (clean_slope(n) <= 0))
n = n-1;
end
clean_loc_peak(i) = clean_energy(n+1);
end
% find the peaks in the processed speech signal
if (processed_slope(i)>0) % search to the right
n = i;
while ((n<num_crit) & (processed_slope(n) > 0))
n = n+1;
end
processed_loc_peak(i) = processed_energy(n-1);
else % search to the left
n = i;
while ((n>0) & (processed_slope(n) <= 0))
n = n-1;
end
processed_loc_peak(i) = processed_energy(n+1);
end
end
% ----------------------------------------------------------
% (6) Compute the WSS Measure for this frame. This
% includes determination of the weighting function.
% ----------------------------------------------------------
dBMax_clean = max(clean_energy);
dBMax_processed = max(processed_energy);
% The weights are calculated by averaging individual
% weighting factors from the clean and processed frame.
% These weights W_clean and W_processed should range
% from 0 to 1 and place more emphasis on spectral
% peaks and less emphasis on slope differences in spectral
% valleys. This procedure is described on page 1280 of
% Klatt's 1982 ICASSP paper.
Wmax_clean = Kmax ./ (Kmax + dBMax_clean - ...
clean_energy(1:num_crit-1));
Wlocmax_clean = Klocmax ./ ( Klocmax + clean_loc_peak - ...
clean_energy(1:num_crit-1));
W_clean = Wmax_clean .* Wlocmax_clean;
Wmax_processed = Kmax ./ (Kmax + dBMax_processed - ...
processed_energy(1:num_crit-1));
Wlocmax_processed = Klocmax ./ ( Klocmax + processed_loc_peak - ...
processed_energy(1:num_crit-1));
W_processed = Wmax_processed .* Wlocmax_processed;
W = (W_clean + W_processed)./2.0;
distortion(frame_count) = sum(W.*(clean_slope(1:num_crit-1) - ...
processed_slope(1:num_crit-1)).^2);
% this normalization is not part of Klatt's paper, but helps
% to normalize the measure. Here we scale the measure by the
% sum of the weights.
distortion(frame_count) = distortion(frame_count)/sum(W);
start = start + skiprate;
end
%-----------------------------------------------
function distortion = llr(clean_speech, processed_speech,sample_rate)
% ----------------------------------------------------------------------
% Check the length of the clean and processed speech. Must be the same.
% ----------------------------------------------------------------------
clean_length = length(clean_speech);
processed_length = length(processed_speech);
if (clean_length ~= processed_length)
disp('Error: Both Speech Files must be same length.');
return
end
% ----------------------------------------------------------------------
% Global Variables
% ----------------------------------------------------------------------
winlength = round(30*sample_rate/1000); % window length in samples
skiprate = floor(winlength/4); % window skip in samples
if sample_rate<10000
P = 10; % LPC Analysis Order
else
P=16; % this could vary depending on sampling frequency.
end
% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Log Likelihood Ratio
% ----------------------------------------------------------------------
num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames
start = 1; % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));
for frame_count = 1:num_frames
% ----------------------------------------------------------
% (1) Get the Frames for the test and reference speech.
% Multiply by Hanning Window.
% ----------------------------------------------------------
clean_frame = clean_speech(start:start+winlength-1);
processed_frame = processed_speech(start:start+winlength-1);
clean_frame = clean_frame.*window;
processed_frame = processed_frame.*window;
% ----------------------------------------------------------
% (2) Get the autocorrelation lags and LPC parameters used
% to compute the LLR measure.
% ----------------------------------------------------------
[R_clean, Ref_clean, A_clean] = ...
lpcoeff(clean_frame, P);
[R_processed, Ref_processed, A_processed] = ...
lpcoeff(processed_frame, P);
% ----------------------------------------------------------
% (3) Compute the LLR measure
% ----------------------------------------------------------
numerator = A_processed*toeplitz(R_clean)*A_processed';
denominator = A_clean*toeplitz(R_clean)*A_clean';
distortion(frame_count) = log(numerator/denominator);
start = start + skiprate;
end
%---------------------------------------------
function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order)
% ----------------------------------------------------------
% (1) Compute Autocorrelation Lags
% ----------------------------------------------------------
winlength = max(size(speech_frame));
for k=1:model_order+1
R(k) = sum(speech_frame(1:winlength-k+1) ...
.*speech_frame(k:winlength));
end
% ----------------------------------------------------------
% (2) Levinson-Durbin
% ----------------------------------------------------------
a = ones(1,model_order);
E(1)=R(1);
for i=1:model_order
a_past(1:i-1) = a(1:i-1);
sum_term = sum(a_past(1:i-1).*R(i:-1:2));
rcoeff(i)=(R(i+1) - sum_term) / E(i);
a(i)=rcoeff(i);
a(1:i-1) = a_past(1:i-1) - rcoeff(i).*a_past(i-1:-1:1);
E(i+1)=(1-rcoeff(i)*rcoeff(i))*E(i);
end
acorr = R;
refcoeff = rcoeff;
lpparams = [1 -a];
% ----------------------------------------------------------------------
function [overall_snr, segmental_snr] = snr(clean_speech, processed_speech,sample_rate)
% ----------------------------------------------------------------------
% Check the length of the clean and processed speech. Must be the same.
% ----------------------------------------------------------------------
clean_length = length(clean_speech);
processed_length = length(processed_speech);
if (clean_length ~= processed_length)
disp('Error: Both Speech Files must be same length.');
return
end
% ----------------------------------------------------------------------
% Scale both clean speech and processed speech to have same dynamic
% range. Also remove DC component from each signal
% ----------------------------------------------------------------------
%clean_speech = clean_speech - mean(clean_speech);
%processed_speech = processed_speech - mean(processed_speech);
%processed_speech = processed_speech.*(max(abs(clean_speech))/ max(abs(processed_speech)));
overall_snr = 10* log10( sum(clean_speech.^2)/sum((clean_speech-processed_speech).^2));
% ----------------------------------------------------------------------
% Global Variables
% ----------------------------------------------------------------------
winlength = round(30*sample_rate/1000); %240; % window length in samples
skiprate = floor(winlength/4); % window skip in samples
MIN_SNR = -10; % minimum SNR in dB
MAX_SNR = 35; % maximum SNR in dB
% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Segmental SNR
% ----------------------------------------------------------------------
num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames
start = 1; % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));
for frame_count = 1: num_frames
% ----------------------------------------------------------
% (1) Get the Frames for the test and reference speech.
% Multiply by Hanning Window.
% ----------------------------------------------------------
clean_frame = clean_speech(start:start+winlength-1);
processed_frame = processed_speech(start:start+winlength-1);
clean_frame = clean_frame.*window;
processed_frame = processed_frame.*window;
% ----------------------------------------------------------
% (2) Compute the Segmental SNR
% ----------------------------------------------------------
signal_energy = sum(clean_frame.^2);
noise_energy = sum((clean_frame-processed_frame).^2);
segmental_snr(frame_count) = 10*log10(signal_energy/(noise_energy+eps)+eps);
segmental_snr(frame_count) = max(segmental_snr(frame_count),MIN_SNR);
segmental_snr(frame_count) = min(segmental_snr(frame_count),MAX_SNR);
start = start + skiprate;
end

View File

@ -0,0 +1,84 @@
function crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, ...
deg_Nsamples, Utt_id)
global Downsample
global Nutterances Largest_uttsize Nsurf_samples Crude_DelayEst
global Crude_DelayConf UttSearch_Start UttSearch_End Utt_DelayEst
global Utt_Delay Utt_DelayConf Utt_Start Utt_End
global MAXNUTTERANCES WHOLE_SIGNAL
global pesq_mos subj_mos cond_nr
if (Utt_id== WHOLE_SIGNAL )
nr = floor( ref_Nsamples/ Downsample);
nd = floor( deg_Nsamples/ Downsample);
startr= 1;
startd= 1;
elseif Utt_id== MAXNUTTERANCES
startr= UttSearch_Start(MAXNUTTERANCES);
startd= startr+ Utt_DelayEst(MAXNUTTERANCES)/ Downsample;
if ( startd< 0 )
startr= 1- Utt_DelayEst(MAXNUTTERANCES)/ Downsample;
startd= 1;
end
nr= UttSearch_End(MAXNUTTERANCES)- startr;
nd= nr;
if( startd+ nd> floor( deg_Nsamples/ Downsample) )
nd= floor( deg_Nsamples/ Downsample)- startd;
end
% fprintf( 'nr,nd is %d,%d\n', nr, nd);
else
startr= UttSearch_Start(Utt_id);
startd= startr+ Crude_DelayEst/ Downsample;
if ( startd< 0 )
startr= 1- Crude_DelayEst/ Downsample;
startd= 1;
end
nr= UttSearch_End(Utt_id)- startr;
nd = nr;
if( startd+ nd> floor( deg_Nsamples/ Downsample)+ 1)
nd = floor( deg_Nsamples/ Downsample)- startd+ 1;
end
end
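% Cross-correlate the selected windows of the reference and degraded log-VAD
% envelopes; the offset of the correlation peak relative to nr gives the
% delay estimate in units of Downsample samples.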
max_Y= 0.0;
I_max_Y= nr;
if( (nr> 1) && (nd> 1) )
Y= FFTNXCorr( ref_logVAD, startr, nr, deg_logVAD, startd, nd);
[max_Y, I_max_Y]= max( Y);
if (max_Y<= 0)
max_Y= 0;
I_max_Y= nr;
end
end
% fprintf( 'max_Y, I_max_Y is %f, %d\n', max_Y, I_max_Y);
if( Utt_id== WHOLE_SIGNAL )
Crude_DelayEst= (I_max_Y- nr)* Downsample;
Crude_DelayConf= 0.0;
% fprintf( 1, 'I_max_Y, nr, Crude_DelayEst is %f, %f, %f\n', ...
% I_max_Y, nr, Crude_DelayEst);
elseif( Utt_id == MAXNUTTERANCES )
Utt_Delay(MAXNUTTERANCES)= (I_max_Y- nr)* Downsample+ ...
Utt_DelayEst(MAXNUTTERANCES);
% fprintf( 'startr, startd, nr, nd, I_max, Utt_Delay[%d] is %d, %d, %d, %d, %d, %d\n', ...
% MAXNUTTERANCES, startr, startd, nr, nd, ...
% I_max_Y, Utt_Delay(MAXNUTTERANCES) );
else
% fprintf( 'I_max_Y, nr is %d, %d\n', I_max_Y, nr);
Utt_DelayEst(Utt_id)= (I_max_Y- nr)* Downsample+ ...
Crude_DelayEst;
end

View File

@ -0,0 +1,21 @@
function mod_data= fix_power_level( data, data_Nsamples, maxNsamples)
% This function performs level normalization, i.e., it fixes the average
% power of data to a preset target and returns the result in mod_data.
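% The align_filter_dB table below passes roughly 350-3250 Hz; the average
% power of the filtered signal is measured over the active region (excluding
% the SEARCHBUFFER guard samples), and the whole signal is scaled so that
% this power equals TARGET_AVG_POWER.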
global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs
global TARGET_AVG_POWER
TARGET_AVG_POWER= 1e7;
align_filter_dB= [0,-500; 50, -500; 100, -500; 125, -500; 160, -500; 200, -500;
250, -500; 300, -500; 350, 0; 400, 0; 500, 0; 600, 0; 630, 0;
800, 0; 1000, 0; 1250, 0; 1600, 0; 2000, 0; 2500, 0; 3000, 0;
3250, 0; 3500, -500; 4000, -500; 5000, -500; 6300, -500; 8000, -500];
align_filtered= apply_filter( data, data_Nsamples, align_filter_dB);
power_above_300Hz = pow_of (align_filtered, SEARCHBUFFER* Downsample+ 1, ...
data_Nsamples- SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000), ...
maxNsamples- 2* SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000));
global_scale= sqrt( TARGET_AVG_POWER/ power_above_300Hz);
% fprintf( 1, '\tglobal_scale is %f\n', global_scale);
mod_data= data* global_scale;

View File

@ -0,0 +1,68 @@
function id_searchwindows( ref_VAD, ref_Nsamples, deg_VAD, deg_Nsamples);
global MINUTTLENGTH Downsample SEARCHBUFFER
global Crude_DelayEst Nutterances UttSearch_Start UttSearch_End
Utt_num = 1;
speech_flag = 0;
VAD_length= floor( ref_Nsamples/ Downsample);
del_deg_start= MINUTTLENGTH- Crude_DelayEst/ Downsample;
del_deg_end= floor((deg_Nsamples- Crude_DelayEst)/ Downsample)-...
MINUTTLENGTH;
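% Scan the reference VAD track: a 0->1 transition opens a candidate
% utterance and a 1->0 transition (or the end of the track) closes it.
% Search windows are padded by SEARCHBUFFER frames on each side, and a
% candidate is kept only if it spans at least MINUTTLENGTH frames and lies
% within the usable range of the degraded signal.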
for count= 1: VAD_length
VAD_value= ref_VAD(count);
if( (VAD_value> 0) && (speech_flag== 0) )
speech_flag= 1;
this_start= count;
UttSearch_Start(Utt_num)= count- SEARCHBUFFER;
if( UttSearch_Start(Utt_num)< 0 )
UttSearch_Start(Utt_num)= 0;
end
end
if( ((VAD_value== 0) || (count == (VAD_length-1))) && ...
(speech_flag == 1) )
speech_flag = 0;
UttSearch_End(Utt_num) = count + SEARCHBUFFER;
if( UttSearch_End(Utt_num) > VAD_length - 1 )
UttSearch_End(Utt_num) = VAD_length -1;
end
if( ((count - this_start) >= MINUTTLENGTH) &&...
(this_start < del_deg_end) &&...
(count > del_deg_start) )
Utt_num= Utt_num + 1;
end
end
end
Utt_num= Utt_num- 1;
Nutterances = Utt_num;
% fprintf( 1, 'Nutterances is %d\n', Nutterances);
% fid= fopen( 'mat_utt.txt', 'wt');
% fprintf( fid, '%d\n', UttSearch_Start( 1: Nutterances));
% fprintf( fid, '\n');
% fprintf( fid, '%d\n', UttSearch_End( 1: Nutterances));
% fclose(fid);

View File

@ -0,0 +1,85 @@
function id_utterances( ref_Nsamples, ref_VAD, deg_Nsamples)
global Largest_uttsize MINUTTLENGTH Crude_DelayEst
global Downsample SEARCHBUFFER Nutterances Utt_Start
global Utt_End Utt_Delay
Utt_num = 1;
speech_flag = 0;
VAD_length = floor( ref_Nsamples / Downsample);
% fprintf( 1, 'VAD_length is %d\n', VAD_length);
del_deg_start = MINUTTLENGTH - Crude_DelayEst / Downsample;
del_deg_end = floor((deg_Nsamples- Crude_DelayEst)/ Downsample) ...
- MINUTTLENGTH;
for count = 1: VAD_length
VAD_value = ref_VAD(count);
if( (VAD_value > 0.0) && (speech_flag == 0) )
speech_flag = 1;
this_start = count;
Utt_Start (Utt_num) = count;
end
if( ((VAD_value == 0) || (count == VAD_length)) && ...
(speech_flag == 1) )
speech_flag = 0;
Utt_End (Utt_num) = count;
if( ((count - this_start) >= MINUTTLENGTH) && ...
(this_start < del_deg_end) && ...
(count > del_deg_start) )
Utt_num = Utt_num + 1;
end
end
end
Utt_Start(1) = SEARCHBUFFER+ 1;
Utt_End(Nutterances) = VAD_length - SEARCHBUFFER+ 1;
for Utt_num = 2: Nutterances
this_start = Utt_Start(Utt_num)- 1;
last_end = Utt_End(Utt_num - 1)- 1;
count = floor( (this_start + last_end) / 2);
Utt_Start(Utt_num) = count+ 1;
Utt_End(Utt_num - 1) = count+ 1;
end
this_start = (Utt_Start(1)- 1) * Downsample + Utt_Delay(1);
if( this_start < (SEARCHBUFFER * Downsample) )
count = SEARCHBUFFER + floor( ...
(Downsample - 1 - Utt_Delay(1)) / Downsample);
Utt_Start(1) = count+ 1;
end
last_end = (Utt_End(Nutterances)- 1) * Downsample + 1 + ...
Utt_Delay(Nutterances);
% fprintf( 'Utt_End(%d) is %d\n', Nutterances, Utt_End(Nutterances));
% fprintf( 'last_end is %d\n', last_end);
% fprintf( 'Utt_Delay(%d) is %d\n', Nutterances, Utt_Delay(Nutterances));
if( last_end > (deg_Nsamples - SEARCHBUFFER * Downsample+ 1) )
count = floor( (deg_Nsamples - Utt_Delay(Nutterances)) / Downsample) ...
- SEARCHBUFFER;
Utt_End(Nutterances) = count+ 1;
end
for Utt_num = 2: Nutterances
this_start = (Utt_Start(Utt_num)- 1) * Downsample + Utt_Delay(Utt_num);
last_end = (Utt_End(Utt_num - 1)- 1) * Downsample + Utt_Delay(Utt_num - 1);
if( this_start < last_end )
count = floor( (this_start + last_end) / 2);
this_start = floor( (Downsample- 1+ count- Utt_Delay(Utt_num))...
/ Downsample);
last_end = floor( (count - Utt_Delay(Utt_num - 1))...
/ Downsample);
Utt_Start(Utt_num) = this_start+ 1;
Utt_End(Utt_num- 1) = last_end+ 1;
end
end
Largest_uttsize= max( Utt_End- Utt_Start);

View File

@ -0,0 +1,9 @@
function [mod_ref_data, mod_deg_data]= input_filter( ref_data, ref_Nsamples, ...
deg_data, deg_Nsamples)
mod_ref_data= DC_block( ref_data, ref_Nsamples);
mod_deg_data= DC_block( deg_data, deg_Nsamples);
mod_ref_data= apply_filters( mod_ref_data, ref_Nsamples);
mod_deg_data= apply_filters( mod_deg_data, deg_Nsamples);

View File

@ -0,0 +1,127 @@
function [pesq_mos]= pesq(ref_wav, deg_wav)
% ----------------------------------------------------------------------
% PESQ objective speech quality measure
%
% This function implements the PESQ measure based on the ITU standard
% P.862 [1].
%
%
% Usage: pval=pesq(cleanFile.wav, enhancedFile.wav)
%
% cleanFile.wav - clean input file in .wav format
% enhancedFile - enhanced output file in .wav format
% pval - PESQ value
%
% Note that the PESQ routine only supports sampling rates of 8 kHz and
% 16 kHz [1]
%
% Example call: pval = pesq ('sp04.wav','enhanced.wav')
%
%
% References:
% [1]  ITU (2000). Perceptual evaluation of speech quality (PESQ): an
%      objective method for end-to-end speech quality assessment of
%      narrowband telephone networks and speech codecs. ITU-T
%      Recommendation P.862
%
% Authors: Yi Hu and Philipos C. Loizou
%
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
% ----------------------------------------------------------------------
if nargin<2
fprintf('Usage: [pesq_mos]=pesq(cleanfile.wav,enhanced.wav) \n');
return;
end;
global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs WHOLE_SIGNAL
global Align_Nfft Window
[ref_data,sampling_rate]= audioread( ref_wav);
if sampling_rate~=8000 & sampling_rate~=16000
error('Sampling frequency needs to be either 8000 or 16000 Hz');
end
setup_global( sampling_rate);
% Window= hann( Align_Nfft, 'periodic'); %Hanning window
% Window= Window';
TWOPI= 6.28318530717959;
%for count = 0: Align_Nfft- 1
% Window(1+ count) = 0.5 * (1.0 - cos((TWOPI * count) / Align_Nfft));
%end
count=0:Align_Nfft- 1;
Window= 0.5 * (1.0 - cos((TWOPI * count) / Align_Nfft));
ref_data= ref_data';
ref_data= ref_data* 32768;
ref_Nsamples= length( ref_data)+ 2* SEARCHBUFFER* Downsample;
ref_data= [zeros( 1, SEARCHBUFFER* Downsample), ref_data, ...
zeros( 1, DATAPADDING_MSECS* (Fs/ 1000)+ SEARCHBUFFER* Downsample)];
deg_data= audioread( deg_wav);
deg_data= deg_data';
deg_data= deg_data* 32768;
deg_Nsamples= length( deg_data)+ 2* SEARCHBUFFER* Downsample;
deg_data= [zeros( 1, SEARCHBUFFER* Downsample), deg_data, ...
zeros( 1, DATAPADDING_MSECS* (Fs/ 1000)+ SEARCHBUFFER* Downsample)];
maxNsamples= max( ref_Nsamples, deg_Nsamples);
ref_data= fix_power_level( ref_data, ref_Nsamples, maxNsamples);
deg_data= fix_power_level( deg_data, deg_Nsamples, maxNsamples);
standard_IRS_filter_dB= [0, -200; 50, -40; 100, -20; 125, -12; 160, -6; 200, 0;...
250, 4; 300, 6; 350, 8; 400, 10; 500, 11; 600, 12; 700, 12; 800, 12;...
1000, 12; 1300, 12; 1600, 12; 2000, 12; 2500, 12; 3000, 12; 3250, 12;...
3500, 4; 4000, -200; 5000, -200; 6300, -200; 8000, -200];
ref_data= apply_filter( ref_data, ref_Nsamples, standard_IRS_filter_dB);
deg_data= apply_filter( deg_data, deg_Nsamples, standard_IRS_filter_dB);
%
% for later use in psychoacoustical model
model_ref= ref_data;
model_deg= deg_data;
[ref_data, deg_data]= input_filter( ref_data, ref_Nsamples, deg_data, ...
deg_Nsamples);
[ref_VAD, ref_logVAD]= apply_VAD( ref_data, ref_Nsamples);
[deg_VAD, deg_logVAD]= apply_VAD( deg_data, deg_Nsamples);
crude_align (ref_logVAD, ref_Nsamples, deg_logVAD, deg_Nsamples,...
WHOLE_SIGNAL);
utterance_locate (ref_data, ref_Nsamples, ref_VAD, ref_logVAD,...
deg_data, deg_Nsamples, deg_VAD, deg_logVAD);
ref_data= model_ref;
deg_data= model_deg;
% make ref_data and deg_data equal length
if (ref_Nsamples< deg_Nsamples)
newlen= deg_Nsamples+ DATAPADDING_MSECS* (Fs/ 1000);
ref_data( newlen)= 0;
elseif (ref_Nsamples> deg_Nsamples)
newlen= ref_Nsamples+ DATAPADDING_MSECS* (Fs/ 1000);
deg_data( newlen)= 0;
end
pesq_mos= pesq_psychoacoustic_model (ref_data, ref_Nsamples, deg_data, ...
deg_Nsamples );

View File

@ -0,0 +1,920 @@
function pesq_mos= pesq_psychoacoustic_model (ref_data, ref_Nsamples, deg_data, ...
deg_Nsamples )
global CALIBRATE Nfmax Nb Sl Sp
global nr_of_hz_bands_per_bark_band centre_of_band_bark
global width_of_band_hz centre_of_band_hz width_of_band_bark
global pow_dens_correction_factor abs_thresh_power
global Downsample SEARCHBUFFER DATAPADDING_MSECS Fs Nutterances
global Utt_Start Utt_End Utt_Delay NUMBER_OF_PSQM_FRAMES_PER_SYLLABE
global Fs Plot_Frame
% Plot_Frame= 75; % this is the frame whose spectrum will be plotted
FALSE= 0;
TRUE= 1;
NUMBER_OF_PSQM_FRAMES_PER_SYLLABE= 20;
maxNsamples = max (ref_Nsamples, deg_Nsamples);
Nf = Downsample * 8;
MAX_NUMBER_OF_BAD_INTERVALS = 1000;
start_frame_of_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
stop_frame_of_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
start_sample_of_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
stop_sample_of_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
number_of_samples_in_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
delay_in_samples_in_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
number_of_bad_intervals= 0;
there_is_a_bad_frame= FALSE;
Whanning= hann( Nf, 'periodic');
Whanning= Whanning';
D_POW_F = 2;
D_POW_S = 6;
D_POW_T = 2;
A_POW_F = 1;
A_POW_S = 6;
A_POW_T = 2;
D_WEIGHT= 0.1;
A_WEIGHT= 0.0309;
CRITERIUM_FOR_SILENCE_OF_5_SAMPLES = 500;
samples_to_skip_at_start = 0;
sum_of_5_samples= 0;
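% Skip leading silence: advance one sample at a time until the sum of the
% absolute values of 5 consecutive reference samples reaches the silence
% threshold, or until half of the signal has been scanned.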
while ((sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES) ...
&& (samples_to_skip_at_start < maxNsamples / 2))
sum_of_5_samples= sum( abs( ref_data( samples_to_skip_at_start...
+ SEARCHBUFFER * Downsample + 1: samples_to_skip_at_start...
+ SEARCHBUFFER * Downsample + 5)));
if (sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES)
samples_to_skip_at_start = samples_to_skip_at_start+ 1;
end
end
% fprintf( 'samples_to_skip_at_start is %d\n', samples_to_skip_at_start);
samples_to_skip_at_end = 0;
sum_of_5_samples= 0;
while ((sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES) ...
&& (samples_to_skip_at_end < maxNsamples / 2))
sum_of_5_samples= sum( abs( ref_data( maxNsamples - ...
SEARCHBUFFER* Downsample + DATAPADDING_MSECS* (Fs/ 1000) ...
- samples_to_skip_at_end - 4: maxNsamples - ...
SEARCHBUFFER* Downsample + DATAPADDING_MSECS* (Fs/ 1000) ...
- samples_to_skip_at_end)));
if (sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES)
samples_to_skip_at_end = samples_to_skip_at_end+ 1;
end
end
% fprintf( 'samples_to_skip_at_end is %d\n', samples_to_skip_at_end);
start_frame = floor( samples_to_skip_at_start/ (Nf/ 2));
stop_frame = floor( (maxNsamples- 2* SEARCHBUFFER* Downsample ...
+ DATAPADDING_MSECS* (Fs/ 1000)- samples_to_skip_at_end) ...
/ (Nf/ 2))- 1;
% number of frames in speech data plus DATAPADDING_MSECS
% fprintf( 'start/end frame is %d/%d\n', start_frame, stop_frame);
D_disturbance= zeros( stop_frame+ 1, Nb);
DA_disturbance= zeros( stop_frame+ 1, Nb);
power_ref = pow_of (ref_data, SEARCHBUFFER* Downsample, ...
maxNsamples- SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000),...
maxNsamples- 2* SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000));
power_deg = pow_of (deg_data, SEARCHBUFFER * Downsample, ...
maxNsamples- SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000),...
maxNsamples- 2* SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000));
% fprintf( 'ref/deg power is %f/%f\n', power_ref, power_deg);
hz_spectrum_ref = zeros( 1, Nf/ 2);
hz_spectrum_deg = zeros( 1, Nf/ 2);
frame_is_bad = zeros( 1, stop_frame + 1);
smeared_frame_is_bad = zeros( 1, stop_frame + 1);
silent = zeros( 1, stop_frame + 1);
pitch_pow_dens_ref = zeros( stop_frame + 1, Nb);
pitch_pow_dens_deg = zeros( stop_frame + 1, Nb);
frame_was_skipped = zeros( 1, stop_frame + 1);
frame_disturbance = zeros( 1, stop_frame + 1);
frame_disturbance_asym_add = zeros( 1, stop_frame + 1);
avg_pitch_pow_dens_ref = zeros( 1, Nb);
avg_pitch_pow_dens_deg = zeros( 1, Nb);
loudness_dens_ref = zeros( 1, Nb);
loudness_dens_deg = zeros( 1, Nb);
deadzone = zeros( 1, Nb);
disturbance_dens = zeros( 1, Nb);
disturbance_dens_asym_add = zeros( 1, Nb);
time_weight = zeros( 1, stop_frame + 1);
total_power_ref = zeros( 1, stop_frame + 1);
% fid= fopen( 'tmp_mat.txt', 'wt');
for frame = 0: stop_frame
start_sample_ref = 1+ SEARCHBUFFER * Downsample + frame* (Nf/ 2);
hz_spectrum_ref= short_term_fft (Nf, ref_data, Whanning, ...
start_sample_ref);
utt = Nutterances;
while ((utt >= 1) && ((Utt_Start(utt)- 1)* Downsample+ 1 ...
> start_sample_ref))
utt= utt - 1;
end
if (utt >= 1)
delay = Utt_Delay(utt);
else
delay = Utt_Delay(1);
end
start_sample_deg = start_sample_ref + delay;
if ((start_sample_deg > 0) && (start_sample_deg + Nf- 1 < ...
maxNsamples+ DATAPADDING_MSECS* (Fs/ 1000)))
hz_spectrum_deg= short_term_fft (Nf, deg_data, Whanning, ...
start_sample_deg);
else
hz_spectrum_deg( 1: Nf/ 2)= 0;
end
pitch_pow_dens_ref( frame+ 1, :)= freq_warping (...
hz_spectrum_ref, Nb, frame);
%peak = maximum_of (pitch_pow_dens_ref, 0, Nb);
pitch_pow_dens_deg( frame+ 1, :)= freq_warping (...
hz_spectrum_deg, Nb, frame);
total_audible_pow_ref = total_audible (frame, pitch_pow_dens_ref, 1E2);
total_audible_pow_deg = total_audible (frame, pitch_pow_dens_deg, 1E2);
silent(frame+ 1) = (total_audible_pow_ref < 1E7);
end
% fclose( fid);
avg_pitch_pow_dens_ref= time_avg_audible_of (stop_frame + 1, ...
silent, pitch_pow_dens_ref, floor((maxNsamples- 2* SEARCHBUFFER* ...
Downsample+ DATAPADDING_MSECS* (Fs/ 1000))/ (Nf / 2))- 1);
avg_pitch_pow_dens_deg= time_avg_audible_of (stop_frame + 1, ...
silent, pitch_pow_dens_deg, floor((maxNsamples- 2* SEARCHBUFFER* ...
Downsample+ DATAPADDING_MSECS* (Fs/ 1000))/ (Nf/ 2))- 1);
% fid= fopen( 'tmp_mat.txt', 'wt');
% fprintf( fid, '%f\n', avg_pitch_pow_dens_deg);
% fclose( fid);
if (CALIBRATE== 0)
pitch_pow_dens_ref= freq_resp_compensation (stop_frame + 1, ...
pitch_pow_dens_ref, avg_pitch_pow_dens_ref, ...
avg_pitch_pow_dens_deg, 1000);
if (Plot_Frame>= 0) % plot pitch_pow_dens_ref
figure;
subplot( 1, 2, 1);
plot( centre_of_band_hz, 10* log10( eps+ ...
pitch_pow_dens_ref( Plot_Frame+ 1, :)));
axis( [0 Fs/2 0 95]); %xlabel( 'Hz'); ylabel( 'Db');
title( 'reference signal bark spectrum with frequency compensation');
subplot( 1, 2, 2);
plot( centre_of_band_hz, 10* log10( eps+ ...
pitch_pow_dens_deg( Plot_Frame+ 1, :)));
axis( [0 Fs/2 0 95]); %xlabel( 'Hz'); ylabel( 'Db');
title( 'degraded signal bark spectrum');
end
end
% tmp1= pitch_pow_dens_ref';
MAX_SCALE = 5.0;
MIN_SCALE = 3e-4;
oldScale = 1;
THRESHOLD_BAD_FRAMES = 30;
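% Short-term gain compensation: each degraded frame's Bark spectrum is
% scaled toward the corresponding reference frame power; the scale factor
% is smoothed over time (0.2*old + 0.8*new) and bounded to
% [MIN_SCALE, MAX_SCALE].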
for frame = 0: stop_frame
total_audible_pow_ref = total_audible (frame, pitch_pow_dens_ref, 1);
total_audible_pow_deg = total_audible (frame, pitch_pow_dens_deg, 1);
total_power_ref (1+ frame) = total_audible_pow_ref;
scale = (total_audible_pow_ref + 5e3)/ (total_audible_pow_deg + 5e3);
if (frame > 0)
scale = 0.2 * oldScale + 0.8 * scale;
end
oldScale = scale;
if (scale > MAX_SCALE)
scale = MAX_SCALE;
elseif (scale < MIN_SCALE)
scale = MIN_SCALE;
end
pitch_pow_dens_deg( 1+ frame, :) = ...
pitch_pow_dens_deg( 1+ frame, :) * scale;
if (frame== Plot_Frame)
figure;
subplot( 1, 2, 1);
plot( centre_of_band_hz, 10* log10( eps+ ...
pitch_pow_dens_ref( Plot_Frame+ 1, :)));
axis( [0 Fs/2 0 95]); %xlabel( 'Hz'); ylabel( 'Db');
subplot( 1, 2, 2);
plot( centre_of_band_hz, 10* log10( eps+ ...
pitch_pow_dens_deg( Plot_Frame+ 1, :)));
axis( [0 Fs/2 0 95]); %xlabel( 'Hz'); ylabel( 'Db');
end
loudness_dens_ref = intensity_warping_of (frame, pitch_pow_dens_ref);
loudness_dens_deg = intensity_warping_of (frame, pitch_pow_dens_deg);
disturbance_dens = loudness_dens_deg - loudness_dens_ref;
if (frame== Plot_Frame)
figure;
subplot( 1, 2, 1);
plot( centre_of_band_hz, 10* log10( eps+ ...
loudness_dens_ref));
axis( [0 Fs/2 0 15]); %xlabel( 'Hz'); ylabel( 'Db');
title( 'reference signal loudness density');
subplot( 1, 2, 2);
plot( centre_of_band_hz, 10* log10( eps+ ...
loudness_dens_deg));
axis( [0 Fs/2 0 15]); %xlabel( 'Hz'); ylabel( 'Db');
title( 'degraded signal loudness density');
end
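% Apply a per-band dead zone equal to 25% of the smaller of the two loudness
% densities: disturbances inside the dead zone are set to zero, larger ones
% are shrunk toward zero by the dead-zone amount.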
for band =1: Nb
deadzone (band) = 0.25* min (loudness_dens_deg (band), ...
loudness_dens_ref (band));
end
for band = 1: Nb
d = disturbance_dens (band);
m = deadzone (band);
if (d > m)
disturbance_dens (band) = disturbance_dens (band)- m;
% disturbance_dens (band) = d- m;
else
if (d < -m)
disturbance_dens (band) = disturbance_dens (band)+ m;
% disturbance_dens (band) = d+ m;
else
disturbance_dens (band) = 0;
end
end
end
if (frame== Plot_Frame)
figure;
subplot( 1, 2, 1);
plot( centre_of_band_hz, disturbance_dens);
axis( [0 Fs/2 -1 50]); %xlabel( 'Hz'); ylabel( 'Db');
title( 'disturbance');
end
D_disturbance( frame+ 1, :)= disturbance_dens;
frame_disturbance (1+ frame) = pseudo_Lp (disturbance_dens, D_POW_F);
if (frame_disturbance (1+ frame) > THRESHOLD_BAD_FRAMES)
there_is_a_bad_frame = TRUE;
end
disturbance_dens= multiply_with_asymmetry_factor (...
disturbance_dens, frame, pitch_pow_dens_ref, pitch_pow_dens_deg);
if (frame== Plot_Frame)
subplot( 1, 2, 2);
plot( centre_of_band_hz, disturbance_dens);
axis( [0 Fs/2 -1 50]); %xlabel( 'Hz'); ylabel( 'Db');
title( 'disturbance after asymmetry processing');
end
DA_disturbance( frame+ 1, :)= disturbance_dens;
frame_disturbance_asym_add (1+ frame) = ...
pseudo_Lp (disturbance_dens, A_POW_F);
end
% fid= fopen( 'tmp_mat.txt', 'wt');
% fprintf( fid, '%f\n', frame_disturbance);
% fclose( fid);
frame_was_skipped (1: 1+ stop_frame) = FALSE;
for utt = 2: Nutterances
frame1 = floor (((Utt_Start(utt)- 1- SEARCHBUFFER )* Downsample+ 1+ ...
Utt_Delay(utt))/ (Nf/ 2));
j = floor( floor(((Utt_End(utt-1)- 1- SEARCHBUFFER)* Downsample+ 1+ ...
Utt_Delay(utt-1)))/(Nf/ 2));
delay_jump = Utt_Delay(utt) - Utt_Delay(utt-1);
if (frame1 > j)
frame1 = j;
elseif (frame1 < 0)
frame1 = 0;
end
% fprintf( 'frame1, j, delay_jump is %d, %d, %d\n', frame1, ...
% j, delay_jump);
if (delay_jump < -(Nf/ 2))
frame2 = floor (((Utt_Start(utt)- 1- SEARCHBUFFER)* Downsample+ 1 ...
+ max (0, abs (delay_jump)))/ (Nf/ 2)) + 1;
for frame = frame1: frame2
if (frame < stop_frame)
frame_was_skipped (1+ frame) = TRUE;
frame_disturbance (1+ frame) = 0;
frame_disturbance_asym_add (1+ frame) = 0;
end
end
end
end
nn = DATAPADDING_MSECS* (Fs/ 1000) + maxNsamples;
tweaked_deg = zeros( 1, nn);
% fprintf( 'nn is %d\n', nn);
for i= SEARCHBUFFER* Downsample+ 1: nn- SEARCHBUFFER* Downsample
utt = Nutterances;
while ((utt >= 1) && ((Utt_Start (utt)- 1)* Downsample> i))
utt = utt- 1;
end
if (utt >= 1)
delay = Utt_Delay (utt);
else
delay = Utt_Delay (1);
end
j = i + delay;
if (j < SEARCHBUFFER * Downsample+ 1)
j = SEARCHBUFFER * Downsample+ 1;
end
if (j > nn - SEARCHBUFFER * Downsample)
j = nn - SEARCHBUFFER * Downsample;
end
tweaked_deg (i) = deg_data (j);
end
if (there_is_a_bad_frame)
for frame = 0: stop_frame
frame_is_bad (1+ frame) = (frame_disturbance (1+ frame)...
> THRESHOLD_BAD_FRAMES);
smeared_frame_is_bad (1+ frame) = FALSE;
end
frame_is_bad (1) = FALSE;
SMEAR_RANGE = 2;
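% Smear the bad-frame flags: a frame stays flagged only if a bad frame
% occurs within SMEAR_RANGE frames both to its left and to its right
% (including itself), which removes isolated single bad frames.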
for frame = SMEAR_RANGE: stop_frame- 1- SMEAR_RANGE
max_itself_and_left = frame_is_bad (1+ frame);
max_itself_and_right = frame_is_bad (1+ frame);
for i = -SMEAR_RANGE: 0
if (max_itself_and_left < frame_is_bad (1+ frame+ i))
max_itself_and_left = frame_is_bad (1+ frame+ i);
end
end
for i = 0: SMEAR_RANGE
if (max_itself_and_right < frame_is_bad (1+ frame + i))
max_itself_and_right = frame_is_bad (1+ frame + i);
end
end
mini = max_itself_and_left;
if (mini > max_itself_and_right)
mini = max_itself_and_right;
end
smeared_frame_is_bad (1+ frame) = mini;
end
MINIMUM_NUMBER_OF_BAD_FRAMES_IN_BAD_INTERVAL = 5;
number_of_bad_intervals = 0;
frame = 0;
while (frame <= stop_frame)
while ((frame <= stop_frame) && (~smeared_frame_is_bad (1+ frame)))
frame= frame+ 1;
end
if (frame <= stop_frame)
start_frame_of_bad_interval(1+ number_of_bad_intervals)= ...
1+ frame;
while ((frame <= stop_frame) && (...
smeared_frame_is_bad (1+ frame)))
frame= frame+ 1;
end
if (frame <= stop_frame)
stop_frame_of_bad_interval(1+ number_of_bad_intervals)= ...
1+ frame;
if (stop_frame_of_bad_interval(1+ number_of_bad_intervals)- ...
start_frame_of_bad_interval(1+ number_of_bad_intervals)...
>= MINIMUM_NUMBER_OF_BAD_FRAMES_IN_BAD_INTERVAL)
number_of_bad_intervals= number_of_bad_intervals+ 1;
end
end
end
end
for bad_interval = 0: number_of_bad_intervals - 1
start_sample_of_bad_interval(1+ bad_interval) = ...
(start_frame_of_bad_interval(1+ bad_interval)- 1) * (Nf/ 2) ...
+ SEARCHBUFFER * Downsample+ 1;
stop_sample_of_bad_interval(1+ bad_interval) = ...
(stop_frame_of_bad_interval(1+ bad_interval)- 1) * (Nf/ 2) ...
+ Nf + SEARCHBUFFER* Downsample;
if (stop_frame_of_bad_interval(1+ bad_interval) > stop_frame+ 1)
stop_frame_of_bad_interval(1+ bad_interval) = stop_frame+ 1;
end
number_of_samples_in_bad_interval(1+ bad_interval) = ...
stop_sample_of_bad_interval(1+ bad_interval) - ...
start_sample_of_bad_interval(1+ bad_interval)+ 1;
end
% fprintf( 'number of bad intervals %d\n', number_of_bad_intervals);
% fprintf( '%d %d\n', number_of_samples_in_bad_interval(1), ...
% number_of_samples_in_bad_interval(2));
% fprintf( '%d %d\n', start_sample_of_bad_interval(1), ...
% start_sample_of_bad_interval(2));
SEARCH_RANGE_IN_TRANSFORM_LENGTH = 4;
search_range_in_samples= SEARCH_RANGE_IN_TRANSFORM_LENGTH * Nf;
for bad_interval= 0: number_of_bad_intervals- 1
ref = zeros (1, 2 * search_range_in_samples + ...
number_of_samples_in_bad_interval (1+ bad_interval));
deg = zeros (1, 2 * search_range_in_samples + ...
number_of_samples_in_bad_interval (1+ bad_interval));
ref(1: search_range_in_samples) = 0;
ref (search_range_in_samples+ 1: search_range_in_samples+ ...
number_of_samples_in_bad_interval (1+ bad_interval)) = ...
ref_data (start_sample_of_bad_interval( 1+ bad_interval) + 1: ...
start_sample_of_bad_interval( 1+ bad_interval) + ...
number_of_samples_in_bad_interval (1+ bad_interval));
ref (search_range_in_samples + ...
number_of_samples_in_bad_interval (1+ bad_interval) + 1: ...
search_range_in_samples + ...
number_of_samples_in_bad_interval (1+ bad_interval) + ...
search_range_in_samples) = 0;
for i = 0: 2 * search_range_in_samples + ...
number_of_samples_in_bad_interval (1+ bad_interval) - 1
j = start_sample_of_bad_interval (1+ bad_interval) - ...
search_range_in_samples + i;
nn = maxNsamples - SEARCHBUFFER * Downsample + ...
DATAPADDING_MSECS * (Fs / 1000);
if (j <= SEARCHBUFFER * Downsample)
j = SEARCHBUFFER * Downsample+ 1;
end
if (j > nn)
j = nn;
end
deg (1+ i) = tweaked_deg (j);
end
[delay_in_samples, best_correlation]= compute_delay ...
(1, 2 * search_range_in_samples + ...
number_of_samples_in_bad_interval (1+ bad_interval), ...
search_range_in_samples, ref, deg);
delay_in_samples_in_bad_interval (1+ bad_interval) = ...
delay_in_samples;
% fprintf( 'delay_in_samples, best_correlation is \n\t%d, %f\n', ...
% delay_in_samples, best_correlation);
%
if (best_correlation < 0.5)
delay_in_samples_in_bad_interval (1+ bad_interval) = 0;
end
end
if (number_of_bad_intervals > 0)
doubly_tweaked_deg = tweaked_deg( 1: maxNsamples + ...
DATAPADDING_MSECS * (Fs / 1000));
for bad_interval= 0: number_of_bad_intervals- 1
delay = delay_in_samples_in_bad_interval (1+ bad_interval);
for i = start_sample_of_bad_interval (1+ bad_interval): ...
stop_sample_of_bad_interval (1+ bad_interval)
j = i + delay;
if (j < 1)
j = 1;
end
if (j > maxNsamples)
j = maxNsamples;
end
h = tweaked_deg (j);
doubly_tweaked_deg (i) = h;
end
end
untweaked_deg = deg_data;
deg_data = doubly_tweaked_deg;
for bad_interval= 0: number_of_bad_intervals- 1
for frame = start_frame_of_bad_interval (1+ bad_interval): ...
stop_frame_of_bad_interval (1+ bad_interval)- 1
frame= frame- 1;
start_sample_ref = SEARCHBUFFER * Downsample + ...
frame * Nf / 2+ 1;
start_sample_deg = start_sample_ref;
hz_spectrum_deg= short_term_fft (Nf, deg_data, ...
Whanning, start_sample_deg);
pitch_pow_dens_deg( 1+ frame, :)= freq_warping (...
hz_spectrum_deg, Nb, frame);
end
oldScale = 1;
for frame = start_frame_of_bad_interval (1+ bad_interval): ...
stop_frame_of_bad_interval (1+ bad_interval)- 1
frame= frame- 1;
% see the C reference implementation for why 1 needs to be
% subtracted from frame here
total_audible_pow_ref = total_audible (frame, ...
pitch_pow_dens_ref, 1);
total_audible_pow_deg = total_audible (frame, ...
pitch_pow_dens_deg, 1);
scale = (total_audible_pow_ref + 5e3) / ...
(total_audible_pow_deg + 5e3);
if (frame > 0)
scale = 0.2 * oldScale + 0.8*scale;
end
oldScale = scale;
if (scale > MAX_SCALE)
scale = MAX_SCALE;
end
if (scale < MIN_SCALE)
scale = MIN_SCALE;
end
pitch_pow_dens_deg (1+ frame, :) = ...
pitch_pow_dens_deg (1+ frame, :)* scale;
loudness_dens_ref= intensity_warping_of (frame, ...
pitch_pow_dens_ref);
loudness_dens_deg= intensity_warping_of (frame, ...
pitch_pow_dens_deg);
disturbance_dens = loudness_dens_deg - loudness_dens_ref;
for band = 1: Nb
deadzone(band) = min (loudness_dens_deg(band), ...
loudness_dens_ref(band));
deadzone(band) = deadzone(band)* 0.25;
end
for band = 1: Nb
d = disturbance_dens (band);
m = deadzone (band);
if (d > m)
disturbance_dens (band) = ...
disturbance_dens (band)- m;
else
if (d < -m)
disturbance_dens (band) = ...
disturbance_dens (band)+ m;
else
disturbance_dens (band) = 0;
end
end
end
frame_disturbance( 1+ frame) = min (...
frame_disturbance( 1+ frame), pseudo_Lp(...
disturbance_dens, D_POW_F));
disturbance_dens= multiply_with_asymmetry_factor ...
(disturbance_dens, frame, pitch_pow_dens_ref, ...
pitch_pow_dens_deg);
frame_disturbance_asym_add(1+ frame) = min (...
frame_disturbance_asym_add(1+ frame), ...
pseudo_Lp (disturbance_dens, A_POW_F));
end
end
deg_data = untweaked_deg;
end
end
for frame = 0: stop_frame
h = 1;
if (stop_frame + 1 > 1000)
n = floor( (maxNsamples - 2 * SEARCHBUFFER * Downsample)...
/ (Nf / 2)) - 1;
timeWeightFactor = (n - 1000) / 5500;
if (timeWeightFactor > 0.5)
timeWeightFactor = 0.5;
end
h = (1.0 - timeWeightFactor) + timeWeightFactor * frame / n;
end
time_weight (1 +frame) = h;
end
% fid= fopen( 'tmp_mat1.txt', 'at');
% fprintf( '\n');
for frame = 0: stop_frame
h = ((total_power_ref (1+ frame) + 1e5) / 1e7)^ 0.04;
% if (frame== 118)
% fprintf( '%f\n', h);
% fprintf( '%f\n', frame_disturbance( 1+ frame));
% end
frame_disturbance( 1+ frame) = frame_disturbance( 1+ frame)/ h;
% if (frame== 118)
% fprintf( '%f\n', frame_disturbance( 1+ frame));
% end
%
frame_disturbance_asym_add( 1+ frame) = ...
frame_disturbance_asym_add( 1+ frame)/ h;
if (frame_disturbance( 1+ frame) > 45)
frame_disturbance( 1+ frame) = 45;
end
if (frame_disturbance_asym_add( 1+ frame)> 45)
frame_disturbance_asym_add( 1+ frame) = 45;
end
end
% fclose ( fid);
d_indicator = Lpq_weight (start_frame, stop_frame, ...
D_POW_S, D_POW_T, frame_disturbance, time_weight);
a_indicator = Lpq_weight (start_frame, stop_frame, ...
A_POW_S, A_POW_T, frame_disturbance_asym_add, time_weight);
pesq_mos = 4.5 - D_WEIGHT * d_indicator - A_WEIGHT * a_indicator;
if (Plot_Frame> 0)
figure;
subplot( 1, 2, 1);
mesh( 0: stop_frame, centre_of_band_hz, D_disturbance');
title( 'disturbance');
subplot( 1, 2, 2);
mesh( 0: stop_frame, centre_of_band_hz, DA_disturbance');
title( 'disturbance after asymmetry processing');
end
% fid= fopen( 'tmp_mat.txt', 'wt');
% fprintf( fid, 'time_weight\n');
% fprintf( fid, '%f\n', time_weight);
% fprintf( fid, 'frame_disturbance:\n');
% fprintf( fid, '%f\n', frame_disturbance);
% fprintf( fid, 'frame_disturbance_asym_add\n');
% fprintf( fid, '%f\n', frame_disturbance_asym_add);
% fclose( fid);
function result_time= Lpq_weight(start_frame, stop_frame, ...
power_syllable, power_time, frame_disturbance, time_weight)
global NUMBER_OF_PSQM_FRAMES_PER_SYLLABE
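% In effect this computes a two-level weighted Lp norm of the per-frame
% disturbance: frames are first aggregated within half-overlapping
% "syllables" of NUMBER_OF_PSQM_FRAMES_PER_SYLLABE frames using the
% exponent power_syllable, and the syllable values are then combined over
% time with weights time_weight and exponent power_time, roughly
%   result = ( sum_s (w_s * Lp_s(syllable_s))^p_t / sum_s w_s^p_t )^(1/p_t)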
% fid= fopen( 'tmp_mat1.txt', 'at');
% fprintf( 'result_time:\n');
result_time= 0;
total_time_weight_time = 0;
% fprintf( 'start/end frame: %d/%d\n', start_frame, stop_frame);
for start_frame_of_syllable = start_frame: ...
NUMBER_OF_PSQM_FRAMES_PER_SYLLABE/2: stop_frame
result_syllable = 0;
count_syllable = 0;
for frame = start_frame_of_syllable: ...
start_frame_of_syllable + NUMBER_OF_PSQM_FRAMES_PER_SYLLABE- 1
if (frame <= stop_frame)
h = frame_disturbance(1+ frame);
% if (start_frame_of_syllable== 101)
% fprintf( fid, '%f\n', h);
% end
result_syllable = result_syllable+ (h^ power_syllable);
end
count_syllable = count_syllable+ 1;
end
result_syllable = result_syllable/ count_syllable;
result_syllable = result_syllable^ (1/power_syllable);
result_time= result_time+ (time_weight (...
1+ start_frame_of_syllable - start_frame) * ...
result_syllable)^ power_time;
total_time_weight_time = total_time_weight_time+ ...
time_weight (1+ start_frame_of_syllable - start_frame)^ power_time;
% fprintf( fid, '%f\n', result_time);
end
% fclose (fid);
% fprintf( 'total_time_weight_time is %f\n', total_time_weight_time);
result_time = result_time/ total_time_weight_time;
result_time= result_time^ (1/ power_time);
% fprintf( 'result_time is %f\n\n', result_time);
function [best_delay, max_correlation] = compute_delay (...
start_sample, stop_sample, search_range, ...
time_series1, time_series2)
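% Sketch of the method below: the delay is estimated by cross-correlating
% the magnitude envelopes of the two segments via the FFT (conj(fft(x1))
% times fft(x2), then an inverse FFT), searching lags in
% [-search_range, search_range) for the peak, and normalising by
% sqrt(power1 * power2).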
n = stop_sample - start_sample+ 1;
power_of_2 = 2^ (ceil( log2( 2 * n)));
power1 = pow_of (time_series1, start_sample, stop_sample, n)* ...
n/ power_of_2;
power2 = pow_of (time_series2, start_sample, stop_sample, n)* ...
n/ power_of_2;
normalization = sqrt (power1 * power2);
% fprintf( 'normalization is %f\n', normalization);
if ((power1 <= 1e-6) || (power2 <= 1e-6))
max_correlation = 0;
best_delay= 0;
return;
end
x1( 1: power_of_2)= 0;
x2( 1: power_of_2)= 0;
y( 1: power_of_2)= 0;
x1( 1: n)= abs( time_series1( start_sample: ...
stop_sample));
x2( 1: n)= abs( time_series2( start_sample: ...
stop_sample));
x1_fft= fft( x1, power_of_2)/ power_of_2;
x2_fft= fft( x2, power_of_2);
x1_fft_conj= conj( x1_fft);
y= ifft( x1_fft_conj.* x2_fft, power_of_2);
best_delay = 0;
max_correlation = 0;
% these loops could be rewritten more compactly
for i = -search_range: -1
h = abs (y (1+ i + power_of_2)) / normalization;
if (h > max_correlation)
max_correlation = h;
best_delay= i;
end
end
for i = 0: search_range- 1
h = abs (y (1+i)) / normalization;
if (h > max_correlation)
max_correlation = h;
best_delay= i;
end
end
best_delay= best_delay- 1;
function mod_disturbance_dens= multiply_with_asymmetry_factor (...
disturbance_dens, frame, pitch_pow_dens_ref, pitch_pow_dens_deg)
global Nb
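% Each band is scaled by an asymmetry factor
%   h = ((P_deg + 50) / (P_ref + 50))^1.2,
% clipped to at most 12 and zeroed when below 3, so only bands where the
% degraded signal carries noticeably more power than the reference
% survive this step.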
for i = 1: Nb
ratio = (pitch_pow_dens_deg(1+ frame, i) + 50)...
/ (pitch_pow_dens_ref (1+ frame, i) + 50);
h = ratio^ 1.2;
if (h > 12)
h = 12;
elseif (h < 3)
h = 0.0;
end
mod_disturbance_dens (i) = disturbance_dens (i) * h;
end
function loudness_dens = intensity_warping_of (...
frame, pitch_pow_dens)
global abs_thresh_power Sl Nb centre_of_band_bark
ZWICKER_POWER= 0.23;
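% Zwicker-style power-law loudness: above the absolute hearing threshold
% the band loudness grows roughly as
%   Sl * (P0/0.5)^0.23 * ((0.5 + 0.5*P/P0)^0.23 - 1),
% with the 0.23 exponent nudged upward for the lowest Bark bands via h.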
for band = 1: Nb
threshold = abs_thresh_power (band);
input = pitch_pow_dens (1+ frame, band);
if (centre_of_band_bark (band) < 4)
h = 6 / (centre_of_band_bark (band) + 2);
else
h = 1;
end
if (h > 2)
h = 2;
end
h = h^ 0.15;
modified_zwicker_power = ZWICKER_POWER * h;
if (input > threshold)
loudness_dens (band) = ((threshold / 0.5)^ modified_zwicker_power)...
* ((0.5 + 0.5 * input / threshold)^ modified_zwicker_power- 1);
else
loudness_dens (band) = 0;
end
loudness_dens (band) = loudness_dens (band)* Sl;
end
function result= pseudo_Lp (x, p)
global Nb width_of_band_bark
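% Bandwidth-weighted Lp norm over Bark bands 2..Nb:
%   result = totalWeight * ( sum_b (|x_b| * w_b)^p / totalWeight )^(1/p),
% where w_b is the Bark-domain width of band b.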
totalWeight = 0;
result = 0;
for band = 2: Nb
h = abs (x (band));
w = width_of_band_bark (band);
prod = h * w;
result = result+ prod^ p;
totalWeight = totalWeight+ w;
end
result = (result/ totalWeight)^ (1/p);
result = result* totalWeight;
function mod_pitch_pow_dens_ref= freq_resp_compensation (number_of_frames, ...
pitch_pow_dens_ref, avg_pitch_pow_dens_ref, ...
avg_pitch_pow_dens_deg, constant)
global Nb
for band = 1: Nb
x = (avg_pitch_pow_dens_deg (band) + constant) / ...
(avg_pitch_pow_dens_ref (band) + constant);
if (x > 100.0)
x = 100.0;
elseif (x < 0.01)
x = 0.01;
end
for frame = 1: number_of_frames
mod_pitch_pow_dens_ref(frame, band) = ...
pitch_pow_dens_ref(frame, band) * x;
end
end
function avg_pitch_pow_dens= time_avg_audible_of(number_of_frames, ...
silent, pitch_pow_dens, total_number_of_frames)
global Nb abs_thresh_power
for band = 1: Nb
result = 0;
for frame = 1: number_of_frames
if (~silent (frame))
h = pitch_pow_dens (frame, band);
if (h > 100 * abs_thresh_power (band))
result = result + h;
end
end
end
avg_pitch_pow_dens (band) = result/ total_number_of_frames;
end
function hz_spectrum= short_term_fft (Nf, data, Whanning, start_sample)
x1= data( start_sample: start_sample+ Nf-1).* Whanning;
x1_fft= fft( x1);
hz_spectrum= abs( x1_fft( 1: Nf/ 2)).^ 2;
hz_spectrum( 1)= 0;
function pitch_pow_dens= freq_warping( hz_spectrum, Nb, frame)
global nr_of_hz_bands_per_bark_band pow_dens_correction_factor
global Sp
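% Hz-to-Bark warping: consecutive FFT bins are summed in groups of
% nr_of_hz_bands_per_bark_band(bark_band) to form each Bark band, then
% scaled by the per-band correction factor and the overall power scale Sp.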
hz_band = 1;
for bark_band = 1: Nb
n = nr_of_hz_bands_per_bark_band (bark_band);
sum = 0;
for i = 1: n
sum = sum+ hz_spectrum( hz_band);
hz_band= hz_band+ 1;
end
sum = sum* pow_dens_correction_factor (bark_band);
sum = sum* Sp;
pitch_pow_dens (bark_band) = sum;
end
function total_audible_pow = total_audible (frame, ...
pitch_pow_dens, factor)
global Nb abs_thresh_power
total_audible_pow = 0;
for band= 2: Nb
h = pitch_pow_dens (frame+ 1,band);
threshold = factor * abs_thresh_power (band);
if (h > threshold)
total_audible_pow = total_audible_pow+ h;
end
end

View File

@ -0,0 +1,3 @@
function power= pow_of( data, start_point, end_point, divisor)
power= sum( data( start_point: end_point).^ 2)/ divisor;

View File

@ -0,0 +1,301 @@
function setup_global( sampling_rate);
global Downsample InIIR_Hsos InIIR_Nsos Align_Nfft
global DATAPADDING_MSECS SEARCHBUFFER Fs MINSPEECHLGTH JOINSPEECHLGTH
global Nutterances Largest_uttsize Nsurf_samples Crude_DelayEst
global Crude_DelayConf UttSearch_Start UttSearch_End Utt_DelayEst
global Utt_Delay Utt_DelayConf Utt_Start Utt_End
global MAXNUTTERANCES WHOLE_SIGNAL
global pesq_mos subj_mos cond_nr MINUTTLENGTH
global CALIBRATE Nfmax Nb Sl Sp
global nr_of_hz_bands_per_bark_band centre_of_band_bark
global width_of_band_hz centre_of_band_hz width_of_band_bark
global pow_dens_correction_factor abs_thresh_power
CALIBRATE= 0;
Nfmax= 512;
MAXNUTTERANCES= 50;
MINUTTLENGTH= 50;
WHOLE_SIGNAL= -1;
UttSearch_Start= zeros( 1, MAXNUTTERANCES);
UttSearch_End= zeros( 1, MAXNUTTERANCES);
Utt_DelayEst= zeros( 1, MAXNUTTERANCES);
Utt_Delay= zeros( 1, MAXNUTTERANCES);
Utt_DelayConf= zeros( 1, MAXNUTTERANCES);
Utt_Start= zeros( 1, MAXNUTTERANCES);
Utt_End= zeros( 1, MAXNUTTERANCES);
DATAPADDING_MSECS= 320;
SEARCHBUFFER= 75;
MINSPEECHLGTH= 4;
JOINSPEECHLGTH= 50;
Sp_16k = 6.910853e-006;
Sl_16k = 1.866055e-001;
fs_16k= 16000;
Downsample_16k = 64;
Align_Nfft_16k = 1024;
InIIR_Nsos_16k = 12;
InIIR_Hsos_16k = [
0.325631521, -0.086782860, -0.238848661, -1.079416490, 0.434583902;
0.403961804, -0.556985881, 0.153024077, -0.415115835, 0.696590244;
4.736162769, 3.287251046, 1.753289019, -1.859599046, 0.876284034;
0.365373469, 0.000000000, 0.000000000, -0.634626531, 0.000000000;
0.884811506, 0.000000000, 0.000000000, -0.256725271, 0.141536777;
0.723593055, -1.447186099, 0.723593044, -1.129587469, 0.657232737;
1.644910855, -1.817280902, 1.249658063, -1.778403899, 0.801724355;
0.633692689, -0.284644314, -0.319789663, 0.000000000, 0.000000000;
1.032763031, 0.268428979, 0.602913323, 0.000000000, 0.000000000;
1.001616361, -0.823749013, 0.439731942, -0.885778255, 0.000000000;
0.752472096, -0.375388990, 0.188977609, -0.077258216, 0.247230734;
1.023700575, 0.001661628, 0.521284240, -0.183867259, 0.354324187
];
Sp_8k = 2.764344e-5;
Sl_8k = 1.866055e-1;
fs_8k= 8000;
Downsample_8k = 32;
Align_Nfft_8k = 512;
InIIR_Nsos_8k = 8;
InIIR_Hsos_8k = [
0.885535424, -0.885535424, 0.000000000, -0.771070709, 0.000000000;
0.895092588, 1.292907193, 0.449260174, 1.268869037, 0.442025372;
4.049527940, -7.865190042, 3.815662102, -1.746859852, 0.786305963;
0.500002353, -0.500002353, 0.000000000, 0.000000000, 0.000000000;
0.565002834, -0.241585934, -0.306009671, 0.259688659, 0.249979657;
2.115237288, 0.919935084, 1.141240051, -1.587313419, 0.665935315;
0.912224584, -0.224397719, -0.641121413, -0.246029464, -0.556720590;
0.444617727, -0.307589321, 0.141638062, -0.996391149, 0.502251622
];
nr_of_hz_bands_per_bark_band_8k = [
1, 1, 1, 1, 1, 1, 1, 1, 2, 1, ...
1, 1, 1, 1, 2, 1, 1, 2, 2, 2, ...
2, 2, 2, 2, 2, 3, 3, 3, 3, 4, ...
3, 4, 5, 4, 5, 6, 6, 7, 8, 9, ...
9, 11
];
centre_of_band_bark_8k = [
0.078672, 0.316341, 0.636559, 0.961246, 1.290450, ...
1.624217, 1.962597, 2.305636, 2.653383, 3.005889, ...
3.363201, 3.725371, 4.092449, 4.464486, 4.841533, ...
5.223642, 5.610866, 6.003256, 6.400869, 6.803755, ...
7.211971, 7.625571, 8.044611, 8.469146, 8.899232, ...
9.334927, 9.776288, 10.223374, 10.676242, 11.134952,...
11.599563, 12.070135, 12.546731, 13.029408, 13.518232,...
14.013264, 14.514566, 15.022202, 15.536238, 16.056736,...
16.583761, 17.117382
];
centre_of_band_hz_8k = [
7.867213, 31.634144, 63.655895, 96.124611, 129.044968,...
162.421738, 196.259659, 230.563568, 265.338348, 300.588867,...
336.320129, 372.537140, 409.244934, 446.448578, 484.568604,...
526.600586, 570.303833, 619.423340, 672.121643, 728.525696,...
785.675964, 846.835693, 909.691650, 977.063293, 1049.861694,...
1129.635986, 1217.257568, 1312.109497, 1412.501465, 1517.999390,...
1628.894165, 1746.194336, 1871.568848, 2008.776123, 2158.979248,...
2326.743164, 2513.787109, 2722.488770, 2952.586670, 3205.835449,...
3492.679932, 3820.219238
];
width_of_band_bark_8k = [
0.157344, 0.317994, 0.322441, 0.326934, 0.331474, ...
0.336061, 0.340697, 0.345381, 0.350114, 0.354897, ...
0.359729, 0.364611, 0.369544, 0.374529, 0.379565, ...
0.384653, 0.389794, 0.394989, 0.400236, 0.405538, ...
0.410894, 0.416306, 0.421773, 0.427297, 0.432877, ...
0.438514, 0.444209, 0.449962, 0.455774, 0.461645, ...
0.467577, 0.473569, 0.479621, 0.485736, 0.491912, ...
0.498151, 0.504454, 0.510819, 0.517250, 0.523745, ...
0.530308, 0.536934
];
width_of_band_hz_8k = [
15.734426, 31.799433, 32.244064, 32.693359, 33.147385, ...
33.606140, 34.069702, 34.538116, 35.011429, 35.489655, ...
35.972870, 36.461121, 36.954407, 37.452911, 40.269653, ...
42.311859, 45.992554, 51.348511, 55.040527, 56.775208, ...
58.699402, 62.445862, 64.820923, 69.195374, 76.745667, ...
84.016235, 90.825684, 97.931152, 103.348877, 107.801880, ...
113.552246, 121.490601, 130.420410, 143.431763, 158.486816, ...
176.872803, 198.314697, 219.549561, 240.600098, 268.702393, ...
306.060059, 349.937012
];
pow_dens_correction_factor_8k = [
100.000000, 99.999992, 100.000000, 100.000008, 100.000008,...
100.000015, 99.999992, 99.999969, 50.000027, 100.000000,...
99.999969, 100.000015, 99.999947, 100.000061, 53.047077, ...
110.000046, 117.991989, 65.000000, 68.760147, 69.999931, ...
71.428818, 75.000038, 76.843384, 80.968781, 88.646126, ...
63.864388, 68.155350, 72.547775, 75.584831, 58.379192,...
80.950836, 64.135651, 54.384785, 73.821884, 64.437073, ...
59.176456, 65.521278, 61.399822, 58.144047, 57.004543,...
64.126297, 59.248363
];
abs_thresh_power_8k = [
51286152, 2454709.500, 70794.593750, ...
4897.788574, 1174.897705, 389.045166, ...
104.712860, 45.708820, 17.782795, ...
9.772372, 4.897789, 3.090296, ...
1.905461, 1.258925, 0.977237, ...
0.724436, 0.562341, 0.457088, ...
0.389045, 0.331131, 0.295121, ...
0.269153, 0.257040, 0.251189, ...
0.251189, 0.251189, 0.251189, ...
0.263027, 0.288403, 0.309030, ...
0.338844, 0.371535, 0.398107, ...
0.436516, 0.467735, 0.489779, ...
0.501187, 0.501187, 0.512861, ...
0.524807, 0.524807, 0.524807
];
nr_of_hz_bands_per_bark_band_16k = [
1, 1, 1, 1, 1, 1, 1, 1, 2, 1, ...
1, 1, 1, 1, 2, 1, 1, 2, 2, 2, ...
2, 2, 2, 2, 2, 3, 3, 3, 3, 4, ...
3, 4, 5, 4, 5, 6, 6, 7, 8, 9, ...
9, 12, 12, 15, 16, 18, 21, 25, 20
];
centre_of_band_bark_16k = [
0.078672, 0.316341, 0.636559, 0.961246, 1.290450, ...
1.624217, 1.962597, 2.305636, 2.653383, 3.005889, ...
3.363201, 3.725371, 4.092449, 4.464486, 4.841533, ...
5.223642, 5.610866, 6.003256, 6.400869, 6.803755, ...
7.211971, 7.625571, 8.044611, 8.469146, 8.899232, ...
9.334927, 9.776288, 10.223374, 10.676242, 11.134952, ...
11.599563, 12.070135, 12.546731, 13.029408, 13.518232, ...
14.013264, 14.514566, 15.022202, 15.536238, 16.056736, ...
16.583761, 17.117382, 17.657663, 18.204674, 18.758478, ...
19.319147, 19.886751, 20.461355, 21.043034
];
centre_of_band_hz_16k = [
7.867213, 31.634144, 63.655895, 96.124611, 129.044968,...
162.421738, 196.259659, 230.563568, 265.338348, 300.588867,...
336.320129, 372.537140, 409.244934, 446.448578, 484.568604,...
526.600586, 570.303833, 619.423340, 672.121643, 728.525696,...
785.675964, 846.835693, 909.691650, 977.063293, 1049.861694,...
1129.635986, 1217.257568, 1312.109497, 1412.501465, 1517.999390,...
1628.894165, 1746.194336, 1871.568848, 2008.776123, 2158.979248,...
2326.743164, 2513.787109, 2722.488770, 2952.586670, 3205.835449,...
3492.679932, 3820.219238, 4193.938477, 4619.846191, 5100.437012,...
5636.199219, 6234.313477, 6946.734863, 7796.473633
];
width_of_band_bark_16k = [
0.157344, 0.317994, 0.322441, 0.326934, 0.331474,...
0.336061, 0.340697, 0.345381, 0.350114, 0.354897,...
0.359729, 0.364611, 0.369544, 0.374529, 0.379565,...
0.384653, 0.389794, 0.394989, 0.400236, 0.405538,...
0.410894, 0.416306, 0.421773, 0.427297, 0.432877,...
0.438514, 0.444209, 0.449962, 0.455774, 0.461645,...
0.467577, 0.473569, 0.479621, 0.485736, 0.491912,...
0.498151, 0.504454, 0.510819, 0.517250, 0.523745,...
0.530308, 0.536934, 0.543629, 0.550390, 0.557220,...
0.564119, 0.571085, 0.578125, 0.585232
];
width_of_band_hz_16k = [
15.734426, 31.799433, 32.244064, 32.693359, ...
33.147385, 33.606140, 34.069702, 34.538116, ...
35.011429, 35.489655, 35.972870, 36.461121, ...
36.954407, 37.452911, 40.269653, 42.311859, ...
45.992554, 51.348511, 55.040527, 56.775208, ...
58.699402, 62.445862, 64.820923, 69.195374, ...
76.745667, 84.016235, 90.825684, 97.931152, ...
103.348877, 107.801880, 113.552246, 121.490601, ...
130.420410, 143.431763, 158.486816, 176.872803, ...
198.314697, 219.549561, 240.600098, 268.702393, ...
306.060059, 349.937012, 398.686279, 454.713867, ...
506.841797, 564.863770, 637.261230, 794.717285, ...
931.068359
];
pow_dens_correction_factor_16k = [
100.000000, 99.999992, 100.000000, 100.000008,...
100.000008, 100.000015, 99.999992, 99.999969, ...
50.000027, 100.000000, 99.999969, 100.000015, ...
99.999947, 100.000061, 53.047077, 110.000046, ...
117.991989, 65.000000, 68.760147, 69.999931, ...
71.428818, 75.000038, 76.843384, 80.968781, ...
88.646126, 63.864388, 68.155350, 72.547775, ...
75.584831, 58.379192, 80.950836, 64.135651, ...
54.384785, 73.821884, 64.437073, 59.176456, ...
65.521278, 61.399822, 58.144047, 57.004543, ...
64.126297, 54.311001, 61.114979, 55.077751, ...
56.849335, 55.628868, 53.137054, 54.985844, ...
79.546974
];
abs_thresh_power_16k = [
51286152.00, 2454709.500, 70794.593750, ...
4897.788574, 1174.897705, 389.045166, ...
104.712860, 45.708820, 17.782795, ...
9.772372, 4.897789, 3.090296, ...
1.905461, 1.258925, 0.977237, ...
0.724436, 0.562341, 0.457088, ...
0.389045, 0.331131, 0.295121, ...
0.269153, 0.257040, 0.251189, ...
0.251189, 0.251189, 0.251189, ...
0.263027, 0.288403, 0.309030, ...
0.338844, 0.371535, 0.398107, ...
0.436516, 0.467735, 0.489779, ...
0.501187, 0.501187, 0.512861, ...
0.524807, 0.524807, 0.524807, ...
0.512861, 0.478630, 0.426580, ...
0.371535, 0.363078, 0.416869, ...
0.537032
];
if (sampling_rate== fs_16k)
Downsample = Downsample_16k;
InIIR_Hsos = InIIR_Hsos_16k;
InIIR_Nsos = InIIR_Nsos_16k;
Align_Nfft = Align_Nfft_16k;
Fs= fs_16k;
Nb = 49;
Sl = Sl_16k;
Sp = Sp_16k;
nr_of_hz_bands_per_bark_band = nr_of_hz_bands_per_bark_band_16k;
centre_of_band_bark = centre_of_band_bark_16k;
centre_of_band_hz = centre_of_band_hz_16k;
width_of_band_bark = width_of_band_bark_16k;
width_of_band_hz = width_of_band_hz_16k;
pow_dens_correction_factor = pow_dens_correction_factor_16k;
abs_thresh_power = abs_thresh_power_16k;
return;
end
if (sampling_rate== fs_8k)
Downsample = Downsample_8k;
InIIR_Hsos = InIIR_Hsos_8k;
InIIR_Nsos = InIIR_Nsos_8k;
Align_Nfft = Align_Nfft_8k;
Fs= fs_8k;
Nb = 42;
Sl = Sl_8k;
Sp = Sp_8k;
nr_of_hz_bands_per_bark_band = nr_of_hz_bands_per_bark_band_8k;
centre_of_band_bark = centre_of_band_bark_8k;
centre_of_band_hz = centre_of_band_hz_8k;
width_of_band_bark = width_of_band_bark_8k;
width_of_band_hz = width_of_band_hz_8k;
pow_dens_correction_factor = pow_dens_correction_factor_8k;
abs_thresh_power = abs_thresh_power_8k;
return;
end

View File

@ -0,0 +1,390 @@
function split_align( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ...
deg_data, deg_Nsamples, deg_VAD, deg_logVAD, ...
Utt_Start_l, Utt_SpeechStart, Utt_SpeechEnd, Utt_End_l, ...
Utt_DelayEst_l, Utt_DelayConf_l)
global MAXNUTTERANCES Align_Nfft Downsample Window
global Utt_DelayEst Utt_Delay UttSearch_Start UttSearch_End
global Best_ED1 Best_D1 Best_DC1 Best_ED2 Best_D2 Best_DC2 Best_BP
Utt_BPs= zeros( 1, 41);
Utt_ED1= zeros( 1, 41);
Utt_ED2= zeros( 1, 41);
Utt_D1= zeros( 1, 41);
Utt_D2= zeros( 1, 41);
Utt_DC1= zeros( 1, 41);
Utt_DC2= zeros( 1, 41);
Utt_Len = Utt_SpeechEnd - Utt_SpeechStart;
Utt_Test = MAXNUTTERANCES;
Best_DC1 = 0.0;
Best_DC2 = 0.0;
kernel = Align_Nfft / 64;
Delta = Align_Nfft / (4 * Downsample);
Step = floor( ((0.801 * Utt_Len + 40 * Delta - 1)/(40 * Delta)));
Step = Step* Delta;
% fprintf( 'Step is %f\n', Step);
Pad = floor( Utt_Len / 10);
if( Pad < 75 )
Pad = 75;
end
Utt_BPs(1) = Utt_SpeechStart + Pad;
N_BPs = 1;
while( 1)
N_BPs= N_BPs+ 1;
Utt_BPs(N_BPs)= Utt_BPs(N_BPs- 1)+ Step;
if (~((Utt_BPs(N_BPs) <= (Utt_SpeechEnd- Pad)) && (N_BPs <= 40) ))
break;
end
end
if( N_BPs <= 1 )
return;
end
% fprintf( 'Utt_DelayEst_l, Utt_Start_l, N_BPs is %d,%d,%d\n', ...
% Utt_DelayEst_l, Utt_Start_l, N_BPs);
for bp = 1: N_BPs- 1
Utt_DelayEst(Utt_Test) = Utt_DelayEst_l;
UttSearch_Start(Utt_Test) = Utt_Start_l;
UttSearch_End(Utt_Test) = Utt_BPs(bp);
% fprintf( 'bp,Utt_BPs(%d) is %d,%d\n', bp,bp,Utt_BPs(bp));
crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, ...
deg_Nsamples, MAXNUTTERANCES);
Utt_ED1(bp) = Utt_Delay(Utt_Test);
Utt_DelayEst(Utt_Test) = Utt_DelayEst_l;
UttSearch_Start(Utt_Test) = Utt_BPs(bp);
UttSearch_End(Utt_Test) = Utt_End_l;
crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, ...
deg_Nsamples, MAXNUTTERANCES);
Utt_ED2(bp) = Utt_Delay(Utt_Test);
end
% stream = fopen( 'matmat.txt', 'wt' );
% for count= 1: N_BPs- 1
% fprintf( stream, '%d\n', Utt_ED2(count));
% end
% fclose( stream );
Utt_DC1(1: N_BPs-1) = -2.0;
% stream= fopen( 'what_mmm.txt', 'at');
while( 1 )
bp = 1;
while( (bp <= N_BPs- 1) && (Utt_DC1(bp) > -2.0) )
bp = bp+ 1;
end
if( bp >= N_BPs )
break;
end
estdelay = Utt_ED1(bp);
% fprintf( 'bp,estdelay is %d,%d\n', bp, estdelay);
H(1: Align_Nfft)= 0;
Hsum = 0.0;
startr = (Utt_Start_l- 1) * Downsample+ 1;
startd = startr + estdelay;
% fprintf( 'startr/startd is %d/%d\n', startr, startd);
if ( startd < 1 ) % MATLAB indices are 1-based
startr = -estdelay+ 1;
startd = 1;
end
while( ((startd + Align_Nfft) <= 1+ deg_Nsamples) &&...
((startr + Align_Nfft) <= (1+ (Utt_BPs(bp)- 1) * Downsample)) )
X1= ref_data(startr: startr+ Align_Nfft- 1).* Window;
X2= deg_data(startd: startd+ Align_Nfft- 1).* Window;
X1_fft= fft( X1, Align_Nfft );
X1_fft_conj= conj( X1_fft);
X2_fft= fft( X2, Align_Nfft );
X1= ifft( X1_fft_conj.* X2_fft, Align_Nfft);
X1= abs( X1);
v_max= max( X1)* 0.99;
n_max = (v_max^ 0.125 )/ kernel;
% fprintf( stream, '%f %f\n', v_max, n_max);
for count = 0: Align_Nfft- 1
if( X1(count+ 1) > v_max )
Hsum = Hsum+ n_max * kernel;
for k = 1-kernel: kernel- 1
H(1+ rem( count+ k+ Align_Nfft, Align_Nfft))= ...
H(1+ rem(count+ k+ Align_Nfft, Align_Nfft))+ ...
n_max* (kernel- abs(k));
end
end
end
startr = startr+ (Align_Nfft / 4);
startd = startd+ (Align_Nfft / 4);
end
[v_max, I_max] = max( H);
if( I_max- 1 >= (Align_Nfft/2) )
I_max = I_max- Align_Nfft;
end
Utt_D1(bp) = estdelay + I_max- 1;
if( Hsum > 0.0 )
% if (Utt_Len== 236)
% fprintf( 'v_max, Hsum is %f, %f\n', v_max, Hsum);
% end
Utt_DC1(bp) = v_max / Hsum;
else
Utt_DC1(bp) = 0.0;
end
% fprintf( 'bp/startr/startd is %d/%d/%d\n', bp, startr, startd);
while( bp < (N_BPs - 1) )
bp = bp + 1;
if( (Utt_ED1(bp) == estdelay) && (Utt_DC1(bp) <= -2.0) )
% loopno= 0;
while(((startd+ Align_Nfft)<= 1+ deg_Nsamples) && ...
((startr+ Align_Nfft)<= ...
((Utt_BPs(bp)- 1)* Downsample+ 1) ))
X1= ref_data( startr: startr+ Align_Nfft- 1).* ...
Window;
% % if (Utt_Len== 321)
% fid= fopen( 'what_mat.txt', 'at');
% fprintf( fid, '%f\n', Window);
% fclose( fid);
% % fprintf( '\n');
% % end
X2= deg_data( startd: startd+ Align_Nfft- 1).* ...
Window;
X1_fft= fft( X1, Align_Nfft );
X1_fft_conj= conj( X1_fft);
X2_fft= fft( X2, Align_Nfft );
X1= ifft( X1_fft_conj.* X2_fft, Align_Nfft);
X1= abs( X1);
v_max = 0.99* max( X1);
n_max = (v_max^ 0.125)/ kernel;
% fprintf( 'v_max n_max is %f %f\n', v_max, n_max);
for count = 0: Align_Nfft- 1
if( X1(count+ 1) > v_max )
Hsum = Hsum+ n_max * kernel;
for k = 1-kernel: kernel-1
H(1+ rem( count+ k+ Align_Nfft, Align_Nfft))= ...
H(1+ rem(count+ k+ Align_Nfft, Align_Nfft))+ ...
n_max* (kernel- abs(k));
end
end
end
startr = startr+ (Align_Nfft / 4);
startd = startd+ (Align_Nfft / 4);
% loopno= loopno+ 1;
end
% fprintf( 'loopno is %d\n', loopno);
[v_max, I_max] = max( H);
% fprintf( 'I_max is %d ', I_max);
if( I_max- 1 >= (Align_Nfft/2) )
I_max = I_max- Align_Nfft;
end
Utt_D1(bp) = estdelay + I_max- 1;
if( Hsum > 0.0 )
% fprintf( 'v_max Hsum is %f %f\n', v_max, Hsum);
Utt_DC1(bp) = v_max / Hsum;
else
Utt_DC1(bp) = 0.0;
end
end
end
end
% fclose( stream);
for bp= 1: N_BPs- 1
if( Utt_DC1(bp) > Utt_DelayConf_l )
Utt_DC2(bp) = -2.0;
else
Utt_DC2(bp) = 0.0;
end
end
while( 1 )
bp = N_BPs- 1;
while( (bp >= 1) && (Utt_DC2(bp) > -2.0) )
bp = bp- 1;
end
if( bp < 1 )
break;
end
estdelay = Utt_ED2(bp);
H( 1: Align_Nfft)= 0;
Hsum = 0.0;
startr = (Utt_End_l- 1)* Downsample+ 1- Align_Nfft;
startd = startr + estdelay;
% fprintf( '***NEW startr is %d\n', startr);
% fprintf( 'startr/d, deg_Nsamples is %d/%d, %d\n', startr,startd, ...
% deg_Nsamples);
% fprintf( 'deg_data has %d elements\n', numel( deg_data));
if ( (startd + Align_Nfft) > deg_Nsamples+ 1 )
startd = deg_Nsamples - Align_Nfft+ 1;
startr = startd - estdelay;
end
while( (startd>= 1) && (startr>= (Utt_BPs(bp)- 1)* Downsample+ 1) )
X1= ref_data( startr: startr+ Align_Nfft- 1).* Window;
X2= deg_data( startd: startd+ Align_Nfft- 1).* Window;
X1_fft= fft( X1, Align_Nfft);
X1_fft_conj= conj( X1_fft);
X2_fft= fft( X2, Align_Nfft);
X1= ifft( X1_fft_conj.* X2_fft, Align_Nfft );
X1= abs( X1);
v_max = max( X1)* 0.99;
n_max = ( v_max^ 0.125 )/ kernel;
for count = 0: Align_Nfft- 1
if( X1(count+ 1) > v_max )
Hsum = Hsum+ n_max * kernel;
for k = 1-kernel: kernel- 1
H(1+ rem(count+ k+ Align_Nfft, Align_Nfft))= ...
H(1+ rem(count+ k+ Align_Nfft, Align_Nfft))+ ...
n_max* (kernel- abs(k));
end
end
end
startr = startr- (Align_Nfft / 4);
startd = startd- (Align_Nfft / 4);
end
[v_max, I_max] = max( H);
if( I_max- 1 >= (Align_Nfft/2) )
I_max = I_max- Align_Nfft;
end
Utt_D2(bp) = estdelay + I_max- 1;
if( Hsum > 0.0 )
Utt_DC2(bp) = v_max / Hsum;
else
Utt_DC2(bp) = 0.0;
end
while( bp > 1 )
bp = bp - 1;
if( (Utt_ED2(bp) == estdelay) && (Utt_DC2(bp) <= -2.0) )
while( (startd >= 1) && (startr >= (Utt_BPs(bp)- 1) * Downsample+ 1))
X1= ref_data( startr: startr+ Align_Nfft- 1).* Window;
X2= deg_data( startd: startd+ Align_Nfft- 1).* Window;
X1_fft_conj= conj( fft( X1, Align_Nfft));
X2_fft= fft( X2, Align_Nfft);
X1= ifft( X1_fft_conj.* X2_fft, Align_Nfft);
X1= abs( X1);
v_max = max( X1)* 0.99;
n_max = (v_max^ 0.125)/ kernel;
for count = 0: Align_Nfft- 1
if( X1(count+ 1) > v_max )
Hsum = Hsum+ n_max * kernel;
for k = 1-kernel: kernel- 1
H(1+ rem( count+ k+ Align_Nfft, Align_Nfft))= ...
H(1+ rem(count+ k+ Align_Nfft, Align_Nfft))+ ...
n_max* (kernel- abs(k));
end
end
end
startr = startr- (Align_Nfft / 4);
startd = startd- (Align_Nfft / 4);
end
[v_max, I_max] = max( H);
if( I_max- 1 >= (Align_Nfft/2) )
I_max = I_max- Align_Nfft;
end
Utt_D2(bp) = estdelay + I_max- 1;
if( Hsum > 0.0 )
Utt_DC2(bp) = v_max / Hsum;
else
Utt_DC2(bp) = 0.0;
end
end
end
end
% fid= fopen( 'uttinfo_mat.txt', 'wt');
% fprintf( fid, '%f\n', Utt_D2);
% fprintf( fid, '\n');
% fprintf( fid, '%f\n', Utt_DC2);
% fclose( fid);
% fprintf( 'Utt_Len, N_BPs is %d, %d\n', Utt_Len, N_BPs);
for bp = 1: N_BPs- 1
if( (abs(Utt_D2(bp) - Utt_D1(bp)) >= Downsample) && ...
((Utt_DC1(bp)+ Utt_DC2(bp))> (Best_DC1 + Best_DC2)) &&...
(Utt_DC1(bp) > Utt_DelayConf_l) && ...
(Utt_DC2(bp) > Utt_DelayConf_l) )
Best_ED1 = Utt_ED1(bp);
Best_D1 = Utt_D1(bp);
Best_DC1 = Utt_DC1(bp);
Best_ED2 = Utt_ED2(bp);
Best_D2 = Utt_D2(bp);
Best_DC2 = Utt_DC2(bp);
Best_BP = Utt_BPs(bp);
% fprintf( 'in loop...');
end
end
% if (Utt_Len== 236)
% fid= fopen( 'matmat.txt', 'wt');
% fprintf( fid, 'N_BPs is %d\n', N_BPs);
% fprintf( fid, 'Utt_DelayConf is %f\n', Utt_DelayConf_l);
% fprintf( fid, 'ED2\t ED1\t D2\t D1\t DC2\t DC1\t BPs\n');
% for bp= 1: N_BPs- 1
% fprintf( fid, '%d\t %d\t %d\t %d\t %f\t %f\t %d\n', Utt_ED2( bp), ...
% Utt_ED1( bp), Utt_D2(bp), Utt_D1(bp), Utt_DC2(bp),...
% Utt_DC1( bp), Utt_BPs( bp));
% end
% fclose( fid);
% end

View File

@ -0,0 +1,76 @@
function time_align(ref_data, ref_Nsamples, ...
deg_data, deg_Nsamples, Utt_id)
global Utt_DelayEst Utt_Delay Utt_DelayConf UttSearch_Start UttSearch_End
global Align_Nfft Downsample Window
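% Sketch of the method below: FFT-based cross-correlations are computed for
% successive Align_Nfft-sample windows, lags near each correlation peak are
% accumulated into the histogram H, H is smoothed with a small triangular
% kernel (again via the FFT), and the histogram peak gives the refined
% delay estimate together with a confidence value.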
estdelay = Utt_DelayEst(Utt_id);
H = zeros( 1, Align_Nfft);
X1= zeros( 1, Align_Nfft);
X2= zeros( 1, Align_Nfft);
startr = (UttSearch_Start(Utt_id)- 1)* Downsample+ 1;
startd = startr + estdelay;
if ( startd < 1 ) % MATLAB indices are 1-based
startr = 1 -estdelay;
startd = 1;
end
while( ((startd + Align_Nfft) <= deg_Nsamples) && ...
((startr + Align_Nfft) <= ((UttSearch_End(Utt_id)- 1) * Downsample)) )
X1= ref_data( startr: startr+ Align_Nfft- 1).* Window;
X2= deg_data( startd: startd+ Align_Nfft- 1).* Window;
% find cross-correlation between X1 and X2
X1_fft= fft( X1, Align_Nfft );
X1_fft_conj= conj( X1_fft);
X2_fft= fft( X2, Align_Nfft );
X1= ifft( X1_fft_conj.* X2_fft, Align_Nfft );
X1= abs( X1);
v_max = max( X1)* 0.99;
X1_greater_vmax= find( X1 > v_max );
H( X1_greater_vmax )= H( X1_greater_vmax )+ v_max^ 0.125;
startr = startr+ Align_Nfft/ 4;
startd = startd+ Align_Nfft/ 4;
end
X1= H;
X2= zeros( 1, Align_Nfft);
Hsum = sum( H);
X2(1) = 1.0;
kernel = Align_Nfft / 64;
for count= 2: kernel
X2( count)= 1- (count- 1)/ kernel;
X2( Align_Nfft- count+ 2)= 1- (count- 1)/ kernel;
end
X1_fft= fft( X1, Align_Nfft );
X2_fft= fft( X2, Align_Nfft );
X1= ifft( X1_fft.* X2_fft, Align_Nfft );
if (Hsum> 0)
H= abs( X1)/ Hsum;
else
H= 0;
end
[v_max, I_max] = max( H);
if( I_max- 1 >= (Align_Nfft/2) )
I_max = I_max- Align_Nfft;
end
Utt_Delay(Utt_id) = estdelay + I_max- 1;
Utt_DelayConf(Utt_id) = v_max; % confidence

View File

@ -0,0 +1,26 @@
function utterance_locate (ref_data, ref_Nsamples, ref_VAD, ref_logVAD,...
deg_data, deg_Nsamples, deg_VAD, deg_logVAD);
global Nutterances Utt_Delay Utt_DelayConf Utt_Start Utt_End Utt_DelayEst
id_searchwindows( ref_VAD, ref_Nsamples, deg_VAD, deg_Nsamples);
for Utt_id= 1: Nutterances
%fprintf( 1, 'Utt_id is %d\n', Utt_id);
crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, deg_Nsamples, Utt_id);
time_align(ref_data, ref_Nsamples, ...
deg_data, deg_Nsamples, Utt_id);
end
id_utterances( ref_Nsamples, ref_VAD, deg_Nsamples);
utterance_split( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ...
deg_data, deg_Nsamples, deg_VAD, deg_logVAD);

View File

@ -0,0 +1,122 @@
function utterance_split( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ...
deg_data, deg_Nsamples, deg_VAD, deg_logVAD)
global Nutterances MAXNUTTERANCES Downsample SEARCHBUFFER
global Utt_DelayEst Utt_Delay Utt_DelayConf UttSearch_Start
global Utt_Start Utt_End Largest_uttsize UttSearch_End
global Best_ED1 Best_D1 Best_DC1 Best_ED2 Best_D2 Best_DC2 Best_BP
Utt_id = 1;
while( (Utt_id <= Nutterances) && (Nutterances <= MAXNUTTERANCES) )
Utt_DelayEst_l = Utt_DelayEst(Utt_id);
Utt_Delay_l = Utt_Delay(Utt_id);
Utt_DelayConf_l = Utt_DelayConf(Utt_id);
Utt_Start_l = Utt_Start(Utt_id);
Utt_End_l = Utt_End(Utt_id);
Utt_SpeechStart = Utt_Start_l;
% fprintf( 'SpeechStart is %d\n', Utt_SpeechStart);
while( (Utt_SpeechStart < Utt_End_l) && ...
(ref_VAD(Utt_SpeechStart)<= 0.0) )
Utt_SpeechStart = Utt_SpeechStart + 1;
end %find the SpeechStart for each utterance
Utt_SpeechEnd = Utt_End_l;
% fprintf( 'SpeechEnd is %d\n', Utt_SpeechEnd);
while( (Utt_SpeechEnd > Utt_Start_l) && ...
(ref_VAD(Utt_SpeechEnd) <= 0))
Utt_SpeechEnd = Utt_SpeechEnd- 1;
end
Utt_SpeechEnd = Utt_SpeechEnd+ 1;
%find SpeechEnd for each utterance
Utt_Len = Utt_SpeechEnd - Utt_SpeechStart;
% fprintf( 'Utt_Len is %d\n', Utt_Len);
if( Utt_Len >= 200 )
split_align( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ...
deg_data, deg_Nsamples, deg_VAD, deg_logVAD, ...
Utt_Start_l, Utt_SpeechStart, Utt_SpeechEnd, Utt_End_l, ...
Utt_DelayEst_l, Utt_DelayConf_l);
% fprintf( '\nBest_ED1, Best_D1, Best_DC1 is %d, %d, %f\n',...
% Best_ED1, Best_D1, Best_DC1);
% fprintf( 'Best_ED2, Best_D2, Best_DC2 is %d, %d, %f\n',...
% Best_ED2, Best_D2, Best_DC2);
% fprintf( 'Best_BP is %d\n', Best_BP);
if( (Best_DC1 > Utt_DelayConf_l) && (Best_DC2 > Utt_DelayConf_l) )
for step = Nutterances: -1: Utt_id+ 1
Utt_DelayEst(step+ 1) = Utt_DelayEst(step);
Utt_Delay(step+ 1) = Utt_Delay(step);
Utt_DelayConf(step+ 1) = Utt_DelayConf(step);
Utt_Start(step+ 1) = Utt_Start(step);
Utt_End(step+ 1) = Utt_End(step);
UttSearch_Start(step+ 1) = Utt_Start( step);
UttSearch_End(step+ 1) = Utt_End( step);
end
Nutterances = Nutterances+ 1;
Utt_DelayEst(Utt_id) = Best_ED1;
Utt_Delay(Utt_id) = Best_D1;
Utt_DelayConf(Utt_id) = Best_DC1;
Utt_DelayEst(Utt_id +1) = Best_ED2;
Utt_Delay(Utt_id +1) = Best_D2;
Utt_DelayConf(Utt_id +1) = Best_DC2;
UttSearch_Start(Utt_id +1) = UttSearch_Start(Utt_id);
UttSearch_End(Utt_id +1) = UttSearch_End( Utt_id);
if( Best_D2 < Best_D1 )
Utt_Start(Utt_id) = Utt_Start_l;
Utt_End(Utt_id) = Best_BP;
Utt_Start(Utt_id +1) = Best_BP;
Utt_End(Utt_id +1) = Utt_End_l;
else
Utt_Start( Utt_id) = Utt_Start_l;
Utt_End( Utt_id) = Best_BP + ...
floor( (Best_D2- Best_D1)/ (2 * Downsample));
Utt_Start( Utt_id +1) = Best_BP - ...
floor( (Best_D2- Best_D1)/ (2 * Downsample));
Utt_End( Utt_id +1) = Utt_End_l;
end
if( (Utt_Start(Utt_id)- SEARCHBUFFER- 1)* Downsample+ 1+ ...
Best_D1 < 0 )
Utt_Start(Utt_id) = SEARCHBUFFER+ 1+ ...
floor( (Downsample - 1 - Best_D1) / Downsample);
end
if( ((Utt_End( Utt_id +1)- 1)* Downsample+ 1 + Best_D2) >...
(deg_Nsamples - SEARCHBUFFER * Downsample) )
Utt_End( Utt_id +1) = floor( (deg_Nsamples - Best_D2)...
/ Downsample)- SEARCHBUFFER+ 1;
end
else
Utt_id= Utt_id+ 1;
end
else
Utt_id = Utt_id+ 1;
end
end
Largest_uttsize = max( Utt_End- Utt_Start);
% fid= fopen( 'uttinfo_mat.txt', 'wt');
% fprintf( fid, 'Number of Utterances is:\n');
% fprintf( fid, '%d\n', Nutterances);
% fprintf( fid, 'Utterance Delay Estimation:\n');
% fprintf( fid, '%d\n', Utt_DelayEst( 1: Nutterances) );
% fprintf( fid, 'Utterance Delay:\n');
% fprintf( fid, '%d\n', Utt_Delay( 1: Nutterances));
% fprintf( fid, 'Utterance Delay Confidence:\n');
% fprintf( fid, '%f\n', Utt_DelayConf( 1: Nutterances));
% fprintf( fid, 'Utterance Start:\n');
% fprintf( fid, '%d\n', Utt_Start( 1: Nutterances));
% fprintf( fid, 'Utterance End:\n');
% fprintf( fid, '%d\n', Utt_End( 1: Nutterances));
% fprintf( fid, 'Largest utterance length:\n');
% fprintf( fid, '%d\n', Largest_uttsize);
% fclose( fid);

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,41 @@
import numpy as np
import matplotlib.pyplot as plt
import scipy.signal
import sounddevice as sd
from scipy.io import wavfile

SOUND_PATH = "noisefiles/train.dat"
def normalize_signal(signal):
    """Rescale a signal so that its samples span the range [-1, 1]."""
    signal = np.asarray(signal, dtype=float)  # avoid in-place float ops on integer wav data
    min_amp = np.min(signal)
    normalized_signal = signal - min_amp
    max_amp = np.max(normalized_signal)
    normalized_signal *= 2 / max_amp
    normalized_signal -= 1
    return normalized_signal
def load_audiofile(path):
    """Load a .dat noise file (one sample per line) or a .wav file."""
    sound_data = []
    sample_rate = 8000  # .dat files carry no header; 8 kHz is assumed for them
    if path[-3:] == "dat":
        with open(path, "r") as sound_file:
            sound_data_strings = sound_file.readlines()
        for data_string in sound_data_strings:
            sound_data.append(float(data_string.strip()))
        sound_data = np.array(sound_data)
    elif path[-3:] == "wav":
        sample_rate, sound_data = wavfile.read(path)
    return sample_rate, sound_data
def main():
sample_rate, sound_data = load_audiofile(SOUND_PATH)
print(sample_rate)
sd.play(normalize_signal(sound_data), samplerate=sample_rate, blocking=True)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,176 @@
function audnoise(ns_file,outfile)
%
% Implements the audible-noise suppression algorithm [1].
%
% Usage: audnoise(noisyFile, outputFile)
%
% infile - noisy speech file in .wav format
% outputFile - enhanced output file in .wav format
%
% It runs 2 iterations, but one could change the number of iterations by
% modifying accordingly the variable iter_num on line 33.
%
% Example call: audnoise('sp04_babble_sn10.wav','out_aud.wav');
%
% References:
% [1] Tsoukalas, D. E., Mourjopoulos, J. N., and Kokkinakis, G. (1997). Speech
% enhancement based on audible noise suppression. IEEE Trans. on Speech and
% Audio Processing, 5(6), 497-514.
%
% Authors: Yi Hu and Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
%-------------------------------------------------------------------------
if nargin<2
fprintf('Usage: audnoise(noisyfile.wav,outFile.wav) \n\n');
return;
end
iter_num=2; % number of iterations
NF_SABSENT= 6;
%this is the number of speech-absent frames to estimate the initial
%noise power spectrum
[nsdata, Fs, bits]= wavread( ns_file); %nsdata is a column vector
aa=0.98;
mu=0.98;
eta=0.15;
nwind= floor( 20* Fs/ 1000); %this corresponds to 20ms window
if rem( nwind, 2)~= 0, nwind= nwind+ 1; end % make the window length even
noverlap= nwind/ 2;
w= hamming( nwind);
rowindex= ( 1: nwind)';
%we assume the first NF_SABSENT frames are speech absent, we use them to estimate the noise power spectrum
noisedata= nsdata( 1: nwind* NF_SABSENT); noise_colindex= 1+ ( 0: NF_SABSENT- 1)* nwind;
noisematrixdata = zeros( nwind, NF_SABSENT);
noisematrixdata( :)= noisedata( ...
rowindex( :, ones(1, NF_SABSENT))+ noise_colindex( ones( nwind, 1), :)- 1);
noisematrixdata= noisematrixdata.* w( :, ones( 1, NF_SABSENT)) ; %WINDOWING NOISE DATA
noise_ps= mean( (abs( fft( noisematrixdata))).^ 2, 2); %NOTE!!!! it is a column vector
% ----- estimate noise in CBs ------------------
%
noise_b=zeros(nwind/2+1,1);
[CB_FREQ_INDICES]=find_CB_FREQ_INDICES(Fs,nwind,16,nwind/2);
for i = 1:length(CB_FREQ_INDICES)
noise_b(CB_FREQ_INDICES{i})=ones(size(CB_FREQ_INDICES{i},2),1)*mean(noise_ps(CB_FREQ_INDICES{i}));
end
noise_b1=[noise_b; fliplr(noise_b(2:nwind/2))];
nslide= nwind- noverlap;
x= nsdata;
nx= length( x); ncol= fix(( nx- noverlap)/ nslide);
colindex = 1 + (0: (ncol- 1))* nslide;
if nx< (nwind + colindex(ncol) - 1)
x(nx+ 1: nwind+ colindex(ncol) - 1) = ...
rand( nwind+ colindex( ncol)- 1- nx, 1)* (2^ (-15)); % pad the tail with very low-level noise
end
es_old= zeros( noverlap, 1);
%es_old is actually the second half of the previous enhanced speech frame,
%it is used for overlap-add
for k= 1: ncol
y= x( colindex( k): colindex( k)+ nwind- 1);
y= y.* w; %WINDOWING NOISY SPEECH DATA
y_spec= fft( y); y_specmag= abs( y_spec); y_specang= angle( y_spec);
%they are the frequency spectrum, spectrum magnitude and spectrum phase, respectively
y_ps= y_specmag.^ 2; %power spectrum of noisy speech
y_ps1=y_ps(1:nwind/2+1);
% ====start of vad ===
gammak=min(y_ps./noise_ps,40); % post SNR
if k==1
ksi=aa+(1-aa)*max(gammak-1,0);
else
ksi=aa*Xk_prev./noise_ps + (1-aa)*max(gammak-1,0); % a priori SNR
end
log_sigma_k= gammak.* ksi./ (1+ ksi)- log(1+ ksi);
vad_decision= sum( log_sigma_k)/ nwind;
if (vad_decision < eta)
% noise only frame found
noise_ps= mu* noise_ps+ (1- mu)* y_ps;
end
for i = 1:length(CB_FREQ_INDICES)
noise_b(CB_FREQ_INDICES{i})=...
ones(size(CB_FREQ_INDICES{i},2),1)*mean(noise_ps(CB_FREQ_INDICES{i}));
end
% ===end of vad===
x_cons1=max(y_ps-noise_ps,0.001);
% conservative estimate of x from power spectral subtraction
x_cons = x_cons1(1:nwind/2+1);
% --- Estimate masking thresholds iteratively (as per page 505) ----
%
Tk0=mask(x_cons,nwind,Fs,16);
Xp=y_ps1;
for j=1:iter_num
ab = noise_b+(noise_b.^2)./Tk0; % Eq. 41
Xp=(Xp.^2)./(ab+Xp); % Eq. 40
Tk0=mask(Xp,nwind,Fs,16);
end
% --- Estimate alpha ------
%
alpha = (noise_b+Tk0).*(noise_b./Tk0);
% eq. 26 for Threshold (T) method with ni(b)=1
% ---- Apply suppression rule --------------
%
H0 = (Xp./(alpha+Xp));
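% The gain above has a parametric Wiener form X/(alpha + X), with alpha
% tied to the per-band masking threshold Tk0 and the critical-band noise
% estimate, so suppression is relaxed wherever the noise would be masked
% anyway.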
H=[H0(1:nwind/2+1); flipud(H0(2:nwind/2))];
x_hat = H.*y_spec;
Xk_prev= abs( x_hat).^ 2;
es_tmp=real(ifft(x_hat));
% ---- Overlap and add ---------------
es_data( colindex( k): colindex( k)+ nwind- 1)= [es_tmp( 1: noverlap)+ es_old;...
es_tmp( noverlap+ 1: nwind)];
%overlap-add
es_old= es_tmp( nwind- noverlap+ 1: nwind);
end
wavwrite( es_data, Fs, bits, outfile);
%------------------------------------------------------
function [CB_FREQ_INDICES]=find_CB_FREQ_INDICES(Fs,dft_length,nbits,frame_overlap)
% This function is from Matlab STSA Toolbox for Audio Signal Noise Reduction
% Copyright (C) 2001 Patrick J. Wolfe
freq_val = (0:Fs/dft_length:Fs/2)';
freq=freq_val;
crit_band_ends = [0;100;200;300;400;510;630;770;920;1080;1270;1480;1720;2000;2320;2700;3150;3700;4400;5300;6400;7700;9500;12000;15500;Inf];
imax = max(find(crit_band_ends < freq(end)));
num_bins = length(freq);
LIN_TO_BARK = zeros(imax,num_bins);
i = 1;
for j = 1:num_bins
while ~((freq(j) >= crit_band_ends(i)) & (freq(j) < crit_band_ends(i+1))),i = i+1;end
LIN_TO_BARK(i,j) = 1;
end
% Calculation of critical band frequency indices--i.e., which bins are in which critical band
for i=1:imax,
CB_FREQ_INDICES{i} = find(LIN_TO_BARK(i,:));
end

View File

@ -0,0 +1,47 @@
function z=confhyperg(a,b,x,n)
%
% Computes the confluent hypergeometric function
% using a series expansion:
%
% f(a,b;x)=
%
% 1 + [a/1!b]x + [a(a+1)/2!b(b+1)]x^2 +
% [a(a+1)(a+2)/3!b(b+1)(b+2)]x^3 + ...
%
% The above series is expanded to n terms
%
%
%
% Philipos C. Loizou
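% A quick sanity check of the truncated series: for a == b the function
% reduces to exp(x), so, for example, confhyperg(2, 2, 1, 50) should come
% out very close to exp(1) = 2.7183.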
if nargin ~= 4
error('Usage: confhyperg(a,b,x,n) - Incorrect number of arguments')
end
if (n <= 0 | n ~= floor(n))
error('Usage: confhyperg(a,b,x,n) - n has to be a positive integer')
end
NEG=0;
if x<0
x=abs(x);
a=b-a;
NEG=1;
end
z = 0;
m = 0;
while (m<n)
if (m == 0)
delta = 1;
else
delta = delta .* x .* (a + (m - 1)) ./ (m .* (b + (m-1)));
end
z = z + delta;
m = m + 1;
end
if NEG==1 % if x<0
z=exp(-x).*z;
end;

View File

@ -0,0 +1,54 @@
function z=hyperg(a,b,c,x,n)
% HYPERGEOMETRIC2F1 Computes the hypergeometric function
% using a series expansion:
%
% f(a,b;c;x)=
%
% 1 + [ab/1!c]x + [a(a+1)b(b+1)/2!c(c+1)]x^2 +
% [a(a+1)(a+2)b(b+1)(b+2)/3!c(c+1)(c+2)]x^3 + ...
%
% The series is expanded to n terms
%
% This function solves the Gaussian Hypergeometric Differential Equation:
%
% x(1-x)y'' + {c-(a+b+1)x}y' - aby = 0
%
% The Hypergeometric function converges only for:
% |x| < 1
% c != 0, -1, -2, -3, ...
%
%
% Comments to:
% Diego Garcia - d.garcia@ieee.org
% Chuck Mongiovi - mongiovi@fast.net
% June 14, 2002
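% A quick sanity check of the truncated series: 2F1(1,1;2;x) = -log(1-x)/x,
% so, for example, hyperg(1, 1, 2, 0.5, 100) should come out very close to
% -log(0.5)/0.5 = 1.3863.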
if nargin ~= 5
error('Usage: hypergeometric2f1(a,b,c,x,n) --> Wrong number of arguments')
end
if (n <= 0 | n ~= floor(n))
error('Usage: hypergeometric2f1(a,b,c,x,n) --> n has to be a positive integer')
end
% if (abs(x) > 1)
% z=min(0.99,x);
% return;
% error('Usage: hypergeometric2f1(a,b,c,x,n) --> |x| has to be less than 1')
% end
if (c <= 0 & c == floor(c))
error('Usage: hypergeometric2f1(a,b,c,x,n) --> c != 0, -1, -2, -3, ...')
end
z = 0;
m = 0;
while (m<n)
if (m == 0)
delta = 1;
else
delta = delta .* x .* (a + (m - 1)) .* (b + (m-1)) ./ m ./ (c + (m-1));
end
z = z + delta;
m = m + 1;
end

View File

@ -0,0 +1,119 @@
function logmmse(filename,outfile)
%
% Implements the logMMSE algorithm [1].
%
% Usage: logmmse(noisyFile, outputFile)
%
% infile - noisy speech file in .wav format
% outputFile - enhanced output file in .wav format
%
%
% Example call: logmmse('sp04_babble_sn10.wav','out_log.wav');
%
% References:
% [1] Ephraim, Y. and Malah, D. (1985). Speech enhancement using a minimum
% mean-square error log-spectral amplitude estimator. IEEE Trans. Acoust.,
% Speech, Signal Process., ASSP-33(2), 443-445.
%
% Authors: Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
%-------------------------------------------------------------------------
if nargin<2
fprintf('Usage: logmmse(noisyfile.wav,outFile.wav) \n\n');
return;
end
[x, Srate, bits]= wavread( filename); % x is a column vector
% =============== Initialize variables ===============
len=floor(20*Srate/1000); % Frame size in samples
if rem(len,2)==1, len=len+1; end;
PERC=50; % window overlap in percent of frame size
len1=floor(len*PERC/100);
len2=len-len1;
win=hamming(len); % define window
% Noise magnitude calculations - assuming that the first 6 frames are
% noise/silence
nFFT=2*len;
noise_mean=zeros(nFFT,1);
j=1;
for m=1:6
noise_mean=noise_mean+abs(fft(win.*x(j:j+len-1),nFFT));
j=j+len;
end
noise_mu=noise_mean/6;
noise_mu2=noise_mu.^2;
%--- allocate memory and initialize various variables
x_old=zeros(len1,1);
Nframes=floor(length(x)/len2)-floor(len/len2);
xfinal=zeros(Nframes*len2,1);
%=============================== Start Processing =======================================================
%
k=1;
aa=0.98;
mu=0.98;
eta=0.15;
ksi_min=10^(-25/10);
for n=1:Nframes
insign=win.*x(k:k+len-1);
spec=fft(insign,nFFT);
sig=abs(spec); % compute the magnitude
sig2=sig.^2;
gammak=min(sig2./noise_mu2,40); % limit post SNR to avoid overflows
if n==1
ksi=aa+(1-aa)*max(gammak-1,0);
else
ksi=aa*Xk_prev./noise_mu2 + (1-aa)*max(gammak-1,0); % a priori SNR
ksi=max(ksi_min,ksi); % limit ksi to -25 dB
end
log_sigma_k= gammak.* ksi./ (1+ ksi)- log(1+ ksi);
vad_decision= sum(log_sigma_k)/ len;
if (vad_decision< eta)
% noise only frame found
noise_mu2= mu* noise_mu2+ (1- mu)* sig2;
end
% ===end of vad===
A=ksi./(1+ksi); % Log-MMSE estimator
vk=A.*gammak;
ei_vk=0.5*expint(vk);
hw=A.*exp(ei_vk);
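% This is the Ephraim-Malah (1985) log-MMSE gain
%   G = (ksi./(1+ksi)) .* exp(0.5*expint(vk)), with vk = ksi.*gammak./(1+ksi),
% where expint is the exponential integral E1.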
sig=sig.*hw;
Xk_prev=sig.^2;
xi_w= ifft( hw .* spec,nFFT);
xi_w= real( xi_w);
xfinal(k:k+ len2-1)= x_old+ xi_w(1:len1);
x_old= xi_w(len1+ 1: len);
k=k+len2;
end
wavwrite(xfinal,Srate,16,outfile);

View File

@ -0,0 +1,287 @@
function logmmse_SPU(filename,outfile,option)
%
% Implements the logMMSE algorithm with signal-presence uncertainty (SPU) [1].
% Four different methods for estimating the a priori probability of speech absence
% (P(H0)) are implemented.
%
% Usage: logmmse_SPU(noisyFile, outputFile, option)
%
% infile - noisy speech file in .wav format
% outputFile - enhanced output file in .wav format
% option - method used to estimate the a priori probability of speech
% absence, P(Ho):
% 1 - hard decision (Soon et al. [2])
% 2 - soft decision (Soon et al. [2])
% 3 - Malah et al.(1999) - ICASSP
% 4 - Cohen (2002) [1]
%
%
% Example call: logmmse_SPU('sp04_babble_sn10.wav','out_logSPU.wav',1);
%
% References:
% [1] Cohen, I. (2002). Optimal speech enhancement under signal presence
% uncertainty using log-spectral amplitude estimator. IEEE Signal Processing
% Letters, 9(4), 113-116.
% [2] Soon, I., Koh, S., and Yeo, C. (1999). Improved noise suppression
% filter using self-adaptive estimator of probability of speech absence.
% Signal Processing, 75, 151-159.
%
% Author: Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
%-------------------------------------------------------------------------
if nargin<3
fprintf('Usage: logmmse_SPU(infile.wav,outfile.wav,option) \n');
fprintf('where option = \n');
fprintf(' 1 - hard decision ( Soon et al)\n');
fprintf(' 2 - soft decision (Soon et al.)\n');
fprintf(' 3 - Malah et al.(1999) \n');
fprintf(' 4 - Cohen (2002) \n');
return;
end;
if option<1 | option>4 | rem(option,1)~=0
error('ERROR! option needs to be an integer between 1 and 4.\n\n');
end
[x, Srate, bits]= wavread( filename);
% =============== Initialize variables ===============
%
len=floor(20*Srate/1000); % Frame size in samples
if rem(len,2)==1, len=len+1; end;
PERC=50; % window overlap in percent of frame size
len1=floor(len*PERC/100);
len2=len-len1;
win=hamming(len); % define window
% Noise magnitude calculations - assuming that the first 6 frames are
% noise/silence
%
nFFT=len;
nFFT2=floor(len/2);
noise_mean=zeros(nFFT,1);
j=1;
for k=1:6
noise_mean=noise_mean+abs(fft(win.*x(j:j+len-1),nFFT));
j=j+len;
end
noise_mu=noise_mean/6;
noise_mu2=noise_mu.^2;
%--- allocate memory and initialize various variables
aa=0.98;
mu=0.98;
eta=0.15;
img=sqrt(-1);
x_old=zeros(len1,1);
Nframes=floor(length(x)/len2)-floor(len/len2);
xfinal=zeros(Nframes*len2,1);
if option==4 % Cohen's method
global zetak zeta_fr_old z_peak
len2a=len/2+1;
zetak=zeros(len2a,1);
zeta_fr_old=1000;
z_peak=0;
end;
%=============================== Start Processing =======================================================
%
qk=0.5*ones(len,1);
ksi_old=zeros(len,1);
ksi_min=10^(-25/10);
%qkr=(1-qk)/qk;
%qk2=1/(1-qk);
Gmin=10^(-20/10); % needed for Cohen's implementation
k=1;
for n=1:Nframes
insign=win.*x(k:k+len-1);
%--- Take fourier transform of frame
spec=fft(insign,nFFT);
sig=abs(spec); % compute the magnitude
sig2=sig.^2;
gammak=min(sig2./noise_mu2,40); % post SNR
if n==1
ksi=aa+(1-aa)*max(gammak-1,0);
else
ksi=aa*Xk_prev./noise_mu2 + (1-aa)*max(gammak-1,0);
% a priori SNR
ksi=max(ksi_min,ksi); % limit ksi to -25 dB
end
log_sigma_k= gammak.* ksi./ (1+ ksi)- log(1+ ksi);
vad_decision= sum( log_sigma_k)/ len;
if (vad_decision< eta)
% noise only frame found
noise_mu2= mu* noise_mu2+ (1- mu)* sig2;
end
% ===end of vad===
%ksi=qk2*ksi;
A=ksi./(1+ksi);
vk=A.*gammak;
ei_vk=0.5*expint(vk);
hw=A.*exp(ei_vk);
% --- estimate conditional speech-presence probability ---------------
%
[qk]=est_sap(qk,ksi,ksi_old,gammak,option); % estimate P(Ho)- a priori speech absence prob.
pSAP = (1-qk)./(1-qk+qk.*(1+ksi).*exp(-vk)); % P(H1 | Yk)
% ---- Cohen's 2002 ------
%
Gmin2=Gmin.^(1-pSAP); % Cohen's (2002) - Eq 8
Gcohen=(hw.^pSAP).*Gmin2;
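% OM-LSA style gain: the log-MMSE gain hw is raised to the speech-presence
% probability pSAP and floored by Gmin^(1-pSAP), i.e.
%   Gcohen = hw.^pSAP .* Gmin.^(1-pSAP).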
sig = sig.*Gcohen;
%----------------------------
Xk_prev=sig.^2;
ksi_old=ksi; % needed for Cohen's method for estimating q
xi_w= ifft( sig .* exp(img*angle(spec)));
xi_w= real( xi_w);
% --------- Overlap and add ---------------
%
xfinal(k:k+ len2-1)= x_old+ xi_w(1:len1);
x_old= xi_w(len1+ 1: len);
k=k+len2;
end
%========================================================================================
wavwrite(xfinal,Srate,16,outfile);
%--------------------------- E N D -----------------------------------------
function [qk]=est_sap(qk,xsi,xsi_old,gammak,type)
% function returns a priori probability of speech absence, P(Ho)
%
global zetak zeta_fr_old z_peak
if type ==1 % hard-decision: Soon et al.
beta=0.1;
dk=ones(length(xsi),1);
i0=besseli(0,2*(gammak.*xsi).^0.5);
temp=exp(-xsi).*i0;
indx=find(temp>1);
dk(indx)=0;
qk=beta*dk + (1-beta)*qk;
elseif type==2 % soft-decision: Soon et al.
beta=0.1;
i0=besseli(0,2*(gammak.*xsi).^0.5);
temp=exp(-xsi).*i0;
P_Ho=1./(1+temp);
P_Ho=min(1,P_Ho);
qk=beta*P_Ho + (1-beta)*qk;
elseif type==3 % Malah et al. (1999)
if mean(gammak(1:floor(length(gammak)/2)))> 2.4 % VAD detector
beta=0.95;
gamma_th=0.8;
dk=ones(length(xsi),1);
indx=find(gammak>gamma_th);
dk(indx)=0;
qk=beta*qk+(1-beta)*dk;
end
elseif type==4 % Cohen (2002)
beta=0.7;
len=length(qk);
len2=len/2+1;
zetak=beta*zetak+(1-beta)*xsi_old(1:len2);
z_min=0.1; z_max=0.3162;
C=log10(z_max/z_min);
zp_min=1; zp_max=10;
zeta_local=smoothing(zetak,1);
zeta_global=smoothing(zetak,15);
Plocal=zeros(len2,1); % estimate P_local
imax=find(zeta_local>z_max);
Plocal(imax)=1;
ibet=find(zeta_local>z_min & zeta_local<z_max);
Plocal(ibet)=log10(zeta_local(ibet)/z_min)/C;
Pglob=zeros(len2,1); % estimate P_global
imax=find(zeta_global>z_max);
Pglob(imax)=1;
ibet=find(zeta_global>z_min & zeta_global<z_max);
Pglob(ibet)=log10(zeta_global(ibet)/z_min)/C;
zeta_fr=mean(zetak); % estimate Pframe
if zeta_fr>z_min
if zeta_fr>zeta_fr_old
Pframe=1;
z_peak=min(max(zeta_fr,zp_min),zp_max);
else
if zeta_fr <=z_peak*z_min, Pframe=0;
elseif zeta_fr>= z_peak*z_max, Pframe=1;
else, Pframe=log10(zeta_fr/z_peak/z_min)/C;
end
end
else
Pframe=0;
end
zeta_fr_old=zeta_fr;
qk2 = 1- Plocal.*Pglob*Pframe; % estimate prob of speech absence
qk2= min(0.95,qk2);
qk = [qk2; flipud(qk2(2:len2-1))];
end
%----------------------------------------------
function y=smoothing (x,N);
len=length(x);
win=hanning(2*N+1);
win1=win(1:N+1);
win2=win(N+2:2*N+1);
y1=filter(flipud(win1),[1],x);
x2=zeros(len,1);
x2(1:len-N)=x(N+1:len);
y2=filter(flipud(win2),[1],x2);
y=(y1+y2)/norm(win,2);

View File

@ -0,0 +1,96 @@
% Author: Patrick J. Wolfe
% Signal Processing Group
% Cambridge University Engineering Department
% p.wolfe@ieee.org
% Johnston perceptual model initialisation
function M= mask( Sx, dft_length, Fs, nbits)
frame_overlap= dft_length/ 2;
freq_val = (0:Fs/dft_length:Fs/2)';
half_lsb = (1/(2^nbits-1))^2/dft_length;
freq= freq_val;
thresh= half_lsb;
crit_band_ends = [0;100;200;300;400;510;630;770;920;1080;1270;...
1480;1720;2000;2320;2700;3150;3700;4400;5300;6400;7700;...
9500;12000;15500;Inf];
% Maximum Bark frequency
%
imax = max(find(crit_band_ends < freq(end)));
% Normalised (to 0 dB) threshold of hearing values (Fletcher, 1929)
% as used by Johnston. First and last thresholds are corresponding
% critical band endpoint values, elsewhere means of interpolated
% critical band endpoint threshold values are used.
%
abs_thr = 10.^([38;31;22;18.5;15.5;13;11;9.5;8.75;7.25;4.75;2.75;...
1.5;0.5;0;0;0;0;2;7;12;15.5;18;24;29]./10);
ABSOLUTE_THRESH = thresh.*abs_thr(1:imax);
% Calculation of tone-masking-noise offset ratio in dB
%
OFFSET_RATIO_DB = 9+ (1:imax)';
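% (the 9+i offset combined with the +5.5 added below reproduces Johnston's
%  offset alpha*(14.5+i) + (1-alpha)*5.5 for critical band i)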
% Initialisation of matrices for bark/linear frequency conversion
% (loop increments i to the proper critical band)
%
num_bins = length(freq);
LIN_TO_BARK = zeros(imax,num_bins);
i = 1;
for j = 1:num_bins
while ~((freq(j) >= crit_band_ends(i)) & ...
(freq(j) < crit_band_ends(i+1))),
i = i+1;
end
LIN_TO_BARK(i,j) = 1;
end
% Calculation of spreading function (Schroeder et al., 82)
spreading_fcn = zeros(imax);
summ = 0.474:imax;
spread = 10.^((15.81+7.5.*summ-17.5.*sqrt(1+summ.^2))./10);
for i = 1:imax
for j = 1:imax
spreading_fcn(i,j) = spread(abs(j-i)+1);
end
end
% Calculation of excitation pattern function
EX_PAT = spreading_fcn* LIN_TO_BARK;
% Calculation of DC gain due to spreading function
DC_GAIN = spreading_fcn* ones(imax,1);
%Sx = X.* conj(X);
C = EX_PAT* Sx;
% Calculation of spectral flatness measure SFM_dB
%
[num_bins num_frames] = size(Sx);
k = 1/num_bins;
SFM_dB = 10.*log10((prod(Sx).^k)./(k.*sum(Sx))+ eps);
% Calculation of tonality coefficient and masked threshold offset
%
alpha = min(1,SFM_dB./-60);
O_dB = OFFSET_RATIO_DB(:,ones(1,num_frames)).*...
alpha(ones(length(OFFSET_RATIO_DB),1),:) + 5.5;
% Threshold calculation and renormalisation, accounting for absolute
% thresholds
T = C./10.^(O_dB./10);
T = T./DC_GAIN(:,ones(1,num_frames));
T = max( T, ABSOLUTE_THRESH(:, ones(1, num_frames)));
% Reconversion to linear frequency scale
%M = 1.* sqrt((LIN_TO_BARK')*T);
M= LIN_TO_BARK'* T;

View File

@ -0,0 +1,150 @@
function mmse(filename,outfile,SPU)
%
% Implements the MMSE algorithm [1].
%
% Usage: mmse(noisyFile, outputFile, SPU)
%
% infile - noisy speech file in .wav format
% outputFile - enhanced output file in .wav format
% SPU - if 1, includes speech-presence uncertainty
% if 0, doesn't include speech-presence uncertainty
%
%
% Example call: mmse('sp04_babble_sn10.wav','out_mmse.wav',1);
%
% References:
% [1] Ephraim, Y. and Malah, D. (1984). Speech enhancement using a minimum
% mean-square error short-time spectral amplitude estimator. IEEE Trans. Acoust.,
% Speech, Signal Process., ASSP-32(6), 1109-1121.
%
% Authors: Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
%-------------------------------------------------------------------------
if nargin<3
fprintf('Usage: mmse(infile.wav,outfile.wav,SPU) \n');
fprintf('where SPU=1 - includes speech presence uncertainty\n');
fprintf(' SPU=0 - does not include speech presence uncertainty\n\n');
return;
end;
if SPU~=1 & SPU~=0
error('ERROR: SPU needs to be either 1 or 0.');
end
[x, Srate]= audioread( filename);
% =============== Initialize variables ===============
len=floor(20*Srate/1000); % Frame size in samples
if rem(len,2)==1, len=len+1; end;
PERC=50; % window overlap in percent of frame size
len1=floor(len*PERC/100);
len2=len-len1;
win=hamming(len); %tukey(len,PERC); % define window
% Noise magnitude calculations - assuming that the first 6 frames are noise/silence
%
nFFT=2*len;
j=1;
noise_mean=zeros(nFFT,1);
for k=1:6
noise_mean=noise_mean+abs(fft(win.*x(j:j+len-1),nFFT));
j=j+len;
end
noise_mu=noise_mean/6;
noise_mu2=noise_mu.^2;
%--- allocate memory and initialize various variables
k=1;
img=sqrt(-1);
x_old=zeros(len1,1);
Nframes=floor(length(x)/len2)-1;
xfinal=zeros(Nframes*len2,1);
% --------------- Initialize parameters ------------
%
k=1;
aa=0.98;
eta= 0.15;
mu=0.98;
c=sqrt(pi)/2;
qk=0.3;
qkr=(1-qk)/qk;
ksi_min=10^(-25/10);
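% aa      - decision-directed smoothing factor for the a priori SNR
% mu      - smoothing factor for the noise spectrum update
% eta     - likelihood-ratio VAD threshold
% qk      - a priori probability of speech absence (used when SPU=1)
% ksi_min - floor on the a priori SNR (-25 dB)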
%=============================== Start Processing =======================================================
%
for n=1:Nframes
insign=win.*x(k:k+len-1);
%--- Take fourier transform of frame
%
spec=fft(insign,nFFT);
sig=abs(spec); % compute the magnitude
sig2=sig.^2;
gammak=min(sig2./noise_mu2,40); % posteriori SNR
if n==1
ksi=aa+(1-aa)*max(gammak-1,0);
else
ksi=aa*Xk_prev./noise_mu2 + (1-aa)*max(gammak-1,0);
% decision-direct estimate of a priori SNR
ksi=max(ksi_min,ksi); % limit ksi to -25 dB
end
log_sigma_k= gammak.* ksi./ (1+ ksi)- log(1+ ksi);
vad_decision= sum( log_sigma_k)/ len;
if (vad_decision< eta) % noise only frame found
noise_mu2= mu* noise_mu2+ (1- mu)* sig2;
end
% ===end of vad===
vk=ksi.*gammak./(1+ksi);
[j0,err]=besseli(0,vk/2);
[j1,err2]=besseli(1,vk/2);
if any(err) | any(err2)
fprintf('ERROR! Overflow in Bessel calculation in frame: %d \n',n);
else
C=exp(-0.5*vk);
A=((c*(vk.^0.5)).*C)./gammak;
B=(1+vk).*j0+vk.*j1;
hw=A.*B;
end
% --- estimate speech presence probability
%
if SPU==1
evk=exp(vk);
Lambda=qkr*evk./(1+ksi);
pSAP=Lambda./(1+Lambda);
sig=sig.*hw.*pSAP;
else
sig=sig.*hw;
end
Xk_prev=sig.^2; % save for estimation of a priori SNR in next frame
xi_w= ifft( sig .* exp(img*angle(spec)),nFFT);
xi_w= real( xi_w);
xfinal(k:k+ len2-1)= x_old+ xi_w(1:len1);
x_old= xi_w(len1+ 1: len);
k=k+len2;
end
%========================================================================================
audiowrite(outfile,xfinal,Srate,'BitsPerSample',16);

View File

@ -0,0 +1,696 @@
function outfile= mt_mask( noisy_file, outfile)
%
% Implements a psychoacoustically motivated algorithm [1].
%
% Usage: mt_mask(noisyFile, outputFile)
%
% infile - noisy speech file in .wav format
% outputFile - enhanced output file in .wav format
%
%
% Example call: mt_mask('sp04_babble_sn10.wav','out_mask.wav');
%
% References:
% [1] Hu, Y. and Loizou, P. (2004). Incorporating a psychoacoustical model in
% frequency domain speech enhancement. IEEE Signal Processing Letters, 11(2),
% 270-273.
%
% Authors: Yi Hu and Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
%-------------------------------------------------------------------------
if nargin<2
fprintf('Usage: mt_mask(noisyfile.wav,outFile.wav) \n\n');
return;
end
% Initialize wavelet parameters (see also wiener_wt.m)
wavname='db4';
thre_type='ds';thre_func_type='s';q_0=5;
taper_num=16;
%------------------get the noisy speech data
[noisy_speech, Srate]= audioread( noisy_file);
%===========initiate the parameters=======================
frame_dur= 20; %unit is milli-second
len= floor( Srate* frame_dur/ 1000);
if rem( len, 2)~= 0
len= len+ 1;
end
NFFT= len; %number of FFT points
tapers= sine_taper( taper_num, NFFT);
diga= digamma( taper_num)- log( taper_num);
win= hamming( len);
% win= win/ norm( win);
PERC= 50; % window overlap in percent of frame size
len1=floor(len* PERC/ 100);
len2= len- len1;
L120= floor( 120* Srate/ 1000);
bfl=0.002; % spectral floor
k= 1; %k is starting point of each frame
%================================================
q= ceil( log2( len));
M= 2^ q;
sigma_eta_square= trigamma( taper_num);
N_autoc= sigma_eta_square* ( 1- ( 0: taper_num+ 1)/ ( taper_num+ 1));
N_autoc( M/ 2+ 1)= 0;
Sigma_N_firstrow= [N_autoc( 1: M/ 2+ 1), fliplr( N_autoc( 2: M/ 2))];
noise_stat= real( fft( Sigma_N_firstrow));
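% noise_stat: DFT of the first row of Sigma_N, the noise statistics used for
% wavelet thresholding of the log-spectra (cf. Eq. (8) in Walden's paper and
% the parameter notes in thre_wavelet below)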
[wfilter( 1, :), wfilter( 2, :), wfilter( 3, :), wfilter( 4, :)]= ...
wfilters( wavname);
%------get the wavelet/scaling filter for decomposition/reconstruction
noise= noisy_speech( 1: L120);
noise_ps= psd_mt_sine( noise, tapers);
log_noise_ps= log( noise_ps)- diga;
den_log_noise_ps= thre_wavelet( log_noise_ps, noise_stat, thre_type, ...
thre_func_type, wfilter, q_0);
den_log_noise_ps= [den_log_noise_ps( 1: len/ 2+ 1); ...
flipud( den_log_noise_ps( 2: len/ 2))];
noise_ps= exp( den_log_noise_ps);
%=================
mu_vad= 0.98; % smoothing factor in noise spectrum update
aa= 0.98; % smoothing factor in priori update
eta= 0.15; % VAD threshold
%=================
Nframes= floor( length( noisy_speech)/ len2)- 1;
x_old= zeros( len1, 1);
xfinal= zeros( Nframes* len2, 1);
%=============================== Start Processing ==========
for n= 1: Nframes
insign= noisy_speech( k: k+ len- 1);
insign_spec= fft( insign.* win, NFFT);
%========estimate the noisy speech power spectrum
ns_ps= psd_mt_sine( insign, tapers);
log_ns_ps= log( ns_ps)- diga;
den_log_ns_ps= thre_wavelet( log_ns_ps, noise_stat, thre_type, ...
thre_func_type, wfilter, q_0);
den_log_ns_ps= [den_log_ns_ps( 1: NFFT/ 2+ 1); ...
flipud( den_log_ns_ps( 2: NFFT/ 2))];
ns_ps= exp( den_log_ns_ps);
%=================================================
gammak= abs( insign_spec).^ 2/ (norm( win)^2)./ noise_ps;
if n==1
ksi=aa+(1-aa)*max(gammak-1,0);
else
ksi=aa*Xk_prev./noise_ps + (1-aa)*max(gammak-1,0);
% decision-direct estimate of a priori SNR
end
log_sigma_k= gammak.* ksi./ (1+ ksi)- log(1+ ksi);
vad_decision(n)= sum( log_sigma_k)/ len;
if (vad_decision(n)< eta)
% noise only frame found
noise_ps= mu_vad* noise_ps+ (1- mu_vad)* ns_ps;
vad( k: k+ len- 1)= 0;
else
vad( k: k+ len- 1)= 1;
end
% ===end of vad===
%========estimate the clean speech power spectrum
cl_ps= ns_ps- noise_ps;
cl_ps= max( cl_ps, bfl* ns_ps);
%--providing a spectral floor
%========
%compute the masking threshold
mask_thre= mask( cl_ps( 1: NFFT/ 2+ 1), NFFT, Srate, 16);
mask_thre= [mask_thre; flipud( mask_thre( 2: NFFT/ 2))];
%expand it to NFFT length
noise_mask_ratio= noise_ps./ mask_thre;
%=======two methods to compute g_wi
% get the mu_k by u= max( sqrt( Sn/ alpha- 1), 0) * Sx/ Sn
%aprioSNR= cl_ps./ noise_ps;
%mu( :, n)= max( sqrt( noise_mask_ratio)-1, 0).* aprioSNR;
%g_wi= aprioSNR./ ( aprioSNR+ mu_n);
tmp= max( sqrt( noise_mask_ratio)-1, 0);
g_wi= 1./ (1+ tmp);
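% g_wi equals 1 wherever the noise already lies below the masking threshold
% (noise_mask_ratio <= 1), so only audible noise is attenuated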
xi_freq= g_wi.* insign_spec;
Xk_prev= abs( xi_freq).^ 2;
xi_w= ifft( xi_freq);
xi_w= real( xi_w);
xfinal( k: k+ len2- 1)= x_old+ xi_w( 1: len1);
x_old= xi_w( len1+ 1: len);
k= k+ len2;
end
%========================================================================================
audiowrite( outfile, xfinal, Srate, 'BitsPerSample', 16);
%========================================================================================
function after_thre= thre_wavelet( before_thre, noise_stat, ...
thre_type, thre_func_type, wfilter, q_0)
%this function implements the wavelet thresholding technique
% refer to the paper by Walden/1998, Donoho/1995, Johnstone/1997
%note on the parameters
% before_thre: data before thresholding
% noise_stat: the power spectrum of the noise (i.e., noise statistics),
% DFT of the first row of Sigma_N, refer to Eq. (8) in Walden's paper
% thre_type: threshold type, scale-dependent Universal ('d'),
% scale-independent Universal ('i'), scale-dependent SURE ('ds'),
% scale-independent SURE ('is'), or scale-dependent Generalized
% Cross-Validation ('dg')
% thre_func_type: threshold function type: soft ('s') or hard ('h');
% wfilter: wavelet low pass and high pass decomposition/reconstruction filters [lo_d, hi_d, lo_r, hi_r]
% the 1st row is lo_d, the 2nd row is hi_d, the 3rd row is lo_r, and the 4th row is hi_r
% q_0 is the decomposition level
% after_thre: data after thresholding
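% Example call (a sketch, matching the settings used in mt_mask above):
%   den_log_ns_ps = thre_wavelet(log_ns_ps, noise_stat, 'ds', 's', wfilter, 5);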
s= size( before_thre);
before_thre= before_thre( :)'; %make it a row vector
noise_stat= noise_stat( :)';
N= length( before_thre); %length of before-thresholded data
q= ceil( log2( N));
M= 2^ q;
%==get the low pass and high pass decomposition/reconstruction filters from wfilter
lo_d= wfilter( 1, :); %low pass decomposition filter/ scaling filter
hi_d= wfilter( 2, :); %high pass decomposition filter/ wavelet filter
lo_r= wfilter( 3, :); %low pass reconstruction filter/ scaling filter
hi_r= wfilter( 4, :); %high pass reconstruction filter/ wavelet filter
%==refer to pp. 3155 in Walden's paper
H= zeros( q_0, M);
H( 1, :)= fft( hi_d, M); %frequency response of wavelet filter
G( 1, :)= fft( lo_d, M); %frequency response of scaling filter
for i= 2: q_0- 1
G( i, :)= G( 1, rem( (2^ (i- 1) )* (0: M- 1), M)+ 1);
end
for j= 2: q_0
H( j, :)= prod( [G( 1: j- 1, :); H( 1, rem( (2^ (j- 1) )* (0: M- 1), M)+ 1)], 1);
end
[y_coeff, len_info]= wavedec( before_thre, q_0, lo_d, hi_d);
% --decompose before_thre into q_0 levels using wavelet filter hi_d and scaling filter lo_d
% --where y_coeff contains the coefficients and len_info contains the length information
% --different segments of y_coeff correspond to approximation and detail coefficients;
% -- length of len_info should be q_0+ 2
%===============processing according to 'thre_type'
%-------with 'd'--scale-dependent thresholding, threshold has to be computed for each level
%-------with 'i'--scale-independent thresholding, threshold is set to a fixed level
if thre_type== 'i' %scale-independent universal thresholding
sigma_square= mean( noise_stat);
thre= sqrt( sigma_square* 2* log( M)) ; %mean( noise_stat) is sigma_eta_square in Eq. (6)
y_coeff( len_info( 1)+ 1: end)= ...
wthresh( y_coeff( len_info( 1)+ 1: end), thre_func_type, thre);
elseif thre_type== 'd' %scale-dependent universal thresholding
%------first we need to compute the energy level of each scale from j= 1: q_0
for i= 1: q_0 %refer to Eq. (9) in Walden's paper
sigma_j_square( i)= mean( noise_stat.* (abs( H( i, :)).^ 2), 2); %average along the row
end
for i= 2: q_0+ 1 %thresholding for each scale
sp= sum( len_info( 1: i- 1), 2)+ 1; %starting point
ep= sp+ len_info( i)- 1;
thre= sqrt( sigma_j_square( q_0- i+ 2)* 2* log( len_info( i)));
y_coeff( sp: ep)= wthresh( y_coeff( sp: ep), thre_func_type, thre);
end
elseif thre_type== 'ds' %scale-dependent SURE thresholding
%=======use Eq. (9) in Walden's paper to get sigma_j, MDA estimate seems to be better
% for i= 1: q_0
% sigma_j_square( i)= mean( noise_stat.* (abs( H( i, :)).^ 2), 2); %average along the row
% sigma_j( i)= sqrt( sigma_j_square( i));
% end
%======MDA estimate of sigma_j
sigma_j= wnoisest( y_coeff, len_info, 1: q_0);
for i= 2: q_0+ 1 %thresholding for each scale
sp= sum( len_info( 1: i- 1), 2)+ 1; %starting point
ep= sp+ len_info( i)- 1; %ending point
if sigma_j( q_0- i+ 2)< sqrt( eps)* max( y_coeff( sp: ep));
thre= 0;
else
thre= sigma_j( q_0- i+ 2)* thselect( y_coeff( sp: ep)/ ...
sigma_j( q_0- i+ 2), 'heursure');
end
%fprintf( 1, 'sigma_j is %6.2f, thre is %6.2f\n', sigma_j, thre);
y_coeff( sp: ep)= wthresh( y_coeff( sp: ep), thre_func_type, thre);
end
elseif thre_type== 'dn' %new risk function defined in Xiao-ping Zhang's paper
sigma_j= wnoisest( y_coeff, len_info, 1: q_0);
sigma_j_square= sigma_j.^ 2;
for i= 2: q_0+ 1 %thresholding for each scale
sp= sum( len_info( 1: i- 1), 2)+ 1; %starting point
ep= sp+ len_info( i)- 1; %ending point
if sigma_j( q_0- i+ 2)< sqrt( eps)* max( y_coeff( sp: ep));
thre= 0;
else
%based on some evidence, the following scheme lets thre vary with SNR:
% a very low SNR indicates a low probability of signal presence,
% hence the universal threshold is used,
% while a very high SNR indicates a high probability of signal presence,
% hence the SURE threshold is used
thre_max= sigma_j( q_0- i+ 2)* sqrt( 2* log( len_info( i))); %thre with SNRlog< -5dB
thre_min= sigma_j( q_0- i+ 2)* fminbnd( @riskfunc, 0, sqrt(2* log( ep- sp+ 1)), ...
optimset( 'MaxFunEvals',1000,'MaxIter',1000), ...
y_coeff( sp: ep)/ sigma_j( q_0- i+ 2), 3); %thre with SNRlog> 20dB
slope= (thre_max- thre_min)/ 25;
thre_0= thre_min+ 20* slope;
SNRlog= 10* log10( mean( max( y_coeff( sp: ep).^ 2/ sigma_j_square( q_0- i+ 2)- 1, 0)));
if SNRlog>= 20
thre= thre_min; %actually this corresponds to SURE threshold
elseif ( SNRlog< 20) & ( SNRlog>= -5)
thre= thre_0- SNRlog* slope;
else
thre= thre_max; %this corresponds to oversmooth threshold
end
%the scheme below is similar to the option 'heursure' in the function 'thselect'
% univ_thr = sqrt(2* log( len_info( i))); %universal thresholding
% eta = (norm( y_coeff( sp: ep)/ sigma_j( q_0- i+ 2)).^2)/ ( len_info( i))- 1;
% crit = (log2( len_info( i)))^(1.5)/ sqrt( len_info( i));
% if 1%eta > crit %high probability that speech exists
% thre= sigma_j( q_0- i+ 2)* fminbnd( @riskfunc, 0, sqrt(2* log( ep- sp+ 1)), ...
% optimset( 'MaxFunEvals',1000,'MaxIter',1000), ...
% y_coeff( sp: ep)/ sigma_j( q_0- i+ 2), 3);
% else
% thre = sigma_j( q_0- i+ 2)* univ_thr;
% end
end
y_coeff( sp: ep)= wthresh( y_coeff( sp: ep), thre_func_type, thre);
end
elseif thre_type== 'dg' %scale-dependent Generalized Cross Validation thresholding
for i= 2: q_0+ 1 %thresholding for each scale
sp= sum( len_info( 1: i- 1), 2)+ 1; %starting point
ep= sp+ len_info( i)- 1; %ending point
[y_coeff( sp: ep), thre]= mingcv( y_coeff( sp: ep), thre_func_type);
end
else
error( 'wrong thresholding type');
end
%--reconstruct the thresholded coefficients
after_thre= waverec( y_coeff, len_info, lo_r, hi_r);
if s(1)>1
after_thre= after_thre';
end
%fprintf( 1, 'thre is %f\n', thre);
function mt_psd= psd_mt_sine( data, sine_tapers)
% this function uses sine tapers to get multitaper power spectrum estimation
% 'x' is the incoming data, 'sine_tapers' is a matrix with each column being
% sine taper, sine_tapers can be obtained using the function sine_taper
[frame_len, taper_num]= size( sine_tapers);
eigen_spectra= zeros( frame_len, taper_num);
data= data( :);
data_len= length( data);
data_hankel= hankel( data( 1: frame_len), data( frame_len: data_len));
x_mt_psd= zeros( frame_len, data_len- frame_len+ 1);
for pp= 1: data_len- frame_len+ 1
for index= 1: taper_num
x_taperd= sine_tapers( :, index).* data_hankel( :, pp);
x_taperd_spec= fft( x_taperd);
eigen_spectra( :, index)= abs( x_taperd_spec).^ 2;
end
x_mt_psd(:, pp)= mean( eigen_spectra, 2);
end
mt_psd= mean( x_mt_psd, 2);
function tapers= sine_taper( L, N)
% this function is used to generate the sine tapers proposed by Riedel et
% al in IEEE Transactions on Signal Processing, pp. 188- 195, Jan. 1995
% there are two parameters, 'L' is the number of the sine tapers generated,
% and 'N' is the length of each sine taper; the returned value 'tapers' is
% a N-by-L matrix with each column being sine taper
tapers= zeros( N, L);
for index= 1: L
tapers( :, index)= sqrt( 2/ (N+ 1))* sin (pi* index* (1: N)'/ (N+ 1));
end
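% Example (a sketch): multitaper PSD of the first 120 ms of the noisy signal
% with 16 sine tapers, as set up at the top of mt_mask:
%   tapers   = sine_taper(16, NFFT);
%   noise_ps = psd_mt_sine(noisy_speech(1:L120), tapers);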
function y = trigamma(z,method,debug)
% y = trigamma(z) ... Trigamma-Function for real positive z
%
% trigamma(z) = (d/dz)^2 log(gamma(z)) = d/dz digamma(z)
%
% if 'z' is a matrix, then the digamma-function is evaluated for
% each element. Results are inaccurate for real arguments < 10 which are
% neither integers nor half-integers.
%
% y = trigamma(z,method)
%
% possible values for optional argument 'method':
% method = 1 : quick asymptotic series expansion (approximate)
% method = 2 : finite recursion for integer values (exact)
% method = 3 : finite recursion for half-integer values (exact)
% method = 4 (default) : automatic selection of 1,2 or 3 for individual
% elements in z whichever is appropriate.
%
% see also: digamma, gamma, gammaln, gammainc, specfun
% reference: Abramowitz & Stegun, "Handbook of Mathematical Functions"
% Chapter "Gamma Function and Related Functions" :
% implemented by: Christoph Mecklenbraeuker
% (email: cfm@sth.ruhr-uni-bochum.de), July 4, 1995.
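% Quick checks: trigamma(1) = pi^2/6 ~ 1.6449 and trigamma(0.5) = pi^2/2 ~ 4.9348,
% matching the first entries of the integer/half-integer lookup tables below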
dim = size(z); % save original matrix dimension
z = reshape(z,dim(1)*dim(2),1); % make a column vector
I1 = ones(length(z),1); % auxiliary vector of ones
if(nargin==1)
method=4; debug=0;
elseif(nargin==2)
debug=0;
end;
if(debug == 1) % if debug==1: track recursion
[m,n] =size(z);
fprintf(1,'trigamma: method = %d, size(z)=[%d %d],\t min(z)=%f, max(z)=%f\n',...
method,m,n,min(min(z)),max(max(z)));
end;
if(method==1) % use 9th order asymptotic expansion
if(any(z<1))
fprintf(1,'Warning: some elements in argument of "trigamma(z,1)" are < 1\n');
fprintf(1,'minimal argument = %g: trigamma-result is inaccurate!\n',min(min(z)));
end
% calculate powers of 1/z :
w1 = 1./z; w2 = w1.*w1; w3 = w1.*w2; w5 = w2.*w3; w7 = w2.*w5; w9 = w2.*w7;
% generate coefficients of expansion: matrix with constant columns
a = [ I1 I1/2 I1/6 -I1/30 I1/42 -I1/30];
% make vector of powers of 1/z:
w = [ w1 w2 w3 w5 w7 w9];
% calculate expansion by summing the ROWS of (a .* w) :
y = sum((a.*w).').';
elseif(method==2)
zmax = max(max(floor(z)));
ytab = zeros(zmax,1);
ytab(1) = pi^2/6; % = psi'(1)
for n=1:zmax-1;
ytab(n+1) = ytab(n) - 1/n^2; % generate lookup table
end;
y = ytab(z);
elseif(method==3)
zmax = max(max(floor(z)));
ytab = zeros(zmax+1,1);
ytab(1) = pi^2/2; % = psi'(1/2)
for n=1:zmax;
ytab(n+1) = ytab(n) - 4/(2*n-1)^2; % generate lookup table
end;
y = ytab(z+0.5);
elseif(method==4) % decide here which method to use
Less0 = find(z<0); % negative arguments evaluated by reflexion formula
Less1 = find(z>0 & z<1); % values between 0 and 1.
fraction = rem(z,1); % fractional part of arguments
f2 = rem(2*fraction,1);
Integers = find(fraction==0 & z>0); % Index set of positive integer arguments
NegInts = find(fraction==0 & z<=0); % Index set of non-positive integer arguments
HalfInts = find(abs(fraction-0.5)<1e-7 & z>0); % Index set of positive half-integers
Reals = find(f2>1e-7 & z>1); % Index set of all other arguments > 1
if(~isempty(Reals)) y(Reals) = trigamma(z(Reals),1,debug); end;
if(~isempty(Less1)) y(Less1) = trigamma(z(Less1)+2,1,debug) + ...
1./z(Less1).^2+1./(z(Less1)+1).^2;end;
% reflexion formula:
if(~isempty(Less0)) y(Less0)= -trigamma(1-z(Less0),1,debug)+(pi./sin(pi*z(Less0))).^2; end;
% integers:
if(~isempty(Integers)) y(Integers) = trigamma(z(Integers),2,debug); end;
% half-integers:
if(~isempty(HalfInts)) y(HalfInts) = trigamma(z(HalfInts),3,debug); end;
% negative integers:
if(~isempty(NegInts)) y(NegInts) = Inf * NegInts; end;
end
y = reshape(y,dim(1),dim(2));
return;
function psi = digamma(z,method,debug)
%
% psi = digamma(z) ... Digamma-Function for real argument z.
%
% digamma(z) = d/dz log(gamma(z)) = gamma'(z)/gamma(z)
%
% if 'z' is a matrix, then the digamma-function is evaluated for
% each element. Results may be inaccurate for real arguments < 10
% which are neither integers nor half-integers.
%
% psi = digamma(z,method)
%
% possible values for optional argument 'method':
% method = 1 : quick asymptotic series expansion (approximate)
% method = 2 : finite recursion for integer values (exact)
% method = 3 : finite recursion for half-integer values (exact)
% method = 4 (default) : automatic selection of 1,2 or 3 for individual
% elements in z whichever is appropriate.
%
% see also: trigamma, gamma, gammaln, gammainc, specfun
% reference: Abramowitz & Stegun, "Handbook of Mathematical Functions"
% Chapter "Gamma Function and Related Functions" :
% implemented by: Christoph Mecklenbraeuker
% (email: cfm@sth.ruhr-uni-bochum.de), July 1, 1995.
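% Quick checks: digamma(1) = -0.57721... (minus the Euler-Mascheroni constant)
% and digamma(0.5) = -0.57721... - 2*log(2), matching the lookup tables below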
dim = size(z); % save original matrix dimension
z = reshape(z,dim(1)*dim(2),1); % make a column vector
I1 = ones(length(z),1); % auxiliary vector of ones
if(nargin==1)
method=4; debug=0;
elseif(nargin==2)
debug=0;
end;
if(debug == 1) % if debug==1: track recursion
[m,n] = size(z);
fprintf(1,'digamma: method = %d, size(z)=[%d %d],\t min(z)=%f, max(z)=%f\n',...
method,m,n,min(min(z)),max(max(z)));
end;
if(method==1) % use 8th order asymptotic expansion
if(any(z<1))
fprintf(1,'Warning: some elements in argument of "digamma(z,1)" are < 1\n');
fprintf(1,'minimal argument = %g: digamma-result is inaccurate!\n',min(min(z)));
end
% calculate powers of 1/z :
w1 = 1./z; w2 = w1.*w1; w4 = w2.*w2; w6 = w2.*w4; w8 = w4.*w4;
% generate coefficients of expansion: matrix with constant columns
a = [ -I1/2 -I1/12 I1/120 -I1/252 I1/240 ];
% make vector of powers of 1/z:
w = [ w1 w2 w4 w6 w8 ];
% calculate expansion by summing the ROWS of (a .* w) :
psi = log(z) + sum((a.*w).').';
elseif(method==2)
zmax = max(max(floor(z)));
psitab = zeros(zmax,1);
psitab(1) = -0.5772156649015328606;
for n=1:zmax-1;
psitab(n+1) = psitab(n) + 1/n; % generate lookup table
end;
psi = psitab(z);
elseif(method==3)
zmax = max(max(floor(z)));
psitab = zeros(zmax+1,1);
psitab(1) = -0.5772156649015328606 - 2*log(2); % = psi(1/2)
for n=1:zmax;
psitab(n+1) = psitab(n) + 2/(2*n-1); % generate lookup table
end;
psi = psitab(z+0.5);
elseif(method==4) % decide here which method to use
Less0 = find(z<0); % negative arguments evaluated by reflexion formula
Less1 = find(z>0 & z<1); % values between 0 and 1.
fraction = rem(z,1); % fractional part of arguments
f2 = rem(2*fraction,1);
Integers = find(fraction==0 & z>0); % Index set of positive integer arguments
NegInts = find(fraction==0 & z<=0); % Index set of non-positive integer arguments
HalfInts = find(abs(fraction-0.5)<1e-7 & z>0); % Index set of positive half-integers
Reals = find(f2>1e-7 & z>1); % Index set of all other arguments > 1
if(~isempty(Reals)) psi(Reals) = digamma(z(Reals),1,debug); end;
if(~isempty(Less1)) psi(Less1) = digamma(z(Less1)+2,1,debug) - ...
1./z(Less1)-1./(z(Less1)+1);end;
% reflexion formula:
if(~isempty(Less0)) psi(Less0) = digamma(1-z(Less0),1,debug) - pi./tan(pi*z(Less0)); end;
if(~isempty(Integers)) psi(Integers) = digamma(z(Integers),2,debug); end;
if(~isempty(HalfInts)) psi(HalfInts) = digamma(z(HalfInts),3,debug); end;
if(~isempty(NegInts)) psi(NegInts) = Inf * NegInts; end;
end
psi = reshape(psi,dim(1),dim(2));
return;
% Author: Patrick J. Wolfe
% Signal Processing Group
% Cambridge University Engineering Department
% p.wolfe@ieee.org
% Johnston perceptual model initialisation
function M= mask( Sx, dft_length, Fs, nbits)
frame_overlap= dft_length/ 2;
freq_val = (0:Fs/dft_length:Fs/2)';
half_lsb = (1/(2^nbits-1))^2/dft_length;
freq= freq_val;
thresh= half_lsb;
crit_band_ends = [0;100;200;300;400;510;630;770;920;1080;1270;...
1480;1720;2000;2320;2700;3150;3700;4400;5300;6400;7700;...
9500;12000;15500;Inf];
% Maximum Bark frequency
%
imax = max(find(crit_band_ends < freq(end)));
% Normalised (to 0 dB) threshold of hearing values (Fletcher, 1929)
% as used by Johnston. First and last thresholds are corresponding
% critical band endpoint values, elsewhere means of interpolated
% critical band endpoint threshold values are used.
%
abs_thr = 10.^([38;31;22;18.5;15.5;13;11;9.5;8.75;7.25;4.75;2.75;...
1.5;0.5;0;0;0;0;2;7;12;15.5;18;24;29]./10);
ABSOLUTE_THRESH = thresh.*abs_thr(1:imax);
% Calculation of tone-masking-noise offset ratio in dB
%
OFFSET_RATIO_DB = 9+ (1:imax)';
% Initialisation of matrices for bark/linear frequency conversion
% (loop increments i to the proper critical band)
%
num_bins = length(freq);
LIN_TO_BARK = zeros(imax,num_bins);
i = 1;
for j = 1:num_bins
while ~((freq(j) >= crit_band_ends(i)) & ...
(freq(j) < crit_band_ends(i+1))),
i = i+1;
end
LIN_TO_BARK(i,j) = 1;
end
% Calculation of spreading function (Schroeder et al., 82)
spreading_fcn = zeros(imax);
summ = 0.474:imax;
spread = 10.^((15.81+7.5.*summ-17.5.*sqrt(1+summ.^2))./10);
for i = 1:imax
for j = 1:imax
spreading_fcn(i,j) = spread(abs(j-i)+1);
end
end
% Calculation of excitation pattern function
EX_PAT = spreading_fcn* LIN_TO_BARK;
% Calculation of DC gain due to spreading function
DC_GAIN = spreading_fcn* ones(imax,1);
%Sx = X.* conj(X);
C = EX_PAT* Sx;
% Calculation of spectral flatness measure SFM_dB
%
[num_bins num_frames] = size(Sx);
k = 1/num_bins;
SFM_dB = 10.*log10((prod(Sx).^k)./(k.*sum(Sx)+eps)+ eps);
% Calculation of tonality coefficient and masked threshold offset
%
alpha = min(1,SFM_dB./-60);
O_dB = OFFSET_RATIO_DB(:,ones(1,num_frames)).*...
alpha(ones(length(OFFSET_RATIO_DB),1),:) + 5.5;
% Threshold calculation and renormalisation, accounting for absolute
% thresholds
T = C./10.^(O_dB./10);
T = T./DC_GAIN(:,ones(1,num_frames));
T = max( T, ABSOLUTE_THRESH(:, ones(1, num_frames)));
% Reconversion to linear frequency scale
%M = 1.* sqrt((LIN_TO_BARK')*T);
M= LIN_TO_BARK'* T;

View File

@ -0,0 +1,153 @@
function stsa_mis(filename,outfile)
%
% Implements the Bayesian estimator based on the modified Itakura-Saito
% distortion measure [1, Eq. 43].
%
% Usage: stsa_mis(noisyFile, outputFile)
%
% infile - noisy speech file in .wav format
% outputFile - enhanced output file in .wav format
%
%
% Example call: stsa_mis('sp04_babble_sn10.wav','out_mis.wav');
%
% References:
% [1] Loizou, P. (2005). Speech enhancement based on perceptually motivated
% Bayesian estimators of the speech magnitude spectrum. IEEE Trans. on Speech
% and Audio Processing, 13(5), 857-869.
%
% Author: Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
%-------------------------------------------------------------------------
if nargin<2
fprintf('Usage: stsa_mis inFile outFile.wav \n\n');
return;
end
[x, Srate]= audioread( filename);
% =============== Initialize variables ===============
%
len=floor(20*Srate/1000); % Frame size in samples
if rem(len,2)==1, len=len+1; end;
PERC=50; % window overlap in percent of frame size
len1=floor(len*PERC/100);
len2=len-len1;
win=hanning(len); %tukey(len,PERC); % define window
% Noise magnitude calculations - assuming that the first 5 frames are noise/silence
%
nFFT=len;
nFFT2=len/2;
noise_mean=zeros(nFFT,1);
j=1;
for k=1:5
noise_mean=noise_mean+abs(fft(win.*x(j:j+len-1),nFFT));
j=j+len;
end
noise_mu=noise_mean/5;
noise_mu2=noise_mu.^2;
%--- allocate memory and initialize various variables
img=sqrt(-1);
x_old=zeros(len1,1);
Nframes=floor(length(x)/len2)-1;
xfinal=zeros(Nframes*len2,1);
%=============================== Start Processing =======================================================
%
k=1;
aa=0.98;
fprintf('\nThis might take some time ...\n');
for n=1:Nframes
insign=win.*x(k:k+len-1);
%--- Take fourier transform of frame ----
spec=fft(insign,nFFT);
sig=abs(spec); % compute the magnitude
sig2=sig.^2;
gammak=min(sig2./noise_mu2,40); % post SNR. Limit it to avoid overflows
if n==1
ksi=aa+(1-aa)*max(gammak-1,0);
else
ksi=aa*Xk_prev./noise_mu2 + (1-aa)*max(gammak-1,0); % a priori SNR
end
vk=ksi.*gammak./(1+ksi);
sig_hat=log(comp_int(vk,gammak,sig)); % Eq. 41
Xk_prev=sig_hat.^2;
xi_w= ifft( sig_hat.* exp(img*angle(spec)));
xi_w= real( xi_w);
% --- Overlap and add ---------------
%
xfinal(k:k+ len2-1)= x_old+ xi_w(1:len1);
x_old= xi_w(len1+ 1: len);
if rem(n,20)==0, fprintf('Frame: %d Percent completed:%4.2f\n',n,n*100/Nframes); end;
k=k+len2;
end
%========================================================================================
audiowrite(outfile,xfinal,Srate,'BitsPerSample',16);
%------------------------------E N D -----------------------------------
function xhat=comp_int(vk,gammak,Yk)
% -- Evaluates Eq. 43 in [1]
%
Yk2=Yk.*Yk;
G2=gammak.^2;
EV=exp(-vk);
N=40; % number of terms to keep in infinite sum (Eq. 43)
L=length(vk)/2+1;
J1=zeros(L,1);
J2=zeros(L,1);
for j=1:L
sum=0; sum_b=0;
for m=0:N
F=factorial(m);
d1=(vk(j))^m;
d2=hyperg(-m,-m,0.5,Yk2(j)/(4*G2(j)),10);
d2_b=hyperg(-m,-m,1.5,Yk2(j)/(4*G2(j)),10);
sum=sum+d1*d2/F;
sum_b=sum_b+gamma(m+1.5)*d1*d2_b/(F*gamma(m+1));
end
J1(j)=sum;
J2(j)=sum_b;
end
J1=J1.*EV(1:L);
J2=J2.*EV(1:L).*sqrt(vk(1:L)).*Yk(1:L)./gammak(1:L);
xhat2=max(real(J1+J2),0.00001);
xhat = [xhat2; flipud(xhat2(2:L-1))];

View File

@ -0,0 +1,131 @@
function stsa_wcosh(filename,outfile,p)
%
% Implements the Bayesian estimator based on the weighted cosh
% distortion measure [1, Eq. 34].
%
% Usage: stsa_wcosh(noisyFile, outputFile, p)
%
% infile - noisy speech file in .wav format
% outputFile - enhanced output file in .wav format
% p - power exponent used in the weighted cosh measure.
% Valid values for p: p>-1
%
%
% Example call: stsa_wcosh('sp04_babble_sn10.wav','out_wcosh.wav',-0.5);
%
% References:
% [1] Loizou, P. (2005). Speech enhancement based on perceptually motivated
% Bayesian estimators of the speech magnitude spectrum. IEEE Trans. on Speech
% and Audio Processing, 13(5), 857-869.
%
% Author: Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
%-------------------------------------------------------------------------
if nargin<3
fprintf('Usage: stsa_wcosh(infile.wav,outfile.wav,p) \n');
fprintf(' where p>-1 \n\n');
return;
end;
if p<-1
error('ERROR! p needs to be larger than -1.\n\n');
end
[x, Srate]= audioread( filename);
% =============== Initialize variables ===============
%
len=floor(20*Srate/1000); % Frame size in samples
if rem(len,2)==1, len=len+1; end;
PERC=50; % window overlap in percent of frame size
len1=floor(len*PERC/100);
len2=len-len1;
win=hanning(len); %tukey(len,PERC); % define window
% Noise magnitude calculations - assuming that the first 5 frames are noise/silence
%
nFFT=2*len;
nFFT2=len/2;
noise_mean=zeros(nFFT,1);
j=1;
for k=1:5
noise_mean=noise_mean+abs(fft(win.*x(j:j+len-1),nFFT));
j=j+len;
end
noise_mu=noise_mean/5;
noise_mu2=noise_mu.^2;
%--- allocate memory and initialize various variables
x_old=zeros(len1,1);
Nframes=floor(length(x)/len2)-1;
xfinal=zeros(Nframes*len2,1);
%=============================== Start Processing =======================================================
%
k=1;
aa=0.98;
CC2=sqrt(gamma((p+3)/2)/gamma((p+1)/2));
for n=1:Nframes
insign=win.*x(k:k+len-1);
%--- Take fourier transform of frame
spec=fft(insign,nFFT);
sig=abs(spec); % compute the magnitude
sig2=sig.^2;
gammak=min(sig2./noise_mu2,40); % post SNR
if n==1
ksi=aa+(1-aa)*max(gammak-1,0);
else
ksi=aa*Xk_prev./noise_mu2 + (1-aa)*max(gammak-1,0); % a priori SNR
end
vk=ksi.*gammak./(1+ksi);
% --- for the weighted cosh measure
numer=CC2*sqrt(vk.*confhyperg(-(p+1)/2,1,-vk,100));
denom=gammak.*sqrt(confhyperg(-(p-1)/2,1,-vk,100));
hw=numer./denom;
sig=sig.*hw;
Xk_prev=sig.^2;
xi_w= ifft( hw .* spec, nFFT);
xi_w= real( xi_w);
% --- Overlap and add ---------------
%
xfinal(k:k+ len2-1)= x_old+ xi_w(1:len1);
x_old= xi_w(len1+ 1: len);
k=k+len2;
end
%========================================================================================
audiowrite(outfile,xfinal,Srate,'BitsPerSample',16);

View File

@ -0,0 +1,145 @@
function stsa_weuclid(filename,outfile,p)
%
% Implements the Bayesian estimator based on the weighted-Euclidean
% distortion measure [1, Eq. 18].
%
% Usage: stsa_weuclid(noisyFile, outputFile, p)
%
% infile - noisy speech file in .wav format
% outputFile - enhanced output file in .wav format
% p - power exponent used in the weighted-Euclidean measure.
% Valid values for p: p>-2
%
%
% Example call: stsa_weuclid('sp04_babble_sn10.wav','out_weuclid.wav',-1);
%
% References:
% [1] Loizou, P. (2005). Speech enhancement based on perceptually motivated
% Bayesian estimators of the speech magnitude spectrum. IEEE Trans. on Speech
% and Audio Processing, 13(5), 857-869.
%
% Author: Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
%-------------------------------------------------------------------------
if nargin<3
fprintf('Usage: stsa_weuclid(infile.wav,outfile.wav,p) \n');
fprintf(' where p>-2 \n\n');
return;
end;
if p<-2,
error('ERROR! p needs to be larger than -2.\n\n');
end
[x, Srate]= audioread( filename);
% =============== Initialize variables ===============
len=floor(20*Srate/1000); % Frame size in samples
if rem(len,2)==1, len=len+1; end;
PERC=50; % window overlap in percent of frame size
len1=floor(len*PERC/100);
len2=len-len1;
win=hamming(len); %tukey(len,PERC); % define window
% Noise magnitude calculations - assuming that the first 6 frames are noise/silence
%
nFFT=2*len;
nFFT2=len/2;
noise_mean=zeros(nFFT,1);
j=1;
for k=1:6
noise_mean=noise_mean+abs(fft(win.*x(j:j+len-1),nFFT));
j=j+len;
end
noise_mu=noise_mean/6;
noise_mu2=noise_mu.^2;
%--- allocate memory and initialize various variables
k=1;
img=sqrt(-1);
x_old=zeros(len1,1);
Nframes=floor(length(x)/len2)-1;
xfinal=zeros(Nframes*len2,1);
%=============================== Start Processing =======================================================
%
k=1;
aa=0.98;
mu=0.98;
eta=0.15;
c=sqrt(pi)/2;
C2=gamma(0.5);
%p=-1;
CC=gamma((p+3)/2)/gamma(p/2+1);
ksi_min=10^(-25/10);
for n=1:Nframes
insign=win.*x(k:k+len-1);
%--- Take fourier transform of frame
spec=fft(insign,nFFT);
sig=abs(spec); % compute the magnitude
sig2=sig.^2;
gammak=min(sig2./noise_mu2,40); % post SNR
if n==1
ksi=aa+(1-aa)*max(gammak-1,0);
else
ksi=aa*Xk_prev./noise_mu2 + (1-aa)*max(gammak-1,0); % a priori SNR
ksi=max(ksi_min,ksi); % limit ksi to -25 dB
end
log_sigma_k= gammak.* ksi./ (1+ ksi)- log(1+ ksi);
vad_decision= sum( log_sigma_k)/ len;
if (vad_decision< eta)
% noise only frame found
noise_mu2= mu* noise_mu2+ (1- mu)* sig2;
end
% ===end of vad===
vk=ksi.*gammak./(1+ksi);
%----- weighted Euclidean distance ------------------------
if p==-1
hw=CC*sqrt(vk)./(gammak.*exp(-vk/2).*besseli(0,vk/2)); % if p=-1 use this equation as it's faster
else
numer=CC*sqrt(vk).*confhyperg(-(p+1)/2,1,-vk,100);
denom=gammak.*confhyperg(-p/2,1,-vk,100);
hw=numer./denom;
end
%
sig=sig.*hw;
Xk_prev=sig.^2;
xi_w= ifft( hw .* spec, nFFT);
xi_w= real( xi_w);
% --- Overlap and add ---------------
%
xfinal(k:k+ len2-1)= x_old+ xi_w(1:len1);
x_old= xi_w(len1+ 1: len);
k=k+len2;
end
%========================================================================================
audiowrite(outfile,xfinal,Srate,'BitsPerSample',16);

View File

@ -0,0 +1,169 @@
function stsa_wlr(filename,outfile)
%
% Implements the Bayesian estimator based on the weighted likelihood ratio
% distortion measure [1, Eq. 37].
%
% Usage: stsa_wlr(noisyFile, outputFile)
%
% infile - noisy speech file in .wav format
% outputFile - enhanced output file in .wav format
%
%
% Example call: stsa_wlr('sp04_babble_sn10.wav','out_wlr.wav');
%
% References:
% [1] Loizou, P. (2005). Speech enhancement based on perceptually motivated
% Bayesian estimators of the speech magnitude spectrum. IEEE Trans. on Speech
% and Audio Processing, 13(5), 857-869.
%
% Author: Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
%-------------------------------------------------------------------------
if nargin<2
fprintf('Usage: stsa_wlr inFile outFile.wav \n\n');
return;
end
[x, Srate]= audioread( filename);
% =============== Initialize variables ===============
%
len=floor(20*Srate/1000); % Frame size in samples
if rem(len,2)==1, len=len+1; end;
PERC=50; % window overlap in percent of frame size
len1=floor(len*PERC/100);
len2=len-len1;
win=hanning(len); %tukey(len,PERC); % define window
% Noise magnitude calculations - assuming that the first 5 frames are noise/silence
%
nFFT=len;
nFFT2=len/2;
noise_mean=zeros(nFFT,1);
j=1;
for k=1:5
noise_mean=noise_mean+abs(fft(win.*x(j:j+len-1),nFFT));
j=j+len;
end
noise_mu=noise_mean/5;
noise_mu2=noise_mu.^2;
%--- allocate memory and initialize various variables
img=sqrt(-1);
x_old=zeros(len1,1);
Nframes=floor(length(x)/len2)-1;
xfinal=zeros(Nframes*len2,1);
xinterv=0.001:0.01:10;
k=1;
aa=0.98;
%=============================== Start Processing =======================================================
%
fprintf('This might take some time ...\n')
for n=1:Nframes
insign=win.*x(k:k+len-1);
%--- Take fourier transform of frame
spec=fft(insign,nFFT);
sig=abs(spec); % compute the magnitude
sig2=sig.^2;
gammak=min(sig2./noise_mu2,40); % post SNR. Limit it to avoid overflows
if n==1
ksi=aa+(1-aa)*max(gammak-1,0);
else
ksi=aa*Xk_prev./noise_mu2 + (1-aa)*max(gammak-1,0); % a priori SNR
end
vk=ksi.*gammak./(1+ksi);
xx=solve_wlr(vk,gammak,sig,xinterv); % solves Eq. 37 in [1]
sig_hat=xx;
Xk_prev=sig_hat.^2;
xi_w= ifft( sig_hat.* exp(img*angle(spec)));
xi_w= real( xi_w);
% --- Overlap and add ---------------
%
xfinal(k:k+ len2-1)= x_old+ xi_w(1:len1);
x_old= xi_w(len1+ 1: len);
if rem(n,20)==0, fprintf('Frame: %d Percent completed:%4.2f \n',n,n*100/Nframes); end;
k=k+len2;
end
%========================================================================================
audiowrite(outfile,xfinal,Srate,'BitsPerSample',16);
%==========================================================================
function x=solve_wlr(vk,gammak,Yk,xx);
% solves non-linear Eq. 37 in [1]
%
Len=length(vk);
L2=Len/2+1;
lk05=sqrt(vk).*Yk./gammak;
Ex=gamma(1.5)*lk05.*confhyperg(-0.5,1,-vk,100);
Elogx=1-0.5*(2*log(lk05)+log(vk)+expint(vk));
x=zeros(Len,1);
for n=1:L2
a=Elogx(n);
b=Ex(n);
ff=sprintf('log(x)+%f - %f/x',a,b);
y=log(xx)+a-b./xx;
bet=xx(1); tox=200;
if y(1)<0
ind=find(y>0);
bet=xx(1)/2;
tox=xx(ind(1));
[x(n),fval,flag]=fzero(inline(ff),[bet tox]);
if flag<0
x(n)=x(n-1);
end
else
ind=find(y<0);
if ~isempty(ind)
bet=xx(1);
tox=xx(ind(1));
[x(n),fval]=fzero(inline(ff),[bet tox]);
else
x(n)=0.001; % spectral floor
end
end
end
x(L2+1:Len)=flipud(x(2:L2-1));

View File

@ -0,0 +1,126 @@
function wiener_as(filename,outfile)
%
% Implements the Wiener filtering algorithm based on a priori SNR estimation [1].
%
% Usage: wiener_as(noisyFile, outputFile)
%
% infile - noisy speech file in .wav format
% outputFile - enhanced output file in .wav format
%
% Example call: wiener_as('sp04_babble_sn10.wav','out_wien_as.wav');
%
% References:
% [1] Scalart, P. and Filho, J. (1996). Speech enhancement based on a priori
% signal to noise estimation. Proc. IEEE Int. Conf. Acoust. , Speech, Signal
% Processing, 629-632.
%
% Authors: Yi Hu and Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
%-------------------------------------------------------------------------
if nargin<2
fprintf('Usage: wiener_as(noisyfile.wav,outFile.wav) \n\n');
return;
end
[noisy_speech, fs]= audioread( filename);
noisy_speech= noisy_speech(:);
% ensure column vector noisy_speech
% set parameter values
mu= 0.98; % smoothing factor in noise spectrum update
a_dd= 0.98; % smoothing factor in priori update
eta= 0.15; % VAD threshold
frame_dur= 20; % frame duration
L= frame_dur* fs/ 1000; % L is frame length (160 for 8k sampling rate)
hamming_win= hamming( L); % hamming window
U= ( hamming_win'* hamming_win)/ L; % normalization factor
% first 120 ms is noise only
len_120ms= fs/ 1000* 120;
% first_120ms= noisy_speech( 1: len_120ms).* ...
% (hann( len_120ms, 'periodic'))';
first_120ms= noisy_speech( 1: len_120ms);
% =============now use Welch's method to estimate power spectrum with
% Hamming window and 50% overlap
nsubframes= floor( len_120ms/ (L/ 2))- 1; % 50% overlap
noise_ps= zeros( L, 1);
n_start= 1;
for j= 1: nsubframes
noise= first_120ms( n_start: n_start+ L- 1);
noise= noise.* hamming_win;
noise_fft= fft( noise, L);
noise_ps= noise_ps+ ( abs( noise_fft).^ 2)/ (L* U);
n_start= n_start+ L/ 2;
end
noise_ps= noise_ps/ nsubframes;
%==============
% number of noisy speech frames
len1= L/ 2; % with 50% overlap
nframes= floor( length( noisy_speech)/ len1)- 1;
n_start= 1;
for j= 1: nframes
noisy= noisy_speech( n_start: n_start+ L- 1);
noisy= noisy.* hamming_win;
noisy_fft= fft( noisy, L);
noisy_ps= ( abs( noisy_fft).^ 2)/ (L* U);
% ============ voice activity detection
if (j== 1) % initialize posteri
posteri= noisy_ps./ noise_ps;
posteri_prime= posteri- 1;
posteri_prime( find( posteri_prime< 0))= 0;
priori= a_dd+ (1-a_dd)* posteri_prime;
else
posteri= noisy_ps./ noise_ps;
posteri_prime= posteri- 1;
posteri_prime( find( posteri_prime< 0))= 0;
priori= a_dd* (G_prev.^ 2).* posteri_prev+ ...
(1-a_dd)* posteri_prime;
end
log_sigma_k= posteri.* priori./ (1+ priori)- log(1+ priori);
vad_decision(j)= sum( log_sigma_k)/ L;
if (vad_decision(j)< eta)
% noise only frame found
noise_ps= mu* noise_ps+ (1- mu)* noisy_ps;
vad( n_start: n_start+ L- 1)= 0;
else
vad( n_start: n_start+ L- 1)= 1;
end
% ===end of vad===
G= sqrt( priori./ (1+ priori)); % gain function
enhanced= ifft( noisy_fft.* G, L);
if (j== 1)
enhanced_speech( n_start: n_start+ L/2- 1)= ...
enhanced( 1: L/2);
else
enhanced_speech( n_start: n_start+ L/2- 1)= ...
overlap+ enhanced( 1: L/2);
end
overlap= enhanced( L/ 2+ 1: L);
n_start= n_start+ L/ 2;
G_prev= G;
posteri_prev= posteri;
end
enhanced_speech( n_start: n_start+ L/2- 1)= overlap;
audiowrite(outfile,enhanced_speech(:),fs,'BitsPerSample',16); % write as a single-channel column

View File

@ -6,9 +6,9 @@
\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces The $N$-point DFT of $x[n]$, where $N=8$\relax }}{1}{}\protected@file@percent } \@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces The $N$-point DFT of $x[n]$, where $N=8$\relax }}{1}{}\protected@file@percent }
\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}} \providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
\newlabel{fig:N_point_DFT}{{1}{1}} \newlabel{fig:N_point_DFT}{{1}{1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {1.2}The Z-Transform}{1}{}\protected@file@percent }
\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces The 9-point DFT of $x[n]$, where $N=8$\relax }}{2}{}\protected@file@percent } \@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces The 9-point DFT of $x[n]$, where $N=8$\relax }}{2}{}\protected@file@percent }
\newlabel{fig:9_point_DFT}{{2}{2}} \newlabel{fig:9_point_DFT}{{2}{2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {1.2}The Z-Transform}{2}{}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {1.3}The Inverse Z-Transform}{2}{}\protected@file@percent } \@writefile{toc}{\contentsline {subsection}{\numberline {1.3}The Inverse Z-Transform}{2}{}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {2}Conclusions}{3}{}\protected@file@percent } \@writefile{toc}{\contentsline {section}{\numberline {2}Conclusions}{3}{}\protected@file@percent }
\gdef \@abspage@last{4} \gdef \@abspage@last{4}

View File

@ -1,4 +1,4 @@
This is pdfTeX, Version 3.141592653-2.6-1.40.24 (TeX Live 2022/CVE-2023-32700 patched) (preloaded format=pdflatex 2024.3.9) 17 APR 2024 21:12 This is pdfTeX, Version 3.141592653-2.6-1.40.24 (TeX Live 2022/CVE-2023-32700 patched) (preloaded format=pdflatex 2024.3.9) 25 APR 2024 09:37
entering extended mode entering extended mode
restricted \write18 enabled. restricted \write18 enabled.
%&-line parsing enabled. %&-line parsing enabled.
@ -318,37 +318,34 @@ File: Q9_point_DFT.png Graphic file (type png)
<use Q9_point_DFT.png> <use Q9_point_DFT.png>
Package pdftex.def Info: Q9_point_DFT.png used on input line 61. Package pdftex.def Info: Q9_point_DFT.png used on input line 61.
(pdftex.def) Requested size: 234.8775pt x 176.15768pt. (pdftex.def) Requested size: 234.8775pt x 176.15768pt.
[1 <./N8_point_dft.png>]
LaTeX Warning: `h' float specifier changed to `ht'.
Underfull \hbox (badness 10000) in paragraph at lines 71--76 Underfull \hbox (badness 10000) in paragraph at lines 71--76
[] []
[1 <./N8_point_dft.png>] [2 <./Q9_point_dft.png>] [3] (./lab-4.aux) ) [2 <./Q9_point_dft.png>] [3] (./lab-4.aux) )
Here is how much of TeX's memory you used: Here is how much of TeX's memory you used:
5569 strings out of 476182 5570 strings out of 476182
90423 string characters out of 5796582 90433 string characters out of 5796582
1859793 words of memory out of 6000000 1858793 words of memory out of 6000000
25843 multiletter control sequences out of 15000+600000 25844 multiletter control sequences out of 15000+600000
520010 words of font info for 63 fonts, out of 8000000 for 9000 520010 words of font info for 63 fonts, out of 8000000 for 9000
1137 hyphenation exceptions out of 8191 1137 hyphenation exceptions out of 8191
55i,8n,63p,490b,340s stack positions out of 10000i,1000n,20000p,200000b,200000s 55i,8n,63p,490b,332s stack positions out of 10000i,1000n,20000p,200000b,200000s
</usr/shar </usr/share/texlive/texmf-dist/font
e/texlive/texmf-dist/fonts/type1/public/amsfonts/cm/cmbx12.pfb></usr/share/texl s/type1/public/amsfonts/cm/cmbx12.pfb></usr/share/texlive/texmf-dist/fonts/type
ive/texmf-dist/fonts/type1/public/amsfonts/cm/cmex10.pfb></usr/share/texlive/te 1/public/amsfonts/cm/cmex10.pfb></usr/share/texlive/texmf-dist/fonts/type1/publ
xmf-dist/fonts/type1/public/amsfonts/cm/cmmi10.pfb></usr/share/texlive/texmf-di ic/amsfonts/cm/cmmi10.pfb></usr/share/texlive/texmf-dist/fonts/type1/public/ams
st/fonts/type1/public/amsfonts/cm/cmmi5.pfb></usr/share/texlive/texmf-dist/font fonts/cm/cmmi5.pfb></usr/share/texlive/texmf-dist/fonts/type1/public/amsfonts/c
s/type1/public/amsfonts/cm/cmmi7.pfb></usr/share/texlive/texmf-dist/fonts/type1 m/cmmi7.pfb></usr/share/texlive/texmf-dist/fonts/type1/public/amsfonts/cm/cmr10
/public/amsfonts/cm/cmr10.pfb></usr/share/texlive/texmf-dist/fonts/type1/public .pfb></usr/share/texlive/texmf-dist/fonts/type1/public/amsfonts/cm/cmr12.pfb></
/amsfonts/cm/cmr12.pfb></usr/share/texlive/texmf-dist/fonts/type1/public/amsfon usr/share/texlive/texmf-dist/fonts/type1/public/amsfonts/cm/cmr17.pfb></usr/sha
ts/cm/cmr17.pfb></usr/share/texlive/texmf-dist/fonts/type1/public/amsfonts/cm/c re/texlive/texmf-dist/fonts/type1/public/amsfonts/cm/cmr5.pfb></usr/share/texli
mr5.pfb></usr/share/texlive/texmf-dist/fonts/type1/public/amsfonts/cm/cmr7.pfb> ve/texmf-dist/fonts/type1/public/amsfonts/cm/cmr7.pfb></usr/share/texlive/texmf
</usr/share/texlive/texmf-dist/fonts/type1/public/amsfonts/cm/cmsy10.pfb></usr/ -dist/fonts/type1/public/amsfonts/cm/cmsy10.pfb></usr/share/texlive/texmf-dist/
share/texlive/texmf-dist/fonts/type1/public/amsfonts/cm/cmsy7.pfb></usr/share/t fonts/type1/public/amsfonts/cm/cmsy7.pfb></usr/share/texlive/texmf-dist/fonts/t
exlive/texmf-dist/fonts/type1/public/amsfonts/cm/cmtt10.pfb> ype1/public/amsfonts/cm/cmtt10.pfb>
Output written on lab-4.pdf (4 pages, 162298 bytes). Output written on lab-4.pdf (4 pages, 162398 bytes).
PDF statistics: PDF statistics:
87 PDF objects out of 1000 (max. 8388607) 87 PDF objects out of 1000 (max. 8388607)
50 compressed objects within 1 object stream 50 compressed objects within 1 object stream

View File

@ -49,14 +49,14 @@ where $r$ is the common ratio between adjacent terms. For the $N$-point DFT of $
\label{eqn:DFT_N_point} \label{eqn:DFT_N_point}
\end{equation} \end{equation}
The $N$-point DFT of $x[n]$, where $N=8$ is seen in figure \ref{fig:N_point_DFT}. It only has a non-zero value for $k={N\over2}=4$. This is the case for all even-number-point DFTs. Therefore, only odd-number-point DFTs should be used. The $N$-point DFT of $x[n]$, where $N=8$ is seen in figure \ref{fig:N_point_DFT}. It only has a non-zero value for $k={N\over2}=4$. This is the case for all even-number-point DFTs. Therefore, only odd-number-point DFTs should be used.
\begin{figure}[h] \begin{figure}[H]
\center \center
\includegraphics[width=0.5\textwidth]{N8_point_DFT.png} \includegraphics[width=0.5\textwidth]{N8_point_DFT.png}
\caption{The $N$-point DFT of $x[n]$, where $N=8$} \caption{The $N$-point DFT of $x[n]$, where $N=8$}
\label{fig:N_point_DFT} \label{fig:N_point_DFT}
\end{figure} \end{figure}
For example, the 9-point DFT of $x[n]$, where $N=8$ is seen in figure \ref{fig:9_point_DFT}. While equation \ref{eqn:DFT_N_point} cannot be used because there are a different number of samples for the DFT and the input signal, the overall DFT is more useful than the 8-point DFT. For example, the 9-point DFT of $x[n]$, where $N=8$ is seen in figure \ref{fig:9_point_DFT}. While equation \ref{eqn:DFT_N_point} cannot be used because there are a different number of samples for the DFT and the input signal, the overall DFT is more useful than the 8-point DFT.
\begin{figure}[h] \begin{figure}[H]
\center \center
\includegraphics[width=0.5\textwidth]{Q9_point_DFT.png} \includegraphics[width=0.5\textwidth]{Q9_point_DFT.png}
\caption{The 9-point DFT of $x[n]$, where $N=8$} \caption{The 9-point DFT of $x[n]$, where $N=8$}

View File

@ -0,0 +1,45 @@
*--- SIMULATE FILE
*---SIMULATION PARAMETERS
.PARAM:
+ FS=100K ;SYSTEM SWITCHING FREQUENCY
+ TS={1/FS}
+ W={2*PI*FS}
+ CYCLE=3 ;SIMULATED CYCLES
+ START={500*TS}
+ END={START+CYCLE*TS}
+ STEP={TS/1000}
.TRAN {STEP} {END} {START} {STEP} UIC
*---CIRCUIT PARAMETERS
.PARAM:
+ VIN = 100
+ L1 = 50U
+ C1 = 20U
+ RL = 10
+ D = 0.5
+ TON = D*TS
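*--- NOTE: BUCK (STEP-DOWN) TOPOLOGY; FOR IDEAL CCM OPERATION VOUT = D*VIN = 50V,
*--- CONSISTENT WITH THE MEASURED AVERAGE VOUT OF ~50.4V IN THE SIMULATION LOG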
*--- DC POWER SUPPLY
VIN IN 0 {VIN}
*--- CIRCUIT DESCRIPTION
S1 IN S1OUT GP 0 MYSWITCH
D1 0 S1OUT MYDIODE
L1 S1OUT L1OUT {L1} IC=0
C1 L1OUT 0 {C1} IC=0
*--- LOAD RESISTANCE
RL L1OUT 0 {RL}
*--- CONTROL SIGNAL FOR THE SWITCH
VGP GP 0 PULSE(0 10 0 0.1U 0.1U {TON-0.1U} {TS})
RGP GP 0 100K
*--- MEASURE POWER AND EFFICIENCY
.MEAS TRAN VOUT AVG V(L1OUT)
.MODEL MYDIODE D(RON=0.1M ROFF=100MEG VFWD=0.1M)
.MODEL MYSWITCH SW(RON=0.1M ROFF=100MEG VT=3)

View File

@ -0,0 +1,25 @@
Circuit: *--- SIMULATE FILE
Per .tran options, skipping operating point for transient analysis.
vout: AVG(v(l1out))=50.3944 FROM 0 TO 3e-05
Date: Tue Apr 23 17:05:31 2024
Total elapsed time: 2.271 seconds.
tnom = 27
temp = 27
method = modified trap
totiter = 1032241
traniter = 1032241
tranpoints = 514891
accept = 512502
rejected = 2389
matrix size = 6
fillins = 0
solver = Normal
Avg thread counts: 1.0/1.0/1.0/1.0
Matrix Compiler1: 146 bytes object code size 0.2/0.2/[0.2]
Matrix Compiler2: 346 bytes object code size 0.2/0.3/[0.2]

Binary file not shown.


View File

@ -0,0 +1,13 @@
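# Ziegler-Nichols closed-loop (ultimate-gain) PID tuning: Ku is the measured
# ultimate gain and Tu the period of sustained oscillation; the constants below
# are the classic PID rules and the "no overshoot" variant.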
Tu = 0.2
Ku = 0.022
Kp_pid = 0.6*Ku
Ki_pid = 1.2*Ku/Tu
Kd_pid = 0.075*Ku*Tu
Kp_no_os = 0.2*Ku
Ki_no_os = 0.4*Ku/Tu
Kd_no_os = 0.066*Ku*Tu
print(f"Ziegler Nichols PID Tune: K_p={Kp_pid}, K_i={Ki_pid}, K_d={Kd_pid}")
print(f"Ziegler Nichols No Overshoot Tune: K_p={Kp_no_os}, K_i={Ki_no_os}, K_d={Kd_no_os}")

Binary file not shown.


Binary file not shown.


Binary file not shown.


Binary file not shown.


Binary file not shown.
