Started code for DSP final project
This commit is contained in:
@ -0,0 +1,176 @@
|
||||
function audnoise(ns_file,outfile)
%AUDNOISE  Audible-noise suppression speech enhancement [1].
%
%  Usage:  audnoise(noisyFile, outputFile)
%
%         ns_file - noisy speech file in .wav format
%         outfile - enhanced output file in .wav format
%
%  The masking threshold is re-estimated iter_num times per frame
%  (2 by default); change the variable iter_num below to use another count.
%
%  Example call:  audnoise('sp04_babble_sn10.wav','out_aud.wav');
%
%  References:
%   [1] Tsoukalas, D. E., Mourjopoulos, J. N., and Kokkinakis, G. (1997). Speech
%       enhancement based on audible noise suppression. IEEE Trans. on Speech and
%       Audio Processing, 5(6), 497-514.
%
% Authors: Yi Hu and Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
%-------------------------------------------------------------------------

if nargin<2
    fprintf('Usage: audnoise(noisyfile.wav,outFile.wav) \n\n');
    return;
end

iter_num   = 2;   % number of masking-threshold iterations per frame
NF_SABSENT = 6;   % number of leading frames assumed speech-absent; they
                  % provide the initial noise power spectrum

% wavread/wavwrite are no longer shipped with recent MATLAB releases, so use
% audioread/audiowrite when available and fall back to the legacy functions.
if exist('audioread','file')
    [nsdata, Fs] = audioread(ns_file);
    info = audioinfo(ns_file);
    bits = info.BitsPerSample;
else
    [nsdata, Fs, bits] = wavread(ns_file);   % nsdata is a column vector
end

aa  = 0.98;   % weight of the previous-frame term in the a priori SNR
mu  = 0.98;   % smoothing factor of the noise spectrum update
eta = 0.15;   % VAD threshold

nwind = floor(20*Fs/1000);                    % 20 ms window
if rem(nwind,2) ~= 0, nwind = nwind+1; end    % make the window length even
noverlap = nwind/2;
w = hamming(nwind);

% Initial noise power spectrum: average periodogram of the first NF_SABSENT
% non-overlapping, Hamming-windowed frames.
noise_frames = reshape(nsdata(1:nwind*NF_SABSENT), nwind, NF_SABSENT);
noise_frames = noise_frames .* w(:, ones(1,NF_SABSENT));
noise_ps = mean(abs(fft(noise_frames)).^2, 2);   % NOTE: column vector

% Noise power averaged inside each critical band; refreshed every frame.
noise_b = zeros(nwind/2+1,1);
CB_FREQ_INDICES = find_CB_FREQ_INDICES(Fs,nwind,16,nwind/2);

% ----- framing -----------------------------------------------------------
nslide = nwind - noverlap;
x  = nsdata;
nx = length(x);
ncol = fix((nx-noverlap)/nslide);
colindex = 1 + (0:(ncol-1))*nslide;
last_sample = nwind + colindex(ncol) - 1;
if nx < last_sample
    % pad the tail with very low-level random noise (not zeros)
    x(nx+1:last_sample) = rand(last_sample-nx,1)*(2^(-15));
end

es_data = zeros(last_sample,1);   % enhanced signal, preallocated
es_old  = zeros(noverlap,1);      % second half of the previous enhanced
                                  % frame, used for overlap-add

for k = 1:ncol

    first = colindex(k);
    y = x(first:first+nwind-1);
    y = y.*w;                       % window the noisy speech frame

    y_spec = fft(y);
    y_ps   = abs(y_spec).^2;        % power spectrum of noisy speech
    y_ps1  = y_ps(1:nwind/2+1);

    % ==== start of vad ====
    gammak = min(y_ps./noise_ps,40);                          % post SNR
    if k==1
        ksi = aa+(1-aa)*max(gammak-1,0);
    else
        ksi = aa*Xk_prev./noise_ps + (1-aa)*max(gammak-1,0);  % a priori SNR
    end

    log_sigma_k  = gammak.*ksi./(1+ksi) - log(1+ksi);
    vad_decision = sum(log_sigma_k)/nwind;
    if (vad_decision < eta)
        % noise only frame found
        noise_ps = mu*noise_ps + (1-mu)*y_ps;
    end

    % average the (possibly updated) noise spectrum within each critical band
    for i = 1:length(CB_FREQ_INDICES)
        idx = CB_FREQ_INDICES{i};
        noise_b(idx) = mean(noise_ps(idx));
    end
    % ==== end of vad ====

    % conservative estimate of the clean spectrum from power spectral
    % subtraction, floored at 0.001
    x_cons1 = max(y_ps-noise_ps,0.001);
    x_cons  = x_cons1(1:nwind/2+1);

    % --- Estimate masking thresholds iteratively (as per page 505) ----
    Tk0 = mask(x_cons,nwind,Fs,16);
    Xp  = y_ps1;
    for j = 1:iter_num
        ab  = noise_b+(noise_b.^2)./Tk0;     % Eq. 41
        Xp  = (Xp.^2)./(ab+Xp);              % Eq. 40
        Tk0 = mask(Xp,nwind,Fs,16);
    end

    % --- Estimate alpha ------
    % eq. 26 for Threshold (T) method with ni(b)=1
    alpha = (noise_b+Tk0).*(noise_b./Tk0);

    % ---- Apply suppression rule --------------
    H0 = (Xp./(alpha+Xp));
    H  = [H0(1:nwind/2+1); flipud(H0(2:nwind/2))];   % mirror to full FFT length

    x_hat   = H.*y_spec;
    Xk_prev = abs(x_hat).^2;      % kept for the next frame's a priori SNR

    es_tmp = real(ifft(x_hat));

    % ---- Overlap and add ---------------
    es_data(first:first+nwind-1) = [es_tmp(1:noverlap)+es_old; ...
                                    es_tmp(noverlap+1:nwind)];
    es_old = es_tmp(nwind-noverlap+1:nwind);
end

if exist('audiowrite','file')
    audiowrite(outfile, es_data, Fs, 'BitsPerSample', bits);
else
    wavwrite(es_data, Fs, bits, outfile);
end
|
||||
|
||||
%------------------------------------------------------
|
||||
|
||||
function [CB_FREQ_INDICES]=find_CB_FREQ_INDICES(Fs,dft_length,nbits,frame_overlap)
% Lists, for every critical band, which DFT bins fall inside it.
%
% This function is from Matlab STSA Toolbox for Audio Signal Noise Reduction
% Copyright (C) 2001 Patrick J. Wolfe
%
%   Fs, dft_length - define the bin frequencies 0:Fs/dft_length:Fs/2
%   nbits, frame_overlap - accepted but not used by this function
%
%   CB_FREQ_INDICES{b} is a row vector with the 1-based indices of the bins
%   whose frequency f satisfies edge(b) <= f < edge(b+1).

bin_hz = (0:Fs/dft_length:Fs/2)';

band_edges = [0;100;200;300;400;510;630;770;920;1080;1270;1480;1720;2000;2320;2700;3150;3700;4400;5300;6400;7700;9500;12000;15500;Inf];

% index of the last band whose lower edge lies below the highest bin
n_bands = max(find(band_edges < bin_hz(end)));

% Which bins are in which critical band
for b = 1:n_bands
    in_band = (bin_hz >= band_edges(b)) & (bin_hz < band_edges(b+1));
    CB_FREQ_INDICES{b} = find(in_band)';   % row vector, as callers expect
end
|
||||
|
@ -0,0 +1,47 @@
|
||||
function z=confhyperg(a,b,x,n)
%
% Computes the confluent hypergeometric function
% using a series expansion:
%
%    f(a,b;x)=
%
%    1 + [a/(1! b)]x + [a(a+1)/(2! b(b+1))]x^2 +
%    [a(a+1)(a+2)/(3! b(b+1)(b+2))]x^3 + ...
%
% The above series is expanded to n terms
%
%   a, b - parameters of the function
%   x    - argument (the update is element-wise, so arrays are accepted)
%   n    - number of series terms, a positive integer
%
% When x < 0 (for an array: when every element is negative) the series is
% evaluated as exp(-|x|) * f(b-a, b; |x|) instead of summing directly at x.
%
% Philipos C. Loizou

if nargin ~= 4
    error('Usage: confhyperg(a,b,x,n) - Incorrect number of arguments')
end

if (n <= 0 | n ~= floor(n))
    error('Usage: confhyperg(a,b,x,n) - n has to be a positive integer')
end

NEG=0;
if x<0
    x=abs(x);
    a=b-a;
    NEG=1;
end

z = 0;
m = 0;
while (m<n)
    if (m == 0)
        delta = 1;
    else
        % ratio of consecutive series terms
        delta = delta .* x .* (a + (m - 1)) ./ (m .* (b + (m-1)));
    end

    z = z + delta;
    m = m + 1;
end

if NEG==1  % if x<0
    z=exp(-x).*z;
end
|
@ -0,0 +1,54 @@
|
||||
function z=hyperg(a,b,c,x,n)
% HYPERG  Gaussian hypergeometric function evaluated by its power series,
%         truncated after n terms:
%
%    f(a,b;c;x) = 1 + [ab/1!c]x + [a(a+1)b(b+1)/2!c(c+1)]x^2 +
%                 [a(a+1)(a+2)b(b+1)(b+2)/3!c(c+1)(c+2)]x^3 + ...
%
% The function solves the Gaussian Hypergeometric Differential Equation
%
%     x(1-x)y'' + {c-(a+b+1)x}y' - aby = 0
%
% and the series converges only for
%                                           |x| < 1
%                                           c != 0, -1, -2, -3, ...
%
% Only the restriction on c and the validity of n are enforced below; the
% |x| < 1 check of the original authors is disabled in this file.
%
% Comments to:
%     Diego Garcia   - d.garcia@ieee.org
%     Chuck Mongiovi - mongiovi@fast.net
%     June 14, 2002

if nargin ~= 5
    error('Usage: hypergeometric2f1(a,b,c,x,n) --> Wrong number of arguments')
end

if (n <= 0 | n ~= floor(n))
    error('Usage: hypergeometric2f1(a,b,c,x,n) --> n has to be a positive integer')
end

if (c <= 0 & c == floor(c))
    error('Usage: hypergeometric2f1(a,b,c,x,n) --> c != 0, -1, -2, -3, ...')
end

% zeroth-order term of the series
term = 1;
z = 0 + term;

% each further term follows from the previous one by a simple ratio
for order = 1:n-1
    prev = order - 1;
    term = term .* x .* (a + prev) .* (b + prev) ./ order ./ (c + prev);
    z = z + term;
end
|
@ -0,0 +1,119 @@
|
||||
function logmmse(filename,outfile)
%LOGMMSE  Speech enhancement with the logMMSE algorithm [1].
%
%  Usage:  logmmse(noisyFile, outputFile)
%
%         filename - noisy speech file in .wav format
%         outfile  - enhanced output file in .wav format (written as 16 bit)
%
%  Example call:  logmmse('sp04_babble_sn10.wav','out_log.wav');
%
%  References:
%   [1] Ephraim, Y. and Malah, D. (1985). Speech enhancement using a minimum
%       mean-square error log-spectral amplitude estimator. IEEE Trans. Acoust.,
%       Speech, Signal Process., ASSP-33(2), 443-445.
%
% Authors: Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
%-------------------------------------------------------------------------

if nargin<2
    fprintf('Usage: logmmse(noisyfile.wav,outFile.wav) \n\n');
    return;
end

% wavread/wavwrite are no longer shipped with recent MATLAB releases, so use
% audioread/audiowrite when available and fall back to the legacy functions.
if exist('audioread','file')
    [x, Srate] = audioread(filename);
else
    [x, Srate] = wavread(filename);     % x is a column vector
end

% =============== Initialize variables ===============

len=floor(20*Srate/1000);       % Frame size in samples (20 ms)
if rem(len,2)==1, len=len+1; end
PERC=50;                        % window overlap in percent of frame size
len1=floor(len*PERC/100);       % overlap length
len2=len-len1;                  % frame advance

win=hamming(len);               % define window

% Noise magnitude calculations - assuming that the first 6 frames is
% noise/silence
nFFT=2*len;
noise_mean=zeros(nFFT,1);
j=1;
for m=1:6
    noise_mean=noise_mean+abs(fft(win.*x(j:j+len-1),nFFT));
    j=j+len;
end
noise_mu=noise_mean/6;
noise_mu2=noise_mu.^2;          % initial noise power spectrum

%--- allocate memory and initialize various variables

x_old=zeros(len1,1);
Nframes=floor(length(x)/len2)-floor(len/len2);
xfinal=zeros(Nframes*len2,1);

%=============================== Start Processing ========================

k=1;                  % first sample of the current frame
aa=0.98;              % weight of the previous-frame term in the a priori SNR
mu=0.98;              % smoothing factor of the noise spectrum update
eta=0.15;             % VAD threshold

ksi_min=10^(-25/10);  % a priori SNR floor (-25 dB)

for n=1:Nframes

    insign=win.*x(k:k+len-1);

    spec=fft(insign,nFFT);
    sig=abs(spec);              % compute the magnitude
    sig2=sig.^2;

    gammak=min(sig2./noise_mu2,40);  % limit post SNR to avoid overflows
    if n==1
        ksi=aa+(1-aa)*max(gammak-1,0);
    else
        ksi=aa*Xk_prev./noise_mu2 + (1-aa)*max(gammak-1,0);   % a priori SNR
        ksi=max(ksi_min,ksi);   % limit ksi to -25 dB
    end

    % ===start of vad===
    % NOTE(review): the sum runs over nFFT = 2*len bins but is divided by
    % len; kept as in the original implementation.
    log_sigma_k= gammak.* ksi./ (1+ ksi)- log(1+ ksi);
    vad_decision= sum(log_sigma_k)/ len;
    if (vad_decision< eta)
        % noise only frame found
        noise_mu2= mu* noise_mu2+ (1- mu)* sig2;
    end
    % ===end of vad===

    A=ksi./(1+ksi);             % Log-MMSE estimator
    vk=A.*gammak;
    ei_vk=0.5*expint(vk);
    hw=A.*exp(ei_vk);           % spectral gain

    sig=sig.*hw;
    Xk_prev=sig.^2;             % kept for the next frame's a priori SNR

    xi_w= ifft( hw .* spec,nFFT);
    xi_w= real( xi_w);

    % overlap-add: emit the first half, keep the second half for next frame
    xfinal(k:k+ len2-1)= x_old+ xi_w(1:len1);
    x_old= xi_w(len1+ 1: len);

    k=k+len2;

end

if exist('audiowrite','file')
    audiowrite(outfile, xfinal, Srate, 'BitsPerSample', 16);
else
    wavwrite(xfinal,Srate,16,outfile);
end
|
||||
|
||||
|
@ -0,0 +1,287 @@
|
||||
function logmmse_SPU(filename,outfile,option)
%LOGMMSE_SPU  logMMSE algorithm with signal-presence uncertainty (SPU) [1].
%  Four different methods for estimating the a priori probability of speech
%  absence (P(H0)) are implemented.
%
%  Usage:  logmmse_SPU(noisyFile, outputFile, option)
%
%         filename - noisy speech file in .wav format
%         outfile  - enhanced output file in .wav format (written as 16 bit)
%         option   - method used to estimate the a priori probability of
%                    speech absence, P(Ho):
%                  1  - hard decision (Soon et al. [2])
%                  2  - soft decision (Soon et al. [2])
%                  3  - Malah et al.(1999) - ICASSP
%                  4  - Cohen (2002) [1]
%
%  Example call:  logmmse_SPU('sp04_babble_sn10.wav','out_logSPU.wav',1);
%
%  References:
%   [1] Cohen, I. (2002). Optimal speech enhancement under signal presence
%       uncertainty using log-spectra amplitude estimator. IEEE Signal Processing
%       Letters, 9(4), 113-116.
%   [2] Soon, I., Koh, S., and Yeo, C. (1999). Improved noise suppression
%       filter using self-adaptive estimator of probability of speech absence.
%       Signal Processing, 75, 151-159.
%
% Author: Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
%-------------------------------------------------------------------------

% State shared with est_sap (only initialised and used for option 4).
global zetak zeta_fr_old z_peak

if nargin<3
    fprintf('Usage: logmmse_SPU(infile.wav,outfile.wav,option) \n');
    fprintf('where option = \n');
    fprintf(' 1  - hard decision ( Soon et al)\n');
    fprintf(' 2  - soft decision (Soon et al.)\n');
    fprintf(' 3  - Malah et al.(1999) \n');
    fprintf(' 4  - Cohen (2002) \n');
    return;
end

if option<1 | option>4 | rem(option,1)~=0
    % error() with a single argument does not expand escape sequences, so
    % the message carries no trailing '\n'.
    error('ERROR!  option needs to be an integer between 1 and 4.');
end

% wavread/wavwrite are no longer shipped with recent MATLAB releases, so use
% audioread/audiowrite when available and fall back to the legacy functions.
if exist('audioread','file')
    [x, Srate] = audioread(filename);
else
    [x, Srate] = wavread(filename);
end

% =============== Initialize variables ===============

len=floor(20*Srate/1000);       % Frame size in samples (20 ms)
if rem(len,2)==1, len=len+1; end
PERC=50;                        % window overlap in percent of frame size
len1=floor(len*PERC/100);       % overlap length
len2=len-len1;                  % frame advance

win=hamming(len);               % define window

% Noise magnitude calculations - assuming that the first 6 frames is
% noise/silence
nFFT=len;
noise_mean=zeros(nFFT,1);
j=1;
for m=1:6
    noise_mean=noise_mean+abs(fft(win.*x(j:j+len-1),nFFT));
    j=j+len;
end
noise_mu=noise_mean/6;
noise_mu2=noise_mu.^2;          % initial noise power spectrum

%--- allocate memory and initialize various variables

aa=0.98;              % weight of the previous-frame term in the a priori SNR
mu=0.98;              % smoothing factor of the noise spectrum update
eta=0.15;             % VAD threshold
img=sqrt(-1);
x_old=zeros(len1,1);
Nframes=floor(length(x)/len2)-floor(len/len2);
xfinal=zeros(Nframes*len2,1);

if option==4   % Cohen's method: reset the state shared with est_sap
    len2a=len/2+1;
    zetak=zeros(len2a,1);
    zeta_fr_old=1000;
    z_peak=0;
end

%=============================== Start Processing ========================

qk=0.5*ones(len,1);     % a priori speech-absence probability per bin
ksi_old=zeros(len,1);
ksi_min=10^(-25/10);    % a priori SNR floor (-25 dB)

Gmin=10^(-20/10);       % needed for Cohen's implementation
k=1;                    % first sample of the current frame

for n=1:Nframes

    insign=win.*x(k:k+len-1);

    %--- Take fourier transform of frame
    spec=fft(insign,nFFT);
    sig=abs(spec);              % compute the magnitude
    sig2=sig.^2;

    gammak=min(sig2./noise_mu2,40);  % post SNR
    if n==1
        ksi=aa+(1-aa)*max(gammak-1,0);
    else
        ksi=aa*Xk_prev./noise_mu2 + (1-aa)*max(gammak-1,0);   % a priori SNR
        ksi=max(ksi_min,ksi);   % limit ksi to -25 dB
    end

    % ===start of vad===
    log_sigma_k= gammak.* ksi./ (1+ ksi)- log(1+ ksi);
    vad_decision= sum( log_sigma_k)/ len;
    if (vad_decision< eta)
        % noise only frame found
        noise_mu2= mu* noise_mu2+ (1- mu)* sig2;
    end
    % ===end of vad===

    A=ksi./(1+ksi);
    vk=A.*gammak;
    ei_vk=0.5*expint(vk);
    hw=A.*exp(ei_vk);           % logMMSE gain

    % --- estimate conditional speech-presence probability ---------------
    [qk]=est_sap(qk,ksi,ksi_old,gammak,option);   % estimate P(Ho)- a priori speech absence prob.
    pSAP = (1-qk)./(1-qk+qk.*(1+ksi).*exp(-vk));  % P(H1 | Yk)

    % ---- Cohen's 2002 ------
    Gmin2=Gmin.^(1-pSAP);       % Cohen's (2002) - Eq 8
    Gcohen=(hw.^pSAP).*Gmin2;
    sig = sig.*Gcohen;
    %----------------------------

    Xk_prev=sig.^2;             % kept for the next frame's a priori SNR
    ksi_old=ksi;                % needed for Cohen's method for estimating q

    % recombine the modified magnitude with the noisy phase
    xi_w= ifft( sig .* exp(img*angle(spec)));
    xi_w= real( xi_w);

    % --------- Overlap and add ---------------
    xfinal(k:k+ len2-1)= x_old+ xi_w(1:len1);
    x_old= xi_w(len1+ 1: len);

    k=k+len2;
end
%==========================================================================

if exist('audiowrite','file')
    audiowrite(outfile, xfinal, Srate, 'BitsPerSample', 16);
else
    wavwrite(xfinal,Srate,16,outfile);
end
|
||||
|
||||
%--------------------------- E N D -----------------------------------------
|
||||
|
||||
|
||||
function [qk]=est_sap(qk,xsi,xsi_old,gammak,type)
% EST_SAP  Updates the a priori probability of speech absence, P(Ho).
%
%   qk      - previous estimate of P(Ho), one value per frequency bin
%   xsi     - a priori SNR of the current frame
%   xsi_old - a priori SNR of the previous frame (used by type 4 only)
%   gammak  - a posteriori SNR of the current frame
%   type    - 1: hard decision (Soon et al.), 2: soft decision (Soon et al.),
%             3: Malah et al. (1999), 4: Cohen (2002)
%
% Type 4 keeps its running state in the globals zetak, zeta_fr_old and
% z_peak. For any other value of type, qk is returned unchanged.

global zetak zeta_fr_old z_peak

if type ==1  % hard-decision: Soon et al.
    smooth=0.1;
    ratio=exp(-xsi).*besseli(0,2*(gammak.*xsi).^0.5);

    % bins are flagged speech-absent (1) unless the ratio exceeds 1
    absent=ones(length(xsi),1);
    absent(ratio>1)=0;

    qk=smooth*absent + (1-smooth)*qk;

elseif type==2  % soft-decision: Soon et al.
    smooth=0.1;
    ratio=exp(-xsi).*besseli(0,2*(gammak.*xsi).^0.5);

    P_Ho=min(1,1./(1+ratio));

    qk=smooth*P_Ho + (1-smooth)*qk;

elseif type==3  % Malah et al. (1999)

    % VAD detector: update only when the mean a posteriori SNR over the
    % first half of the bins is high enough
    if mean(gammak(1:floor(length(gammak)/2)))> 2.4
        smooth=0.95;
        gamma_th=0.8;

        absent=ones(length(xsi),1);
        absent(gammak>gamma_th)=0;

        qk=smooth*qk+(1-smooth)*absent;
    end

elseif type==4  % Cohen (2002)
    smooth=0.7;
    n_full=length(qk);
    n_half=n_full/2+1;

    % recursively averaged a priori SNR of the previous frame
    zetak=smooth*zetak+(1-smooth)*xsi_old(1:n_half);

    z_min=0.1; z_max=0.3162;
    C=log10(z_max/z_min);
    zp_min=1; zp_max=10;
    zeta_local=smoothing(zetak,1);
    zeta_global=smoothing(zetak,15);

    % estimate P_local: 0 below z_min, 1 above z_max, log ramp in between
    Plocal=zeros(n_half,1);
    Plocal(zeta_local>z_max)=1;
    mid=zeta_local>z_min & zeta_local<z_max;
    Plocal(mid)=log10(zeta_local(mid)/z_min)/C;

    % estimate P_global with the same mapping on the wider smoothing
    Pglob=zeros(n_half,1);
    Pglob(zeta_global>z_max)=1;
    mid=zeta_global>z_min & zeta_global<z_max;
    Pglob(mid)=log10(zeta_global(mid)/z_min)/C;

    % estimate Pframe from the frame average of zetak
    zeta_fr=mean(zetak);
    Pframe=0;
    if zeta_fr>z_min
        if zeta_fr>zeta_fr_old
            Pframe=1;
            z_peak=min(max(zeta_fr,zp_min),zp_max);
        elseif zeta_fr <=z_peak*z_min
            Pframe=0;
        elseif zeta_fr>= z_peak*z_max
            Pframe=1;
        else
            Pframe=log10(zeta_fr/z_peak/z_min)/C;
        end
    end
    zeta_fr_old=zeta_fr;

    % estimate prob of speech absence, capped at 0.95, then mirror the
    % half spectrum to the full length
    half_q = min(0.95, 1- Plocal.*Pglob*Pframe);
    qk = [half_q; flipud(half_q(2:n_half-1))];

end
|
||||
|
||||
%----------------------------------------------
|
||||
function y=smoothing (x,N);
% SMOOTHING  Smooths the column vector x with a (2N+1)-point Hanning window.
%
% The window is applied in two halves: the first N+1 taps filter x itself,
% the remaining N taps filter x advanced by N samples (zero-filled at the
% end). The sum is divided by the 2-norm of the full window.

n_samples=length(x);
taps=hanning(2*N+1);
head_taps=taps(1:N+1);
tail_taps=taps(N+2:2*N+1);

% part obtained from the current and past samples
past_part=filter(flipud(head_taps),[1],x);

% x advanced by N samples, zeros shifted in at the end
advanced=zeros(n_samples,1);
advanced(1:n_samples-N)=x(N+1:n_samples);

% part obtained from the following samples
future_part=filter(flipud(tail_taps),[1],advanced);

y=(past_part+future_part)/norm(taps,2);
|
||||
|
@ -0,0 +1,96 @@
|
||||
% Author: Patrick J. Wolfe
|
||||
% Signal Processing Group
|
||||
% Cambridge University Engineering Department
|
||||
% p.wolfe@ieee.org
|
||||
% Johnston perceptual model initialisation
|
||||
function M= mask( Sx, dft_length, Fs, nbits)
% MASK  Masking threshold of a power spectrum (Johnston perceptual model).
%
%   M = mask(Sx, dft_length, Fs, nbits)
%
%   Sx         - power spectrum, one column per frame, with one row for
%                each frequency in 0:Fs/dft_length:Fs/2
%   dft_length - DFT length that defines the bin spacing
%   Fs         - sampling frequency in Hz
%   nbits      - bit depth, sets the absolute-threshold scaling
%
%   M          - masking threshold on the same linear frequency grid as Sx

freq   = (0:Fs/dft_length:Fs/2)';
thresh = (1/(2^nbits-1))^2/dft_length;   % base level of the absolute threshold

crit_band_ends = [0;100;200;300;400;510;630;770;920;1080;1270;...
        1480;1720;2000;2320;2700;3150;3700;4400;5300;6400;7700;...
        9500;12000;15500;Inf];

% Maximum Bark frequency
imax = max(find(crit_band_ends < freq(end)));

% Normalised (to 0 dB) threshold of hearing values (Fletcher, 1929)
% as used by Johnston.  First and last thresholds are corresponding
% critical band endpoint values, elsewhere means of interpolated
% critical band endpoint threshold values are used.
abs_thr = 10.^([38;31;22;18.5;15.5;13;11;9.5;8.75;7.25;4.75;2.75;...
        1.5;0.5;0;0;0;0;2;7;12;15.5;18;24;29]./10);
ABSOLUTE_THRESH = thresh.*abs_thr(1:imax);

% Calculation of tone-masking-noise offset ratio in dB
OFFSET_RATIO_DB = 9+ (1:imax)';

% Initialisation of matrices for bark/linear frequency conversion
% (loop increments i to the proper critical band)
num_bins = length(freq);
LIN_TO_BARK = zeros(imax,num_bins);
i = 1;
for j = 1:num_bins
    while ~((freq(j) >= crit_band_ends(i)) & ...
            (freq(j) < crit_band_ends(i+1))),
        i = i+1;
    end
    LIN_TO_BARK(i,j) = 1;
end

% Calculation of spreading function (Schroeder et al., 82)
spreading_fcn = zeros(imax);
summ = 0.474:imax;
spread = 10.^((15.81+7.5.*summ-17.5.*sqrt(1+summ.^2))./10);
for i = 1:imax
    for j = 1:imax
        spreading_fcn(i,j) = spread(abs(j-i)+1);
    end
end

% Calculation of excitation pattern function
EX_PAT = spreading_fcn* LIN_TO_BARK;

% Calculation of DC gain due to spreading function
DC_GAIN = spreading_fcn* ones(imax,1);

% Spread critical-band spectrum
C = EX_PAT* Sx;

% Calculation of spectral flatness measure SFM_dB.
% The geometric mean is evaluated in the log domain: prod(Sx) over all bins
% underflows to 0 (or overflows to Inf) for ordinary frame sizes, which would
% force the tonality coefficient below to a wrong value.
[num_bins, num_frames] = size(Sx);
k = 1/num_bins;
geo_mean = exp(k.*sum(log(Sx),1));
SFM_dB = 10.*log10(geo_mean./(k.*sum(Sx,1))+ eps);

% Calculation of tonality coefficient and masked threshold offset
alpha = min(1,SFM_dB./-60);
O_dB = OFFSET_RATIO_DB(:,ones(1,num_frames)).*...
    alpha(ones(length(OFFSET_RATIO_DB),1),:) + 5.5;

% Threshold calculation and renormalisation, accounting for absolute
% thresholds
T = C./10.^(O_dB./10);
T = T./DC_GAIN(:,ones(1,num_frames));
T = max( T, ABSOLUTE_THRESH(:, ones(1, num_frames)));

% Reconversion to linear frequency scale
M= LIN_TO_BARK'* T;
|
@ -0,0 +1,150 @@
|
||||
function mmse(filename,outfile,SPU)
%MMSE  Speech enhancement with the MMSE algorithm [1].
%
%  Usage:  mmse(noisyFile, outputFile, SPU)
%
%         filename - noisy speech file in .wav format
%         outfile  - enhanced output file in .wav format (written as 16 bit)
%         SPU      - if 1, includes speech-presence uncertainty
%                    if 0, doesnt include speech-presence uncertainty
%
%  Example call:  mmse('sp04_babble_sn10.wav','out_mmse.wav',1);
%
%  References:
%   [1] Ephraim, Y. and Malah, D. (1985). Speech enhancement using a minimum
%       mean-square error log-spectral amplitude estimator. IEEE Trans. Acoust.,
%       Speech, Signal Process., ASSP-33(2), 443-445.
%   NOTE(review): the gain below is built from Bessel functions, not from
%   the exponential integral used in logmmse, so [1] is presumably meant to
%   be the authors' 1984 short-time spectral amplitude paper - confirm.
%
% Authors: Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
%-------------------------------------------------------------------------

if nargin<3
    fprintf('Usage: mmse(infile.wav,outfile.wav,SPU) \n');
    fprintf('where SPU=1 - includes speech presence uncertainty\n');
    fprintf('      SPU=0 - does not includes speech presence uncertainty\n\n');
    return;
end

if SPU~=1 & SPU~=0
    error('ERROR: SPU needs to be either 1 or 0.');
end

% wavread/wavwrite are no longer shipped with recent MATLAB releases, so use
% audioread/audiowrite when available and fall back to the legacy functions.
if exist('audioread','file')
    [x, Srate] = audioread(filename);
else
    [x, Srate] = wavread(filename);
end

% =============== Initialize variables ===============

len=floor(20*Srate/1000);       % Frame size in samples (20 ms)
if rem(len,2)==1, len=len+1; end
PERC=50;                        % window overlap in percent of frame size
len1=floor(len*PERC/100);       % overlap length
len2=len-len1;                  % frame advance

win=hamming(len);               % define window

% Noise magnitude calculations - assuming that the first 6 frames is noise/silence
nFFT=2*len;
j=1;
noise_mean=zeros(nFFT,1);
for m=1:6
    noise_mean=noise_mean+abs(fft(win.*x(j:j+len-1),nFFT));
    j=j+len;
end
noise_mu=noise_mean/6;
noise_mu2=noise_mu.^2;          % initial noise power spectrum

%--- allocate memory and initialize various variables

img=sqrt(-1);
x_old=zeros(len1,1);
Nframes=floor(length(x)/len2)-1;
xfinal=zeros(Nframes*len2,1);

% --------------- Initialize parameters ------------
k=1;                  % first sample of the current frame
aa=0.98;              % weight of the previous-frame term in the a priori SNR
eta= 0.15;            % VAD threshold
mu=0.98;              % smoothing factor of the noise spectrum update
c=sqrt(pi)/2;
qk=0.3;               % fixed a priori probability used when SPU==1
qkr=(1-qk)/qk;
ksi_min=10^(-25/10);  % a priori SNR floor (-25 dB)

%=============================== Start Processing ========================
for n=1:Nframes

    insign=win.*x(k:k+len-1);

    %--- Take fourier transform of frame
    spec=fft(insign,nFFT);
    sig=abs(spec);              % compute the magnitude
    sig2=sig.^2;

    gammak=min(sig2./noise_mu2,40);  % posteriori SNR
    if n==1
        ksi=aa+(1-aa)*max(gammak-1,0);
    else
        % decision-direct estimate of a priori SNR
        ksi=aa*Xk_prev./noise_mu2 + (1-aa)*max(gammak-1,0);
        ksi=max(ksi_min,ksi);   % limit ksi to -25 dB
    end

    % ===start of vad===
    log_sigma_k= gammak.* ksi./ (1+ ksi)- log(1+ ksi);
    vad_decision= sum( log_sigma_k)/ len;
    if (vad_decision< eta)      % noise only frame found
        noise_mu2= mu* noise_mu2+ (1- mu)* sig2;
    end
    % ===end of vad===

    vk=ksi.*gammak./(1+ksi);
    [j0,err]=besseli(0,vk/2);
    [j1,err2]=besseli(1,vk/2);
    if any(err) | any(err2)
        % hw is left at its value from the previous frame in this case
        fprintf('ERROR! Overflow in Bessel calculation in frame: %d \n',n);
    else
        C=exp(-0.5*vk);
        A=((c*(vk.^0.5)).*C)./gammak;
        B=(1+vk).*j0+vk.*j1;
        hw=A.*B;                % MMSE spectral gain
    end

    % --- estimate speech presence probability
    if SPU==1
        evk=exp(vk);
        Lambda=qkr*evk./(1+ksi);
        pSAP=Lambda./(1+Lambda);
        sig=sig.*hw.*pSAP;
    else
        sig=sig.*hw;
    end

    Xk_prev=sig.^2;  % save for estimation of a priori SNR in next frame

    % recombine the modified magnitude with the noisy phase
    xi_w= ifft( sig .* exp(img*angle(spec)),nFFT);
    xi_w= real( xi_w);

    % overlap-add: emit the first half, keep the second half for next frame
    xfinal(k:k+ len2-1)= x_old+ xi_w(1:len1);
    x_old= xi_w(len1+ 1: len);

    k=k+len2;

end
%==========================================================================

if exist('audiowrite','file')
    audiowrite(outfile, xfinal, Srate, 'BitsPerSample', 16);
else
    wavwrite(xfinal,Srate,16,outfile);
end
|
||||
|
@ -0,0 +1,696 @@
|
||||
function outfile= mt_mask( noisy_file, outfile)
%MT_MASK  Psychoacoustically motivated speech enhancement [1].
%
%  Usage:  mt_mask(noisyFile, outputFile)
%
%         noisy_file - noisy speech file in .wav format
%         outfile    - enhanced output file in .wav format (written as
%                      16 bit); the same name is returned as the output
%
%  Example call:  mt_mask('sp04_babble_sn10.wav','out_mask.wav');
%
%  References:
%   [1] Hu, Y. and Loizou, P. (2004). Incorporating a psychoacoustical model in
%       frequency domain speech enhancement. IEEE Signal Processing Letters, 11(2),
%       270-273.
%
% Authors: Yi Hu and Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
%-------------------------------------------------------------------------

if nargin<2
    fprintf('Usage: mt_mask(noisyfile.wav,outFile.wav) \n\n');
    return;
end

% Initialize wavelet parameters (see also wiener_wt.m)
wavname='db4';
thre_type='ds';thre_func_type='s';q_0=5;
taper_num=16;

%------------------get the noisy speech data
% wavread/wavwrite are no longer shipped with recent MATLAB releases, so use
% audioread/audiowrite when available and fall back to the legacy functions.
if exist('audioread','file')
    [noisy_speech, Srate] = audioread( noisy_file);
else
    [noisy_speech, Srate] = wavread( noisy_file);
end

%===========initiate the parameters=======================
frame_dur= 20; %unit is milli-second
len= floor( Srate* frame_dur/ 1000);
if rem( len, 2)~= 0
    len= len+ 1;
end
NFFT= len; %number of FFT points
tapers= sine_taper( taper_num, NFFT);
diga= digamma( taper_num)- log( taper_num);

win= hamming( len);
PERC= 50; % window overlap in percent of frame size
len1=floor(len* PERC/ 100);   % overlap length
len2= len- len1;              % frame advance
L120= floor( 120* Srate/ 1000);   % samples in the first 120 ms
bfl=0.002; % spectral floor

k= 1; %k is starting point of each frame

%================================================

q= ceil( log2( len));
M= 2^ q;

sigma_eta_square= trigamma( taper_num);
N_autoc= sigma_eta_square* ( 1- ( 0: taper_num+ 1)/ ( taper_num+ 1));
N_autoc( M/ 2+ 1)= 0;   % extends N_autoc with zeros up to index M/2+1
Sigma_N_firstrow= [N_autoc( 1: M/ 2+ 1), fliplr( N_autoc( 2: M/ 2))];
noise_stat= real( fft( Sigma_N_firstrow));

%------get the wavelet/scaling filter for decomposition/reconstruction
[wfilter( 1, :), wfilter( 2, :), wfilter( 3, :), wfilter( 4, :)]= ...
    wfilters( wavname);

% initial noise power spectrum from the first 120 ms of the file
noise= noisy_speech( 1: L120);
noise_ps= psd_mt_sine( noise, tapers);
log_noise_ps= log( noise_ps)- diga;
den_log_noise_ps= thre_wavelet( log_noise_ps, noise_stat, thre_type, ...
    thre_func_type, wfilter, q_0);
den_log_noise_ps= [den_log_noise_ps( 1: len/ 2+ 1); ...
    flipud( den_log_noise_ps( 2: len/ 2))];
noise_ps= exp( den_log_noise_ps);
%=================

mu_vad= 0.98; % smoothing factor in noise spectrum update
aa= 0.98; % smoothing factor in priori update
eta= 0.15; % VAD threshold

%=================

Nframes= floor( length( noisy_speech)/ len2)- 1;
x_old= zeros( len1, 1);
xfinal= zeros( Nframes* len2, 1);

%===============================  Start Processing ==========

for n= 1: Nframes

    insign= noisy_speech( k: k+ len- 1);
    insign_spec= fft( insign.* win, NFFT);

    %========estimate the noisy speech power spectrum
    ns_ps= psd_mt_sine( insign, tapers);

    log_ns_ps= log( ns_ps)- diga;
    den_log_ns_ps= thre_wavelet( log_ns_ps, noise_stat, thre_type, ...
        thre_func_type, wfilter, q_0);
    den_log_ns_ps= [den_log_ns_ps( 1: NFFT/ 2+ 1); ...
        flipud( den_log_ns_ps( 2: NFFT/ 2))];
    ns_ps= exp( den_log_ns_ps);
    %=================================================

    % ===start of vad===
    gammak= abs( insign_spec).^ 2/ (norm( win)^2)./ noise_ps;
    if n==1
        ksi=aa+(1-aa)*max(gammak-1,0);
    else
        % decision-direct estimate of a priori SNR
        ksi=aa*Xk_prev./noise_ps + (1-aa)*max(gammak-1,0);
    end

    log_sigma_k= gammak.* ksi./ (1+ ksi)- log(1+ ksi);
    vad_decision= sum( log_sigma_k)/ len;
    if (vad_decision< eta)
        % noise only frame found
        noise_ps= mu_vad* noise_ps+ (1- mu_vad)* ns_ps;
    end
    % ===end of vad===

    %========estimate the clean speech power spectrum
    cl_ps= ns_ps- noise_ps;
    cl_ps= max( cl_ps, bfl* ns_ps);
    %--providing a spectral floor
    %========

    %compute the masking threshold
    mask_thre= mask( cl_ps( 1: NFFT/ 2+ 1), NFFT, Srate, 16);
    mask_thre= [mask_thre; flipud( mask_thre( 2: NFFT/ 2))];
    %expand it to NFFT length

    noise_mask_ratio= noise_ps./ mask_thre;

    % gain: 1 where the noise is at or below the masking threshold,
    % 1/sqrt(noise/threshold) where it exceeds it
    tmp= max( sqrt( noise_mask_ratio)-1, 0);
    g_wi= 1./ (1+ tmp);

    xi_freq= g_wi.* insign_spec;
    Xk_prev= abs( xi_freq).^ 2;   % kept for the next frame's a priori SNR

    xi_w= ifft( xi_freq);
    xi_w= real( xi_w);

    % overlap-add: emit the first half, keep the second half for next frame
    xfinal( k: k+ len2- 1)= x_old+ xi_w( 1: len1);
    x_old= xi_w( len1+ 1: len);
    k= k+ len2;

end
%==========================================================================

if exist('audiowrite','file')
    audiowrite( outfile, xfinal, Srate, 'BitsPerSample', 16);
else
    wavwrite( xfinal, Srate, 16, outfile);
end
|
||||
|
||||
|
||||
%========================================================================================
|
||||
|
||||
function after_thre= thre_wavelet( before_thre, noise_stat, ...
    thre_type, thre_func_type, wfilter, q_0)
% THRE_WAVELET  Denoise a vector by thresholding its wavelet coefficients.
%
%   The data are decomposed into q_0 levels with wavedec, the detail
%   coefficients are thresholded, and the result is rebuilt with waverec.
%   Refer to the papers by Walden/1998, Donoho/1995, Johnstone/1997.
%
%   Inputs
%     before_thre    : data before thresholding (row or column vector)
%     noise_stat     : the power spectrum of the noise (i.e., noise
%                      statistics), DFT of the first row of Sigma_N, refer to
%                      Eq. (8) in Walden's paper.  For thre_type 'd' it is
%                      multiplied element-wise with a length-M frequency
%                      response, M= 2^ceil(log2(length(before_thre))), so it
%                      must have M elements in that mode.
%     thre_type      : threshold type
%                        'i'  scale-independent Universal
%                        'd'  scale-dependent Universal
%                        'ds' scale-dependent SURE ('heursure' via thselect)
%                        'dn' scale-dependent, SNR-interpolated between the
%                             riskfunc minimiser and the universal threshold
%                        'dg' scale-dependent Generalized Cross-Validation
%     thre_func_type : threshold function type: soft ('s') or hard ('h'),
%                      passed on to wthresh / mingcv
%     wfilter        : 4-row matrix of wavelet filters; the 1st row is lo_d,
%                      the 2nd row is hi_d, the 3rd row is lo_r, and the 4th
%                      row is hi_r
%     q_0            : decomposition level
%
%   Output
%     after_thre     : data after thresholding, same orientation as the input
%
%   Raises error 'wrong thresholding type' for any other thre_type.

% Validate first, before any expensive work.  strcmp is used because '=='
% between character vectors of different lengths raises a dimension error
% instead of reaching the intended error message.
if ~any( strcmp( thre_type, {'i', 'd', 'ds', 'dn', 'dg'}))
    error( 'wrong thresholding type');
end

s= size( before_thre);
before_thre= before_thre( :)';  %make it a row vector
noise_stat= noise_stat( :)';

N= length( before_thre);        %length of before-thresholded data
q= ceil( log2( N));
M= 2^ q;

%==get the low pass and high pass decomposition/reconstruction filters from wfilter
lo_d= wfilter( 1, :);   %low pass decomposition filter/ scaling filter
hi_d= wfilter( 2, :);   %high pass decomposition filter/ wavelet filter
lo_r= wfilter( 3, :);   %low pass reconstruction filter/ scaling filter
hi_r= wfilter( 4, :);   %high pass reconstruction filter/ wavelet filter

%==refer to pp. 3155 in Walden's paper
% H( j, :) is the equivalent frequency response of the level-j wavelet filter
H= zeros( q_0, M);
G= zeros( max( q_0- 1, 1), M);
H( 1, :)= fft( hi_d, M);    %frequency response of wavelet filter
G( 1, :)= fft( lo_d, M);    %frequency response of scaling filter
for i= 2: q_0- 1
    G( i, :)= G( 1, rem( (2^ (i- 1) )* (0: M- 1), M)+ 1);
end

for j= 2: q_0
    H( j, :)= prod( [G( 1: j- 1, :); H( 1, rem( (2^ (j- 1) )* (0: M- 1), M)+ 1)], 1);
end

% --decompose before_thre into q_0 levels using wavelet filter hi_d and scaling filter lo_d
% --y_coeff contains the coefficients and len_info contains the length information;
% --len_info( 1) is the approximation length and len_info( i), i= 2: q_0+ 1,
% --are the detail lengths from the coarsest level (q_0) to the finest (1)
[y_coeff, len_info]= wavedec( before_thre, q_0, lo_d, hi_d);

%===============processing according to 'thre_type'
switch thre_type

    case 'i'    %scale-independent universal thresholding
        sigma_square= mean( noise_stat);    %sigma_eta_square in Eq. (6)
        thre= sqrt( sigma_square* 2* log( M));
        y_coeff( len_info( 1)+ 1: end)= ...
            wthresh( y_coeff( len_info( 1)+ 1: end), thre_func_type, thre);

    case 'd'    %scale-dependent universal thresholding
        %------first compute the noise energy of each scale, Eq. (9) in Walden's paper
        sigma_j_square= zeros( 1, q_0);
        for i= 1: q_0
            sigma_j_square( i)= mean( noise_stat.* (abs( H( i, :)).^ 2), 2);
        end

        for i= 2: q_0+ 1    %thresholding for each scale
            sp= sum( len_info( 1: i- 1), 2)+ 1;     %starting point
            ep= sp+ len_info( i)- 1;                %ending point
            % segment i holds level q_0- i+ 2
            thre= sqrt( sigma_j_square( q_0- i+ 2)* 2* log( len_info( i)));
            y_coeff( sp: ep)= wthresh( y_coeff( sp: ep), thre_func_type, thre);
        end

    case 'ds'   %scale-dependent SURE thresholding
        %======per-level noise estimate from the coefficients themselves
        %======(the original author found it better than Eq. (9))
        sigma_j= wnoisest( y_coeff, len_info, 1: q_0);

        for i= 2: q_0+ 1    %thresholding for each scale
            sp= sum( len_info( 1: i- 1), 2)+ 1;     %starting point
            ep= sp+ len_info( i)- 1;                %ending point
            sig= sigma_j( q_0- i+ 2);
            if sig< sqrt( eps)* max( y_coeff( sp: ep))
                thre= 0;    %noise level negligible: leave the scale untouched
            else
                thre= sig* thselect( y_coeff( sp: ep)/ sig, 'heursure');
            end
            y_coeff( sp: ep)= wthresh( y_coeff( sp: ep), thre_func_type, thre);
        end

    case 'dn'   %new risk function defined in Xiao-ping Zhang's paper
        sigma_j= wnoisest( y_coeff, len_info, 1: q_0);
        sigma_j_square= sigma_j.^ 2;

        for i= 2: q_0+ 1    %thresholding for each scale
            sp= sum( len_info( 1: i- 1), 2)+ 1;     %starting point
            ep= sp+ len_info( i)- 1;                %ending point
            sig= sigma_j( q_0- i+ 2);
            if sig< sqrt( eps)* max( y_coeff( sp: ep))
                thre= 0;
            else
                % The threshold varies with the estimated SNR of the scale:
                %  - very low SNR (< -5 dB): signal unlikely, use the
                %    universal (oversmoothing) threshold thre_max
                %  - very high SNR (>= 20 dB): signal likely, use the
                %    risk-minimising threshold thre_min
                %  - in between: linear interpolation in dB
                thre_max= sig* sqrt( 2* log( len_info( i)));
                thre_min= sig* fminbnd( @riskfunc, 0, sqrt( 2* log( ep- sp+ 1)), ...
                    optimset( 'MaxFunEvals', 1000, 'MaxIter', 1000), ...
                    y_coeff( sp: ep)/ sig, 3);
                slope= (thre_max- thre_min)/ 25;
                thre_0= thre_min+ 20* slope;

                SNRlog= 10* log10( mean( max( y_coeff( sp: ep).^ 2/ ...
                    sigma_j_square( q_0- i+ 2)- 1, 0)));
                if SNRlog>= 20
                    thre= thre_min;
                elseif ( SNRlog< 20) && ( SNRlog>= -5)
                    thre= thre_0- SNRlog* slope;
                else
                    thre= thre_max;
                end
            end
            y_coeff( sp: ep)= wthresh( y_coeff( sp: ep), thre_func_type, thre);
        end

    case 'dg'   %scale-dependent Generalized Cross Validation thresholding
        for i= 2: q_0+ 1    %thresholding for each scale
            sp= sum( len_info( 1: i- 1), 2)+ 1;     %starting point
            ep= sp+ len_info( i)- 1;                %ending point
            [y_coeff( sp: ep), thre]= mingcv( y_coeff( sp: ep), thre_func_type);
        end

end

%--reconstruct the thresholded coefficients
after_thre= waverec( y_coeff, len_info, lo_r, hi_r);

if s(1)>1   %input was a column vector: return a column
    after_thre= after_thre';
end
|
||||
%fprintf( 1, 'thre is %f\n', thre);
|
||||
|
||||
|
||||
|
||||
function mt_psd= psd_mt_sine( data, sine_tapers)
% PSD_MT_SINE  Multitaper power spectrum estimate using sine tapers.
%
%   'data' is the incoming signal; 'sine_tapers' is a matrix with each
%   column being one sine taper (it can be obtained from sine_taper).
%   Every length-frame_len segment of 'data' (sliding by one sample) is
%   multiplied by each taper, the squared FFT magnitudes are averaged over
%   the tapers, and those per-segment spectra are averaged over all
%   segments.  The result is a frame_len-by-1 column.

[frame_len, taper_num]= size( sine_tapers);

samples= data( :);
n_samples= length( samples);
n_segments= n_samples- frame_len+ 1;

% column pp of 'segments' is samples( pp: pp+ frame_len- 1)
segments= hankel( samples( 1: frame_len), samples( frame_len: n_samples));

segment_psd= zeros( frame_len, n_segments);
for pp= 1: n_segments
    % apply all tapers to this segment at once, one taper per column
    tapered= bsxfun( @times, sine_tapers, segments( :, pp));
    eigen_spectra= abs( fft( tapered, [], 1)).^ 2;
    segment_psd( :, pp)= mean( eigen_spectra, 2);
end

mt_psd= mean( segment_psd, 2);
|
||||
|
||||
|
||||
|
||||
function tapers= sine_taper( L, N)
% SINE_TAPER  Generate the sine tapers proposed by Riedel et al. in
%   IEEE Transactions on Signal Processing, pp. 188- 195, Jan. 1995.
%
%   'L' is the number of sine tapers generated and 'N' is the length of
%   each taper.  The returned 'tapers' is an N-by-L matrix whose column k
%   is sqrt( 2/ (N+ 1))* sin( pi* k* (1: N)'/ (N+ 1)).

gain= sqrt( 2/ (N+ 1));

% element (n, k) is (pi* k)* n, built for all tapers in one step
angles= bsxfun( @times, pi* (1: L), (1: N)')/ (N+ 1);

tapers= gain* sin( angles);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
function y = trigamma(z,method,debug)
%
% y = trigamma(z) ... Trigamma-Function for real argument z
%
% trigamma(z) = (d/dz)^2 log(gamma(z)) = d/dz digamma(z)
%
% If 'z' is an array (of any dimension), the trigamma-function is
% evaluated for each element and the result has the same size as 'z'.
% Results are inaccurate for real arguments < 10 which are neither
% integers nor half-integers.
%
% y = trigamma(z,method)
%
% possible values for optional argument 'method':
%  method = 1 : quick asymptotic series expansion (approximate)
%  method = 2 : finite recursion for integer values (exact)
%  method = 3 : finite recursion for half-integer values (exact)
%  method = 4 (default) : automatic selection of 1,2 or 3 for individual
%               elements in z whichever is appropriate.
%
% y = trigamma(z,method,debug) with debug == 1 prints the method and the
% argument range of every (recursive) call.
%
% With method 4: zero and negative integers give Inf, NaN gives NaN.
%
% see also:   digamma, gamma, gammaln, gammainc, specfun

% reference: Abramowitz & Stegun, "Handbook of Mathematical Functions"
%            Chapter "Gamma Function and Related Functions" :
% implemented by: Christoph Mecklenbraeuker
%            (email: cfm@sth.ruhr-uni-bochum.de), July 4, 1995.

dim = size(z);                  % save original array dimensions
z = z(:);                       % make a column vector (works for N-D input)

if(nargin==1)
    method=4; debug=0;
elseif(nargin==2)
    debug=0;
end

if(debug == 1)                  % if debug==1: track recursion
    [m,n] = size(z);
    fprintf(1,'trigamma: method = %d, size(z)=[%d %d],\t min(z)=%f, max(z)=%f\n',...
        method,m,n,min(min(z)),max(max(z)));
end

if(method==1)                   % use 9th order asymptotic expansion
    if(any(z<1))
        fprintf(1,'Warning: some elements in argument of "trigamma(z,1)" are < 1\n');
        fprintf(1,'minimal argument = %g: trigamma-result is inaccurate!\n',min(min(z)));
    end

    % powers of 1/z :
    w1 = 1./z; w2 = w1.*w1; w3 = w1.*w2; w5 = w2.*w3; w7 = w2.*w5; w9 = w2.*w7;
    % coefficients of 1/z, 1/z^2, 1/z^3, 1/z^5, 1/z^7, 1/z^9 :
    a = [ 1 1/2 1/6 -1/30 1/42 -1/30 ];
    % sum the weighted powers along each row (one row per element of z):
    y = sum(bsxfun(@times,[ w1 w2 w3 w5 w7 w9 ],a),2);

elseif(method==2)               % exact recursion psi'(n+1) = psi'(n) - 1/n^2
    zmax = max(max(floor(z)));
    ytab = zeros(zmax,1);
    ytab(1) = pi^2/6;           % = psi'(1)
    for n=1:zmax-1
        ytab(n+1) = ytab(n) - 1/n^2;        % generate lookup table
    end
    y = ytab(z);

elseif(method==3)               % exact recursion starting from psi'(1/2)
    zmax = max(max(floor(z)));
    ytab = zeros(zmax+1,1);
    ytab(1) = pi^2/2;           % = psi'(1/2)
    for n=1:zmax
        ytab(n+1) = ytab(n) - 4/(2*n-1)^2;  % generate lookup table
    end
    % round so that arguments accepted as half-integers within the 1e-7
    % tolerance of method 4 still give an integer table index
    y = ytab(round(z+0.5));

elseif(method==4)               % decide here which method to use
    y = NaN(size(z));           % elements in no category (NaN input) stay NaN

    fraction = rem(z,1);                        % fractional part of arguments
    isLess0  = (z<0);                           % evaluated by reflexion formula
    isLess1  = (z>0 & z<1);                     % values between 0 and 1
    isInt    = (fraction==0 & z>0);             % positive integers
    isPole   = (fraction==0 & z<=0);            % zero and negative integers
    isHalf   = (abs(fraction-0.5)<1e-7 & z>0);  % positive half-integers
    % every other argument > 1; defined as the complement of the exact
    % cases so that no value > 1 is left without a result
    isOther  = (z>1 & ~isInt & ~isHalf);

    if(any(isOther)) y(isOther) = trigamma(z(isOther),1,debug); end
    % shift into the range of the expansion: psi'(z) = psi'(z+2) + 1/z^2 + 1/(z+1)^2
    if(any(isLess1)) y(isLess1) = trigamma(z(isLess1)+2,1,debug) + ...
            1./z(isLess1).^2+1./(z(isLess1)+1).^2; end
    % reflexion formula:
    if(any(isLess0)) y(isLess0) = -trigamma(1-z(isLess0),1,debug)+(pi./sin(pi*z(isLess0))).^2; end
    % integers:
    if(any(isInt))   y(isInt)   = trigamma(z(isInt),2,debug); end
    % half-integers (overrides the approximate value computed above):
    if(any(isHalf))  y(isHalf)  = trigamma(z(isHalf),3,debug); end
    % poles at zero and the negative integers:
    if(any(isPole))  y(isPole)  = Inf; end

else
    error('trigamma: unknown method %d',method);
end

y = reshape(y,dim);
return;
|
||||
|
||||
|
||||
|
||||
|
||||
function psi = digamma(z,method,debug)
%
% psi = digamma(z)   ... Digamma-Function for real argument z.
%
% digamma(z) = d/dz log(gamma(z)) = gamma'(z)/gamma(z)
%
% If 'z' is an array (of any dimension), the digamma-function is evaluated
% for each element and the result has the same size as 'z'.  Results may
% be inaccurate for real arguments < 10 which are neither integers nor
% half-integers.
%
% psi = digamma(z,method)
%
% possible values for optional argument 'method':
%  method = 1 : quick asymptotic series expansion (approximate)
%  method = 2 : finite recursion for integer values (exact)
%  method = 3 : finite recursion for half-integer values (exact)
%  method = 4 (default) : automatic selection of 1,2 or 3 for individual
%               elements in z whichever is appropriate.
%
% psi = digamma(z,method,debug) with debug == 1 prints the method and the
% argument range of every (recursive) call.
%
% With method 4: zero and negative integers give Inf, NaN gives NaN.
%
% see also:   trigamma, gamma, gammaln, gammainc, specfun

% reference: Abramowitz & Stegun, "Handbook of Mathematical Functions"
%            Chapter "Gamma Function and Related Functions" :
% implemented by: Christoph Mecklenbraeuker
%            (email: cfm@sth.ruhr-uni-bochum.de), July 1, 1995.

dim = size(z);                  % save original array dimensions
z = z(:);                       % make a column vector (works for N-D input)

if(nargin==1)
    method=4; debug=0;
elseif(nargin==2)
    debug=0;
end

if(debug == 1)                  % if debug==1: track recursion
    [m,n] = size(z);
    fprintf(1,'digamma: method = %d, size(z)=[%d %d],\t min(z)=%f, max(z)=%f\n',...
        method,m,n,min(min(z)),max(max(z)));
end

if(method==1)                   % use 8th order asymptotic expansion
    if(any(z<1))
        fprintf(1,'Warning: some elements in argument of "digamma(z,1)" are < 1\n');
        fprintf(1,'minimal argument = %g: digamma-result is inaccurate!\n',min(min(z)));
    end

    % powers of 1/z :
    w1 = 1./z; w2 = w1.*w1; w4 = w2.*w2; w6 = w2.*w4; w8 = w4.*w4;
    % coefficients of 1/z, 1/z^2, 1/z^4, 1/z^6, 1/z^8 :
    a = [ -1/2 -1/12 1/120 -1/252 1/240 ];
    % sum the weighted powers along each row (one row per element of z):
    psi = log(z) + sum(bsxfun(@times,[ w1 w2 w4 w6 w8 ],a),2);

elseif(method==2)               % exact recursion psi(n+1) = psi(n) + 1/n
    zmax = max(max(floor(z)));
    psitab = zeros(zmax,1);
    psitab(1) = -0.5772156649015328606;     % = psi(1) = -(Euler's constant)
    for n=1:zmax-1
        psitab(n+1) = psitab(n) + 1/n;      % generate lookup table
    end
    psi = psitab(z);

elseif(method==3)               % exact recursion starting from psi(1/2)
    zmax = max(max(floor(z)));
    psitab = zeros(zmax+1,1);
    psitab(1) = -0.5772156649015328606 - 2*log(2);  % = psi(1/2)
    for n=1:zmax
        psitab(n+1) = psitab(n) + 2/(2*n-1);        % generate lookup table
    end
    % round so that arguments accepted as half-integers within the 1e-7
    % tolerance of method 4 still give an integer table index
    psi = psitab(round(z+0.5));

elseif(method==4)               % decide here which method to use
    psi = NaN(size(z));         % elements in no category (NaN input) stay NaN

    fraction = rem(z,1);                        % fractional part of arguments
    isLess0  = (z<0);                           % evaluated by reflexion formula
    isLess1  = (z>0 & z<1);                     % values between 0 and 1
    isInt    = (fraction==0 & z>0);             % positive integers
    isPole   = (fraction==0 & z<=0);            % zero and negative integers
    isHalf   = (abs(fraction-0.5)<1e-7 & z>0);  % positive half-integers
    % every other argument > 1; defined as the complement of the exact
    % cases so that no value > 1 is left without a result
    isOther  = (z>1 & ~isInt & ~isHalf);

    if(any(isOther)) psi(isOther) = digamma(z(isOther),1,debug); end
    % shift into the range of the expansion: psi(z) = psi(z+2) - 1/z - 1/(z+1)
    if(any(isLess1)) psi(isLess1) = digamma(z(isLess1)+2,1,debug) - ...
            1./z(isLess1)-1./(z(isLess1)+1); end
    % reflexion formula:
    if(any(isLess0)) psi(isLess0) = digamma(1-z(isLess0),1,debug) - pi./tan(pi*z(isLess0)); end
    % integers:
    if(any(isInt))   psi(isInt)   = digamma(z(isInt),2,debug); end
    % half-integers (overrides the approximate value computed above):
    if(any(isHalf))  psi(isHalf)  = digamma(z(isHalf),3,debug); end
    % poles at zero and the negative integers:
    if(any(isPole))  psi(isPole)  = Inf; end

else
    error('digamma: unknown method %d',method);
end

psi = reshape(psi,dim);

return;
|
||||
|
||||
|
||||
% Author: Patrick J. Wolfe
|
||||
% Signal Processing Group
|
||||
% Cambridge University Engineering Department
|
||||
% p.wolfe@ieee.org
|
||||
% Johnston perceptual model initialisation
|
||||
function M= mask( Sx, dft_length, Fs, nbits)
% MASK  Masking threshold of a power spectrum (Johnston perceptual model).
%
%   Sx         : power spectrum, one column per frame, covering the
%                frequencies 0: Fs/dft_length: Fs/2
%   dft_length : DFT length used to compute Sx
%   Fs         : sampling frequency in Hz
%   nbits      : word length used to scale the absolute threshold of hearing
%
%   M          : masking threshold on the linear frequency grid of Sx, one
%                column per frame
%
%   Steps: map the bins to critical bands, spread the band energies
%   (Schroeder et al., 82), lower the result by a tonality-dependent offset
%   derived from the spectral flatness measure, renormalise by the DC gain
%   of the spreading function, floor at the absolute threshold of hearing
%   and map back to linear frequency.

freq= (0: Fs/ dft_length: Fs/ 2)';
thresh= (1/ (2^ nbits- 1))^ 2/ dft_length;  % power of half an LSB per bin

crit_band_ends = [0;100;200;300;400;510;630;770;920;1080;1270;...
    1480;1720;2000;2320;2700;3150;3700;4400;5300;6400;7700;...
    9500;12000;15500;Inf];

% Maximum Bark frequency: last critical band that starts below Fs/2
imax = find( crit_band_ends < freq( end), 1, 'last');

% Normalised (to 0 dB) threshold of hearing values (Fletcher, 1929)
% as used by Johnston.  First and last thresholds are corresponding
% critical band endpoint values, elsewhere means of interpolated
% critical band endpoint threshold values are used.
abs_thr = 10.^([38;31;22;18.5;15.5;13;11;9.5;8.75;7.25;4.75;2.75;...
    1.5;0.5;0;0;0;0;2;7;12;15.5;18;24;29]./10);
ABSOLUTE_THRESH = thresh.*abs_thr(1:imax);

% Tone-masking-noise offset ratio in dB (the constant 5.5 dB part is added
% together with the tonality weighting below)
OFFSET_RATIO_DB = 9+ (1:imax)';

% Matrix for bark/linear frequency conversion: LIN_TO_BARK( b, j) is 1 when
% bin j lies in critical band b.  freq is increasing, so the band index
% only ever moves forward.
num_bins = length(freq);
LIN_TO_BARK = zeros(imax,num_bins);
band = 1;
for j = 1:num_bins
    while ~((freq(j) >= crit_band_ends(band)) && ...
            (freq(j) < crit_band_ends(band+1)))
        band = band+1;
    end
    LIN_TO_BARK(band,j) = 1;
end

% Spreading function (Schroeder et al., 82), applied symmetrically in the
% band distance abs( j- i)
spreading_fcn = zeros(imax);
summ = 0.474:imax;
spread = 10.^((15.81+7.5.*summ-17.5.*sqrt(1+summ.^2))./10);
for i = 1:imax
    for j = 1:imax
        spreading_fcn(i,j) = spread(abs(j-i)+1);
    end
end

% Excitation pattern operator and DC gain of the spreading function
EX_PAT = spreading_fcn* LIN_TO_BARK;
DC_GAIN = spreading_fcn* ones(imax,1);

C = EX_PAT* Sx;

% Spectral flatness measure SFM_dB = geometric mean / arithmetic mean.
% The geometric mean is evaluated in the log domain: prod( Sx) over and
% underflows for realistic frame lengths, which made the tonality
% coefficient saturate (or the mask become NaN/Inf).
[num_bins, num_frames] = size(Sx);
k = 1/num_bins;
geo_mean = exp( k.* sum( log( Sx), 1));
SFM_dB = 10.*log10( geo_mean./(k.*sum( Sx, 1)+eps)+ eps);

% Tonality coefficient and masked threshold offset
alpha = min(1,SFM_dB./-60);
O_dB = OFFSET_RATIO_DB(:,ones(1,num_frames)).*...
    alpha(ones(length(OFFSET_RATIO_DB),1),:) + 5.5;

% Threshold calculation and renormalisation, accounting for absolute
% thresholds
T = C./10.^(O_dB./10);
T = T./DC_GAIN(:,ones(1,num_frames));
T = max( T, ABSOLUTE_THRESH(:, ones(1, num_frames)));

% Reconversion to linear frequency scale
M= LIN_TO_BARK'* T;
|
@ -0,0 +1,153 @@
|
||||
function stsa_mis(filename,outfile)
%
%  Implements the Bayesian estimator based on the modified Itakura-Saito
%  distortion measure [1, Eq. 43].
%
%  Usage:  stsa_mis(noisyFile, outputFile)
%
%         noisyFile  - noisy speech file in .wav format
%         outputFile - enhanced output file in .wav format
%
%  The signal is processed in 20-ms Hanning-windowed frames with 50%
%  overlap.  The noise magnitude spectrum is the average over the first 5
%  non-overlapping frames, which are assumed to be noise/silence.
%
%  Example call:  stsa_mis('sp04_babble_sn10.wav','out_mis.wav');
%
%  References:
%   [1] Loizou, P. (2005). Speech enhancement based on perceptually motivated
%       Bayesian estimators of the speech magnitude spectrum. IEEE Trans. on Speech
%       and Audio Processing, 13(5), 857-869.
%
% Author: Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
%-------------------------------------------------------------------------

if nargin<2
    fprintf('Usage: stsa_mis inFile outFile.wav \n\n');
    return;
end

[x, Srate, bits]= wavread( filename);

% =============== Framing parameters ===============
frame_len= floor( 20* Srate/ 1000);     % frame size in samples (20 ms)
if rem( frame_len, 2)== 1
    frame_len= frame_len+ 1;            % keep the frame length even
end
PERC= 50;                               % window overlap in percent of frame size
ola_len= floor( frame_len* PERC/ 100);  % samples shared by successive frames
hop= frame_len- ola_len;                % frame advance in samples

win= hanning( frame_len);
nFFT= frame_len;

% =============== Initial noise estimate ===============
noise_sum= zeros( nFFT, 1);
first= 1;
for f= 1: 5
    noise_sum= noise_sum+ abs( fft( win.* x( first: first+ frame_len- 1), nFFT));
    first= first+ frame_len;
end
noise_mu= noise_sum/ 5;
noise_mu2= noise_mu.^ 2;                % noise power spectrum

% =============== Frame-by-frame enhancement ===============
img= sqrt( -1);
aa= 0.98;                               % smoothing factor of the a priori SNR
x_old= zeros( ola_len, 1);              % tail of the previous frame
Nframes= floor( length( x)/ hop)- 1;
xfinal= zeros( Nframes* hop, 1);

fprintf('\nThis might take some time ...\n');
k= 1;
for n= 1: Nframes

    insign= win.* x( k: k+ frame_len- 1);

    %--- spectrum, magnitude and power of the frame
    spec= fft( insign, nFFT);
    sig= abs( spec);
    sig2= sig.^ 2;

    %--- a posteriori SNR, limited to avoid overflows
    gammak= min( sig2./ noise_mu2, 40);

    %--- a priori SNR: from the previous enhanced frame once one exists
    if n== 1
        ksi= aa+ (1- aa)* max( gammak- 1, 0);
    else
        ksi= aa* Xk_prev./ noise_mu2+ (1- aa)* max( gammak- 1, 0);
    end

    vk= ksi.* gammak./ (1+ ksi);

    %--- enhanced magnitude, Eq. 41
    sig_hat= log( comp_int( vk, gammak, sig));
    Xk_prev= sig_hat.^ 2;

    %--- back to the time domain using the noisy phase
    xi_w= real( ifft( sig_hat.* exp( img* angle( spec))));

    %--- overlap and add
    xfinal( k: k+ hop- 1)= x_old+ xi_w( 1: ola_len);
    x_old= xi_w( ola_len+ 1: frame_len);

    if rem( n, 20)== 0
        fprintf('Frame: %d Percent completed:%4.2f\n',n,n*100/Nframes);
    end

    k= k+ hop;
end

wavwrite( xfinal, Srate, 16, outfile);
|
||||
|
||||
%------------------------------E N D -----------------------------------
|
||||
function xhat=comp_int(vk,gammak,Yk)
% COMP_INT  Evaluates Eq. 43 in [1] with the infinite sums cut after
%   the term m = 40.
%
%   vk, gammak and Yk are full-length (two-sided) column spectra.  Only the
%   first length( vk)/2+ 1 bins are evaluated; the remaining bins are filled
%   in by mirroring.  The result is floored at 0.00001.

n_terms= 40;                    % last term kept in the infinite sums
half_len= length( vk)/ 2+ 1;    % bins 1 .. half_len are computed explicitly

% quantities that depend only on the summation index m
m_all= 0: n_terms;
fact_m= factorial( m_all);      % m!
gam_m15= gamma( m_all+ 1.5);    % Gamma( m+ 1.5)
gam_m1= gamma( m_all+ 1);       % Gamma( m+ 1)

% argument of the hypergeometric function, per bin
hyp_arg= (Yk.* Yk)./ (4* gammak.^ 2);

series_a= zeros( half_len, 1);
series_b= zeros( half_len, 1);

for j= 1: half_len
    acc_a= 0;
    acc_b= 0;
    for t= 1: n_terms+ 1
        m= m_all( t);
        v_pow= (vk( j))^ m;
        h_a= hyperg( -m, -m, 0.5, hyp_arg( j), 10);
        h_b= hyperg( -m, -m, 1.5, hyp_arg( j), 10);
        acc_a= acc_a+ v_pow* h_a/ fact_m( t);
        acc_b= acc_b+ gam_m15( t)* v_pow* h_b/ (fact_m( t)* gam_m1( t));
    end
    series_a( j)= acc_a;
    series_b( j)= acc_b;
end

lower= 1: half_len;
decay= exp( -vk( lower));

term_a= series_a.* decay;
term_b= series_b.* decay.* sqrt( vk( lower)).* Yk( lower)./ gammak( lower);

xhat_half= max( real( term_a+ term_b), 0.00001);

% mirror bins 2 .. half_len- 1 to rebuild the full-length spectrum
xhat= [xhat_half; flipud( xhat_half( 2: half_len- 1))];
|
@ -0,0 +1,131 @@
|
||||
function stsa_wcosh(filename,outfile,p)
%
%  Implements the Bayesian estimator based on the weighted cosh
%  distortion measure [1, Eq. 34].
%
%  Usage:  stsa_wcosh(noisyFile, outputFile, p)
%
%         noisyFile  - noisy speech file in .wav format
%         outputFile - enhanced output file in .wav format
%         p          - power exponent used in the weighted cosh measure.
%                      Valid values for p:  p>-1
%
%  The signal is processed in 20-ms Hanning-windowed frames with 50%
%  overlap and a zero-padded FFT of twice the frame length.  The noise
%  magnitude spectrum is the average over the first 5 non-overlapping
%  frames, which are assumed to be noise/silence.
%
%  An error is raised when p <= -1; the check runs before the input file
%  is read.
%
%  Example call:  stsa_wcosh('sp04_babble_sn10.wav','out_wcosh.wav',-0.5);
%
%  References:
%   [1] Loizou, P. (2005). Speech enhancement based on perceptually motivated
%       Bayesian estimators of the speech magnitude spectrum. IEEE Trans. on Speech
%       and Audio Processing, 13(5), 857-869.
%
% Author: Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
%-------------------------------------------------------------------------

if nargin<3
    fprintf('Usage: stsa_wcosh(infile.wav,outfile.wav,p) \n');
    fprintf('       where p>-1 \n\n');
    return;
end;

% p = -1 must be rejected as well: gamma((p+1)/2) = gamma(0) = Inf makes
% the gain constant CC2 zero, so the output would be all zeros.
if p<=-1
    error('ERROR! p needs to be larger than -1.');
end

[x, Srate, bits]= wavread( filename);

% =============== Initialize variables ===============
%
len=floor(20*Srate/1000); % Frame size in samples
if rem(len,2)==1, len=len+1; end;
PERC=50; % window overlap in percent of frame size
len1=floor(len*PERC/100);   % samples shared by successive frames
len2=len-len1;              % frame advance in samples

win=hanning(len);  % define window

% Noise magnitude calculations - assuming that the first 5 frames is noise/silence
%
nFFT=2*len;
noise_mean=zeros(nFFT,1);
j=1;
for k=1:5
    noise_mean=noise_mean+abs(fft(win.*x(j:j+len-1),nFFT));
    j=j+len;
end
noise_mu=noise_mean/5;
noise_mu2=noise_mu.^2;

%--- allocate memory and initialize various variables

x_old=zeros(len1,1);
Nframes=floor(length(x)/len2)-1;
xfinal=zeros(Nframes*len2,1);

%===============================  Start Processing =======================================================
%
k=1;
aa=0.98;    % smoothing factor of the decision-directed a priori SNR
CC2=sqrt(gamma((p+3)/2)/gamma((p+1)/2));

for n=1:Nframes

    insign=win.*x(k:k+len-1);

    %--- Take fourier transform of  frame

    spec=fft(insign,nFFT);
    sig=abs(spec); % compute the magnitude
    sig2=sig.^2;

    gammak=min(sig2./noise_mu2,40);  % post SNR, limited to avoid overflows
    if n==1
        ksi=aa+(1-aa)*max(gammak-1,0);
    else
        ksi=aa*Xk_prev./noise_mu2 + (1-aa)*max(gammak-1,0);     % a priori SNR
    end

    vk=ksi.*gammak./(1+ksi);

    % --- gain function for the weighted cosh measure

    numer=CC2*sqrt(vk.*confhyperg(-(p+1)/2,1,-vk,100));
    denom=gammak.*sqrt(confhyperg(-(p-1)/2,1,-vk,100));
    hw=numer./denom;

    sig=sig.*hw;
    Xk_prev=sig.^2;     % enhanced power spectrum, used for the next frame's ksi

    xi_w= ifft( hw .* spec, nFFT);
    xi_w= real( xi_w);

    % --- Overlap and add ---------------
    % only the first len samples of the zero-padded inverse FFT are used
    xfinal(k:k+ len2-1)= x_old+ xi_w(1:len1);
    x_old= xi_w(len1+ 1: len);

    k=k+len2;
end
%========================================================================================

wavwrite(xfinal,Srate,16,outfile);
|
||||
|
@ -0,0 +1,145 @@
|
||||
function stsa_weuclid(filename,outfile,p)
%
%  Implements the Bayesian estimator based on the weighted-Euclidean
%  distortion measure [1, Eq. 18].
%
%  Usage:  stsa_weuclid(noisyFile, outputFile, p)
%
%         noisyFile  - noisy speech file in .wav format
%         outputFile - enhanced output file in .wav format
%         p          - power exponent used in the weighted-Euclidean measure.
%                      Valid values for p:  p>-2
%
%  The signal is processed in 20-ms Hamming-windowed frames with 50%
%  overlap and a zero-padded FFT of twice the frame length.  The initial
%  noise magnitude spectrum is the average over the first 6 non-overlapping
%  frames, which are assumed to be noise/silence; afterwards the noise
%  power spectrum is updated in every frame that a log-likelihood-ratio
%  VAD classifies as noise only.
%
%  An error is raised when p <= -2; the check runs before the input file
%  is read.
%
%  Example call:  stsa_weuclid('sp04_babble_sn10.wav','out_weuclid.wav',-1);
%
%  References:
%   [1] Loizou, P. (2005). Speech enhancement based on perceptually motivated
%       Bayesian estimators of the speech magnitude spectrum. IEEE Trans. on Speech
%       and Audio Processing, 13(5), 857-869.
%
% Author: Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
%-------------------------------------------------------------------------

if nargin<3
    fprintf('Usage: stsa_weuclid(infile.wav,outfile.wav,p) \n');
    fprintf('       where p>-2 \n\n');
    return;
end;

% p = -2 must be rejected as well: gamma(p/2+1) = gamma(0) = Inf makes the
% gain constant CC zero, so the output would be all zeros.
if p<=-2
    error('ERROR! p needs to be larger than -2.');
end

[x, Srate, bits]= wavread( filename);

% =============== Initialize variables ===============

len=floor(20*Srate/1000); % Frame size in samples
if rem(len,2)==1, len=len+1; end;
PERC=50; % window overlap in percent of frame size
len1=floor(len*PERC/100);   % samples shared by successive frames
len2=len-len1;              % frame advance in samples

win=hamming(len);  % define window

% Noise magnitude calculations - assuming that the first 6 frames is noise/silence
%
nFFT=2*len;
noise_mean=zeros(nFFT,1);
j=1;
for k=1:6
    noise_mean=noise_mean+abs(fft(win.*x(j:j+len-1),nFFT));
    j=j+len;
end
noise_mu=noise_mean/6;
noise_mu2=noise_mu.^2;

%--- allocate memory and initialize various variables

x_old=zeros(len1,1);
Nframes=floor(length(x)/len2)-1;
xfinal=zeros(Nframes*len2,1);

%===============================  Start Processing =======================================================
%
k=1;
aa=0.98;    % smoothing factor of the decision-directed a priori SNR
mu=0.98;    % smoothing factor of the noise power update
eta=0.15;   % VAD threshold

CC=gamma((p+3)/2)/gamma(p/2+1);
ksi_min=10^(-25/10);

for n=1:Nframes

    insign=win.*x(k:k+len-1);

    %--- Take fourier transform of  frame

    spec=fft(insign,nFFT);
    sig=abs(spec); % compute the magnitude
    sig2=sig.^2;

    gammak=min(sig2./noise_mu2,40);  % post SNR, limited to avoid overflows
    if n==1
        ksi=aa+(1-aa)*max(gammak-1,0);
    else
        ksi=aa*Xk_prev./noise_mu2 + (1-aa)*max(gammak-1,0);     % a priori SNR
        ksi=max(ksi_min,ksi);  % limit ksi to -25 dB
    end

    % ===vad: mean log-likelihood ratio over the frame===
    log_sigma_k= gammak.* ksi./ (1+ ksi)- log(1+ ksi);
    vad_decision= sum( log_sigma_k)/ len;
    if (vad_decision< eta)
        % noise only frame found: track the noise power spectrum
        noise_mu2= mu* noise_mu2+ (1- mu)* sig2;
    end
    % ===end of vad===

    vk=ksi.*gammak./(1+ksi);

    %----- weighted Euclidean distance ------------------------
    if p==-1
        hw=CC*sqrt(vk)./(gammak.*exp(-vk/2).*besseli(0,vk/2));  % if p=-1 use this equation as it's faster
    else
        numer=CC*sqrt(vk).*confhyperg(-(p+1)/2,1,-vk,100);
        denom=gammak.*confhyperg(-p/2,1,-vk,100);
        hw=numer./denom;
    end

    sig=sig.*hw;
    Xk_prev=sig.^2;     % enhanced power spectrum, used for the next frame's ksi

    xi_w= ifft( hw .* spec, nFFT);
    xi_w= real( xi_w);

    % --- Overlap and add ---------------
    % only the first len samples of the zero-padded inverse FFT are used
    xfinal(k:k+ len2-1)= x_old+ xi_w(1:len1);
    x_old= xi_w(len1+ 1: len);

    k=k+len2;
end
%========================================================================================

wavwrite(xfinal,Srate,16,outfile);
|
||||
|
@ -0,0 +1,169 @@
|
||||
function stsa_wlr(filename,outfile)

%
%  Implements the Bayesian estimator based on the weighted likelihood ratio
%  distortion measure [1, Eq. 37].
%
%  Usage:  stsa_wlr(filename, outfile)
%
%         filename - noisy speech file in .wav format
%         outfile  - enhanced output file in .wav format
%
%  Processing outline (as implemented below):
%    * 20 ms frames (length forced even), 50% overlap, Hanning window.
%    * The noise magnitude spectrum is the average over the first
%      NOISE_FRAMES non-overlapping frames and is held fixed afterwards.
%    * Per frame: a posteriori SNR (capped at 40), decision-directed
%      a priori SNR, then solve_wlr() returns the estimated magnitude,
%      which is recombined with the noisy phase and overlap-added.
%
%  Only the first channel of a multi-channel file is processed.
%  An error (stsa_wlr:inputTooShort) is raised when the file does not
%  contain enough samples for the initial noise estimate.
%
%  Example call:  stsa_wlr('sp04_babble_sn10.wav','out_wlr.wav');
%
%  References:
%   [1] Loizou, P. (2005). Speech enhancement based on perceptually motivated
%       Bayesian estimators of the speech magnitude spectrum. IEEE Trans. on Speech
%       and Audio Processing, 13(5), 857-869.
%
% Author: Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
%-------------------------------------------------------------------------

if nargin<2
    fprintf('Usage: stsa_wlr inFile outFile.wav \n\n');
    return;
end

% audioread/audiowrite are used here (as wiener_as already does) instead of
% the legacy wavread/wavwrite pair.
[x, Srate]= audioread( filename);
x= x(:,1);                  % keep one channel so that win.*x(...) stays a column

% =============== Initialize variables ===============
%
len=floor(20*Srate/1000);   % Frame size in samples
if rem(len,2)==1, len=len+1; end
PERC=50;                    % window overlap in percent of frame size
len1=floor(len*PERC/100);   % samples shared with the next frame
len2=len-len1;              % hop size

win=hanning(len);           % analysis window

% Noise magnitude calculations - assuming that the first NOISE_FRAMES
% frames are noise/silence
%
NOISE_FRAMES=5;
if length(x)<NOISE_FRAMES*len
    error('stsa_wlr:inputTooShort', ...
        'Input must contain at least %d samples for the initial noise estimate.', ...
        NOISE_FRAMES*len);
end

nFFT=len;
noise_mean=zeros(nFFT,1);
j=1;
for k=1:NOISE_FRAMES
    noise_mean=noise_mean+abs(fft(win.*x(j:j+len-1),nFFT));
    j=j+len;
end
noise_mu=noise_mean/NOISE_FRAMES;
% Floor the noise power so that a digitally silent lead-in does not produce
% 0/0 or x/0 in the SNR estimates below.
noise_mu2=max(noise_mu.^2,eps);

%--- allocate memory and initialize various variables

x_old=zeros(len1,1);                % overlap tail carried to the next frame
Nframes=floor(length(x)/len2)-1;
xfinal=zeros(Nframes*len2,1);
xinterv=0.001:0.01:10;              % search grid handed to solve_wlr
k=1;                                % first sample of the current frame
aa=0.98;                            % decision-directed smoothing factor

%===============================  Start Processing =======================================================
%
fprintf('This might take some time ...\n')
for n=1:Nframes

    insign=win.*x(k:k+len-1);

    %--- Take fourier transform of frame

    spec=fft(insign,nFFT);
    sig=abs(spec);                  % compute the magnitude
    sig2=sig.^2;

    gammak=min(sig2./noise_mu2,40); % post SNR. Limit it to avoid overflows

    if n==1
        ksi=aa+(1-aa)*max(gammak-1,0);
    else
        ksi=aa*Xk_prev./noise_mu2 + (1-aa)*max(gammak-1,0);   % a priori SNR
    end

    vk=ksi.*gammak./(1+ksi);

    sig_hat=solve_wlr(vk,gammak,sig,xinterv);   % solves Eq. 37 in [1]
    Xk_prev=sig_hat.^2;             % feeds the a priori SNR of the next frame

    % Rebuild the frame from the estimated magnitude and the noisy phase
    xi_w= ifft( sig_hat.* exp(1i*angle(spec)));
    xi_w= real( xi_w);

    % --- Overlap and add ---------------
    %
    xfinal(k:k+ len2-1)= x_old+ xi_w(1:len1);
    x_old= xi_w(len1+ 1: len);

    if rem(n,20)==0, fprintf('Frame: %d Percent completed:%4.2f \n',n,n*100/Nframes); end

    k=k+len2;
end
%========================================================================================

audiowrite(outfile,xfinal,Srate,'BitsPerSample',16);
|
||||
|
||||
|
||||
%==========================================================================
|
||||
function x=solve_wlr(vk,gammak,Yk,xx)

% Solves the non-linear Eq. 37 of Loizou (2005), "Speech enhancement based
% on perceptually motivated Bayesian estimators of the speech magnitude
% spectrum", independently for every frequency bin.
%
%   vk, gammak, Yk - column vectors of equal (even) length: the vk term,
%                    the a posteriori SNR and the noisy magnitude
%   xx             - increasing grid of candidate magnitudes used to
%                    bracket the root
%   x              - column vector of estimated magnitudes, same length
%                    as vk
%
% For bin n the equation solved is  log(x) + Elogx(n) - Ex(n)/x = 0.
% Only bins 1..Len/2+1 are solved; the remaining bins are filled by
% mirroring so that the spectrum stays symmetric.
%
% Bins for which no usable bracket exists, or for which fzero does not
% converge, reuse the previous bin's value (or the spectral floor for the
% first bin).  The equation is evaluated through a function handle, so the
% coefficients keep full double precision.

SPEC_FLOOR=0.001;               % spectral floor
DEFAULT_UPPER=200;              % upper bracket when the grid shows no sign change

Len=length(vk);
L2=Len/2+1;

lk05=sqrt(vk).*Yk./gammak;
Ex=gamma(1.5)*lk05.*confhyperg(-0.5,1,-vk,100);
Elogx=1-0.5*(2*log(lk05)+log(vk)+expint(vk));

x=zeros(Len,1);

for n=1:L2

    a=Elogx(n);
    b=Ex(n);
    f=@(t) log(t)+a-b./t;
    y=f(xx);

    if y(1)<0
        % Negative at the start of the grid: bracket between half the
        % first grid point and the first grid point where f is positive.
        lo=xx(1)/2;
        ind=find(y>0,1);
        if isempty(ind)
            hi=DEFAULT_UPPER;
        else
            hi=xx(ind);
        end
    else
        ind=find(y<0,1);
        if isempty(ind)
            x(n)=SPEC_FLOOR;    % no root on the grid
            continue;
        end
        lo=xx(1);
        hi=xx(ind);
    end

    % fzero requires finite end-point values of opposite sign.
    flo=f(lo);
    fhi=f(hi);
    solved=false;
    if isfinite(flo) && isfinite(fhi) && sign(flo)*sign(fhi)<=0
        [root,~,flag]=fzero(f,[lo hi]);
        if flag>0
            x(n)=root;
            solved=true;
        end
    end

    if ~solved
        if n>1
            x(n)=x(n-1);        % reuse the neighbouring bin
        else
            x(n)=SPEC_FLOOR;    % first bin has no neighbour to reuse
        end
    end

end

% Mirror bins 2..L2-1 into the upper half of the spectrum
x(L2+1:Len)=flipud(x(2:L2-1));
|
||||
|
@ -0,0 +1,126 @@
|
||||
function wiener_as(filename,outfile)

%
%  Implements the Wiener filtering algorithm based on a priori SNR estimation [1].
%
%  Usage:  wiener_as(filename, outfile)
%
%         filename - noisy speech file in .wav format
%         outfile  - enhanced output file in .wav format
%
%  Processing outline (as implemented below):
%    * 20 ms frames (length forced to an even integer), 50% overlap,
%      Hamming window.
%    * The initial noise power spectrum is a Welch-style average over the
%      first 120 ms of the input.
%    * Per frame: decision-directed a priori SNR, a likelihood-based VAD
%      that updates the noise spectrum on noise-only frames, the gain
%      sqrt(priori/(1+priori)), and overlap-add of the filtered frames.
%
%  Only the first channel of a multi-channel file is processed.
%  An error (wiener_as:inputTooShort) is raised when the file is shorter
%  than the 120 ms needed for the initial noise estimate.
%
%  Example call:  wiener_as('sp04_babble_sn10.wav','out_wien_as.wav');
%
%  References:
%   [1] Scalart, P. and Filho, J. (1996). Speech enhancement based on a priori
%       signal to noise estimation. Proc. IEEE Int. Conf. Acoust. , Speech, Signal
%       Processing, 629-632.
%
% Authors: Yi Hu and Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
%-------------------------------------------------------------------------

if nargin<2
    fprintf('Usage: wiener_as(noisyfile.wav,outFile.wav) \n\n');
    return;
end

[noisy_speech, fs]= audioread( filename);
noisy_speech= noisy_speech(:,1);    % column vector, first channel only

% set parameter values
mu= 0.98;        % smoothing factor in noise spectrum update
a_dd= 0.98;      % smoothing factor in priori update
eta= 0.15;       % VAD threshold
frame_dur= 20;   % frame duration in ms

% Frame length must be an even integer: it sizes the window and is halved
% for indexing.  (160 for 8k sampling rate; rates such as 11025 Hz would
% otherwise give a fractional length.)
L= floor( frame_dur* fs/ 1000);
if rem( L, 2)~= 0, L= L+ 1; end
half= L/ 2;                          % hop size for 50% overlap

hamming_win= hamming( L);            % hamming window
U= ( hamming_win'* hamming_win)/ L;  % normalization factor

% first 120 ms is noise only
len_120ms= round( fs/ 1000* 120);    % rounded to guard against float error
if length( noisy_speech)< len_120ms
    error('wiener_as:inputTooShort', ...
        'Input must be at least 120 ms (%d samples) long.', len_120ms);
end
first_120ms= noisy_speech( 1: len_120ms);

% =============now use Welch's method to estimate power spectrum with
% Hamming window and 50% overlap
nsubframes= floor( len_120ms/ half)- 1;
noise_ps= zeros( L, 1);
n_start= 1;
for j= 1: nsubframes
    noise= first_120ms( n_start: n_start+ L- 1).* hamming_win;
    noise_fft= fft( noise, L);
    noise_ps= noise_ps+ ( abs( noise_fft).^ 2)/ (L* U);
    n_start= n_start+ half;
end
% Floor the estimate so that a digitally silent lead-in cannot cause a
% division by zero in the SNR computations.
noise_ps= max( noise_ps/ nsubframes, eps);
%==============

% number of noisy speech frames
nframes= floor( length( noisy_speech)/ half)- 1;

% Output is preallocated as a column (samples down the rows); the extra
% half frame holds the tail of the last frame.
enhanced_speech= zeros( (nframes+ 1)* half, 1);
overlap= zeros( half, 1);            % tail of the previous frame
n_start= 1;

for j= 1: nframes
    noisy= noisy_speech( n_start: n_start+ L- 1).* hamming_win;
    noisy_fft= fft( noisy, L);
    noisy_ps= ( abs( noisy_fft).^ 2)/ (L* U);

    % ============ voice activity detection
    posteri= noisy_ps./ noise_ps;            % a posteriori SNR
    posteri_prime= max( posteri- 1, 0);      % half-wave rectified
    if (j== 1)
        % no previous frame yet: initialize the a priori SNR
        priori= a_dd+ (1-a_dd)* posteri_prime;
    else
        % decision-directed update from the previous gain and SNR
        priori= a_dd* (G_prev.^ 2).* posteri_prev+ ...
            (1-a_dd)* posteri_prime;
    end

    log_sigma_k= posteri.* priori./ (1+ priori)- log(1+ priori);
    vad_decision= sum( log_sigma_k)/ L;
    if (vad_decision< eta)
        % noise only frame found: smooth it into the noise spectrum
        noise_ps= max( mu* noise_ps+ (1- mu)* noisy_ps, eps);
    end
    % ===end of vad===

    G= sqrt( priori./ (1+ priori));          % gain function

    % The gain is real and symmetric up to round-off; drop any residual
    % imaginary part so the output stays real-valued.
    enhanced= real( ifft( noisy_fft.* G, L));

    % overlap-add: first half plus the tail of the previous frame
    enhanced_speech( n_start: n_start+ half- 1)= ...
        overlap+ enhanced( 1: half);
    overlap= enhanced( half+ 1: L);
    n_start= n_start+ half;

    G_prev= G;
    posteri_prev= posteri;
end

% flush the tail of the last frame
enhanced_speech( n_start: n_start+ half- 1)= overlap;

audiowrite(outfile,enhanced_speech,fs,'BitsPerSample',16);
|
||||
|
||||
|
Reference in New Issue
Block a user