Started code for DSP final project
This commit is contained in:
parent
824a46b1fd
commit
50a5e57e18
46
6th-Semester-Spring-2024/DSP/Labs/FinalProject/addnoisex.m
Normal file
46
6th-Semester-Spring-2024/DSP/Labs/FinalProject/addnoisex.m
Normal file
@ -0,0 +1,46 @@
|
||||
%*********************ADD FROM NOISEX DATABASE *******************
% function [t] = addnoisex(sclean,snoise,snr,outfile)
% Add noise taken from a noise recording to a clean signal at a given SNR.
%
%   t       - noisy signal (returned and also written to outfile)
%   sclean  - clean signal: name of a .dat or .wav file, or a numeric array
%   snoise  - noise: name of a .dat or .wav file (e.g. 'white.dat'),
%             or a numeric array
%   snr     - desired SNR in dB
%   outfile - name of the output wav file (written at 8000 Hz, 16 bits)
%
% A segment as long as the clean signal is cut from a random position of
% the noise recording, scaled so that the ratio of the 2-norms of signal
% and noise matches the requested SNR, and added to the clean signal.
%
% Example [t]=addnoisex('s.wav','street.dat',30,'s_noisy_snr30.wav')
%****************************************************
%
function [t] = addnoisex(sclean,snoise,snr,outfile)

% Read input clean signal and noise file
s = load_or_audioread(sclean);
nfile = load_or_audioread(snoise);

% Record length of speech signal and noise file
nspeech = length(s);
nns = length(nfile);

if nns < nspeech
    error('addnoisex:noiseTooShort', ...
        'Noise recording (%d samples) is shorter than the clean signal (%d samples).', ...
        nns, nspeech);
end

% Randomly select starting sample of noise file and
% read same number of samples as speech signal
start = ceil(rand()*(nns-nspeech+1));
finish = start+nspeech-1;
noise = nfile(start:finish);

% Give the noise the same orientation as the clean signal. Without this a
% column signal plus a row noise segment (or vice versa) would be expanded
% into an nspeech-by-nspeech matrix by s+noise.
if isvector(s)
    noise = reshape(noise, size(s));
end

% 2-norms of the noise segment and of the clean signal
powernoise = norm(noise,2);
powersignal = norm(s,2);

if powernoise == 0
    error('addnoisex:silentNoise', ...
        'Selected noise segment has zero energy; cannot scale it to the requested SNR.');
end

% Adjust noise level for desired SNR
u = 10^(snr/20);
powerdesirednoise = powersignal/u;
ratio = powerdesirednoise/powernoise;
noise = ratio*noise;

% Add the noise
t = s+noise;

% Write as wave file
audiowrite(outfile,t,8000,'BitsPerSample',16);
|
Binary file not shown.
@ -0,0 +1,19 @@
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Either loads an ascii .dat file with load, reads a .wav file with
% audioread, or passes numeric data straight through.
% function [speechData] = load_or_audioread(speechFile)
%   speechFile - file name in quotes (must end in .dat or .wav,
%                case-insensitive) or a numeric array
%   speechData - the loaded samples, or speechFile itself when numeric
% Raises an error for any other file extension or input type.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function [speechData] = load_or_audioread(speechFile)

if ischar(speechFile)
    % fileparts copes with names shorter than four characters, which the
    % previous end-3:end indexing did not.
    [~, ~, ext] = fileparts(speechFile);
    if strcmpi(ext, '.dat')
        speechData = load(speechFile);
    elseif strcmpi(ext, '.wav')
        speechData = audioread(speechFile);
    else
        error('load_or_audioread:unsupportedExtension', ...
            'Unsupported file "%s": expected a .dat or .wav file name.', speechFile);
    end
elseif isnumeric(speechFile)
    % Data was passed in directly
    speechData = speechFile;
else
    error('load_or_audioread:invalidInput', ...
        'Input must be a file name (char) or a numeric array.');
end
|
539973
6th-Semester-Spring-2024/DSP/Labs/FinalProject/noisefiles/exhibition.dat
Normal file
539973
6th-Semester-Spring-2024/DSP/Labs/FinalProject/noisefiles/exhibition.dat
Normal file
File diff suppressed because it is too large
Load Diff
517480
6th-Semester-Spring-2024/DSP/Labs/FinalProject/noisefiles/street.dat
Normal file
517480
6th-Semester-Spring-2024/DSP/Labs/FinalProject/noisefiles/street.dat
Normal file
File diff suppressed because it is too large
Load Diff
560907
6th-Semester-Spring-2024/DSP/Labs/FinalProject/noisefiles/train.dat
Normal file
560907
6th-Semester-Spring-2024/DSP/Labs/FinalProject/noisefiles/train.dat
Normal file
File diff suppressed because it is too large
Load Diff
517480
6th-Semester-Spring-2024/DSP/Labs/FinalProject/noisefiles/white.dat
Normal file
517480
6th-Semester-Spring-2024/DSP/Labs/FinalProject/noisefiles/white.dat
Normal file
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,26 @@
|
||||
function mod_data= DC_block( data, Nsamples)
% DC_BLOCK  Subtract a DC estimate from the interior of a signal and taper
% both ends of that interior.
%
%   data     - input samples
%   Nsamples - number of samples to treat as the signal length
%   mod_data - copy of data with the interior region modified
%
% The interior region skips SEARCHBUFFER*Downsample samples at each end.
% The DC estimate is the sum over the interior divided by Nsamples (not by
% the interior length). The first and last Downsample interior samples are
% then multiplied by a linear ramp (0.5, 1.5, ...)/Downsample.

global Downsample DATAPADDING_MSECS SEARCHBUFFER

margin = SEARCHBUFFER * Downsample;
first = margin + 1;
last = Nsamples - margin;
interior = first:last;

mod_data = data;

% DC estimate: interior sum normalised by the full length
dc_level = sum( data( interior)) / Nsamples;
mod_data( interior) = data( interior) - dc_level;

ramp = 0.5 + (0: Downsample - 1);

% fade-in over the first Downsample interior samples
head = first: first + Downsample - 1;
mod_data( head) = mod_data( head) .* ramp / Downsample;

% fade-out: same ramp applied backwards from the last interior sample
tail = last: -1: last - Downsample + 1;
mod_data( tail) = mod_data( tail) .* ramp / Downsample;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -0,0 +1,53 @@
|
||||
function Y= FFTNXCorr( ref_VAD, startr, nr, deg_VAD, startd, nd)
% FFTNXCORR  Cross-correlate a segment of deg_VAD with a segment of ref_VAD.
%
%   ref_VAD, startr, nr - reference vector, first index and segment length
%   deg_VAD, startd, nd - degraded vector, first index and segment length
%   Y                   - conv of the degraded segment with the
%                         left-right flipped reference segment
%                         (length nr+nd-1)
%
% Other simple implementations exist; this one is computed in the time
% domain and, per the original author, is consistent with the C version.

ref_seg = ref_VAD( startr: startr + nr - 1);
deg_seg = deg_VAD( startd: startd + nd - 1);

Y = conv( deg_seg, fliplr( ref_seg));

% Frequency-domain alternative kept for reference (disabled):
%   Nx= 2^ (ceil( log2( max( nr, nd))));
%   x1= zeros( 1, 2* Nx);
%   x2= zeros( 1, 2* Nx);
%   x1( 1: nr)= fliplr( ref_VAD( startr: startr+ nr- 1));
%   x2( 1: nd)= deg_VAD( startd: startd+ nd- 1);
%   tmp1= ifft( fft( x1, 2* Nx).* fft( x2, 2* Nx), 2* Nx);
%   Y= tmp1( 1: nr+ nd- 1);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -0,0 +1,162 @@
|
||||
function [VAD, logVAD]= apply_VAD( data, Nsamples)
% APPLY_VAD  Window-power based voice activity detection.
%
%   data     - input samples
%   Nsamples - number of samples to analyse
%   VAD      - per-window power, zero for windows classified as inactive
%   logVAD   - log( VAD/ threshold) for windows above the final threshold,
%              zero elsewhere
%
% Uses the globals Downsample (samples per window), MINSPEECHLGTH and
% JOINSPEECHLGTH. The original comments describe each window as 4 ms.
% Windows classified as inactive are temporarily stored as negative values
% and set to zero at the end.

global Downsample MINSPEECHLGTH JOINSPEECHLGTH

Nwindows = floor( Nsamples / Downsample);

% --- mean power of every window ---------------------------------------
VAD = zeros( 1, Nwindows);
for w = 1: Nwindows
    seg = data( (w - 1) * Downsample + 1: w * Downsample);
    VAD( w) = sum( seg .^ 2) / Downsample;
end

% initial threshold: mean window power
LevelThresh = sum( VAD) / Nwindows;

% floor the window powers at 1e-4 of the maximum (or 1.0 for silence)
LevelMin = max( VAD);
if ( LevelMin > 0 )
    LevelMin = LevelMin * 1.0e-4;
else
    LevelMin = 1.0;
end
%fprintf( 1, 'LevelMin is %f\n', LevelMin);

VAD( VAD < LevelMin) = LevelMin;

% --- iterate the threshold: mean + 2*std of the sub-threshold windows --
for iteration = 1: 12
    quiet = VAD( VAD <= LevelThresh);
    n_quiet = length( quiet);
    LevelNoise = sum( quiet);
    StDNoise = 0;
    if ( n_quiet > 0)
        LevelNoise = LevelNoise / n_quiet;
        StDNoise = sqrt( sum( (quiet - LevelNoise) .^ 2) / n_quiet);
    end
    LevelThresh = 1.001 * (LevelNoise + 2 * StDNoise);
end
%fprintf( 1, 'LevelThresh is %f\n', LevelThresh);

% --- mean level of the windows above / at-or-below the threshold -------
loud = VAD( VAD > LevelThresh);
len = length( loud);
LevelSig = sum( loud);
LevelNoise = sum( VAD( VAD <= LevelThresh));

if ( len > 0)
    LevelSig = LevelSig / len;
else
    LevelThresh = -1;
end
%fprintf( 1, 'LevelSig is %f\n', LevelSig);

if ( len < Nwindows)
    LevelNoise = LevelNoise / ( Nwindows - len);
else
    LevelNoise = 1;
end
%fprintf( 1, 'LevelNoise is %f\n', LevelNoise);

% mark inactive windows by negating them (mask uses the possibly updated
% threshold); the first and last window are always inactive
inactive = VAD <= LevelThresh;
VAD( inactive) = -VAD( inactive);
VAD( 1) = -LevelMin;
VAD( Nwindows) = -LevelMin;

% --- drop active runs no longer than MINSPEECHLGTH windows -------------
start = 0;
finish = 0;
for count = 2: Nwindows
    cur = VAD( count);
    prev = VAD( count - 1);
    if ( (cur > 0.0) && (prev <= 0.0) )
        start = count;
    elseif ( (cur <= 0.0) && (prev > 0.0) )
        finish = count;
        if ( (finish - start) <= MINSPEECHLGTH )
            VAD( start: finish - 1) = -VAD( start: finish - 1);
        end
    end
end

% --- with a very high signal-to-noise level ratio, also drop weak runs -
if ( LevelSig >= (LevelNoise * 1000) )
    for count = 2: Nwindows
        cur = VAD( count);
        prev = VAD( count - 1);
        if ( (cur > 0) && (prev <= 0) )
            start = count;
        elseif ( (cur <= 0) && (prev > 0) )
            finish = count;
            g = sum( VAD( start: finish - 1));
            if ( g < 3.0 * LevelThresh * (finish - start) )
                VAD( start: finish - 1) = -VAD( start: finish - 1);
            end
        end
    end
end

% --- join active runs separated by at most JOINSPEECHLGTH windows ------
start = 0;
finish = 0;
for count = 2: Nwindows
    if ( (VAD( count) > 0.0) && (VAD( count - 1) <= 0.0) )
        start = count;
        if ( (finish > 0) && ((start - finish) <= JOINSPEECHLGTH) )
            VAD( finish: start - 1) = LevelMin;
        end
    elseif ( (VAD( count) <= 0.0) && (VAD( count - 1) > 0.0) )
        finish = count;
    end
end

% --- if no active run is left, treat everything as active --------------
has_onset = any( (VAD( 2: end) > 0) & (VAD( 1: end - 1) <= 0));
if ( ~has_onset )
    VAD = abs( VAD);
    VAD( 1) = -LevelMin;
    VAD( Nwindows) = -LevelMin;
end

% --- short ramps (0.1, 0.3 of the edge value) around each active run ---
% The index is advanced inside the branches, so the statement order of
% this loop matters.
k = 4;
while ( k < (Nwindows - 1) )
    if ( (VAD( k) > 0) && (VAD( k - 2) <= 0) )
        VAD( k - 2) = VAD( k) * 0.1;
        VAD( k - 1) = VAD( k) * 0.3;
        k = k + 1;
    end
    if ( (VAD( k) <= 0) && (VAD( k - 1) > 0) )
        VAD( k) = VAD( k - 1) * 0.3;
        VAD( k + 1) = VAD( k - 1) * 0.1;
        k = k + 3;
    end
    k = k + 1;
end

% inactive windows become zero
VAD( VAD < 0) = 0;

% fid= fopen( 'mat_vad.txt', 'wt');
% fprintf( fid, '%f\n', VAD);
% fclose( fid);

if ( LevelThresh <= 0 )
    LevelThresh = LevelMin;
end

% logVAD is created by indexed assignment, exactly as before, so its size
% follows the largest index that gets written
logVAD( find( VAD <= LevelThresh)) = 0;
above = find( VAD > LevelThresh);
logVAD( above) = log( VAD( above) / LevelThresh);
|
||||
|
||||
|
||||
|
||||
|
@ -0,0 +1,40 @@
|
||||
function align_filtered= apply_filter( data, data_Nsamples, align_filter_dB)
% APPLY_FILTER  Shape a signal in the frequency domain with a gain table.
%
%   data            - input samples
%   data_Nsamples   - sample count used to derive the processed length
%   align_filter_dB - two-column table: column 1 frequency, column 2 gain
%                     in dB (frequency presumably in Hz, since it is
%                     compared against multiples of Fs - confirm with caller)
%   align_filtered  - copy of data whose processed region is filtered
%
% The processed region starts after SEARCHBUFFER*Downsample samples and is
% n samples long. It is zero-padded to the next power of two, multiplied in
% the FFT domain by the interpolated gain curve (normalised to the table's
% gain at 1000), and transformed back.

global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs

skip = SEARCHBUFFER * Downsample;
n = data_Nsamples - 2 * skip + DATAPADDING_MSECS * (Fs / 1000);

% next power of two that is greater than or equal to n
nfft = 2 ^ (ceil( log2( n)));
half = nfft / 2;

freq_tab = align_filter_dB( :, 1);
gain_tab = align_filter_dB( :, 2);

% reference gain: table value at 1000
ref_gain_dB = interp1( freq_tab, gain_tab, 1000);

% zero-padded copy of the processed region and its spectrum
padded = zeros( 1, nfft);
padded( 1: n) = data( skip + 1: skip + n);
spectrum = fft( padded, nfft);

% gain for bins 0 .. nfft/2, relative to the reference gain
bin_width = Fs / nfft;
gain_dB( 1: half + 1) = interp1( freq_tab, gain_tab, ...
    (0: half) * bin_width) - ref_gain_dB;
gain_lin = 10 .^ (gain_dB / 20);

% mirror the gains onto the upper half of the spectrum
gain_lin = [gain_lin, fliplr( gain_lin( 2: half))];

filtered = ifft( spectrum .* gain_lin, nfft);

align_filtered = data;
align_filtered( skip + 1: skip + n) = filtered( 1: n);

% fid= fopen( 'log_mat.txt', 'wt');
% fprintf( fid, '%f\n', filtered( 1: n));
% fclose( fid);
|
||||
|
||||
|
||||
|
||||
|
@ -0,0 +1,26 @@
|
||||
function mod_data= apply_filters( data, Nsamples)
% APPLY_FILTERS  Run data through the IIR filter described by the globals
% InIIR_Hsos / InIIR_Nsos.
%
%   data     - input samples
%   Nsamples - not used by this implementation
%   mod_data - filtered samples
%
% Each row of InIIR_Hsos supplies five coefficients of one second-order
% section: columns 1-3 are used as the numerator b, columns 4-5 as the
% denominator terms that follow a leading 1.

global InIIR_Hsos InIIR_Nsos DATAPADDING_MSECS Fs

% one row per section: [b0 b1 b2 1 a1 a2]
sosMatrix = [InIIR_Hsos( :, 1: 3), ones( InIIR_Nsos, 1), InIIR_Hsos( :, 4: 5)];

% second-order-section direct-form II filter object, applied to the data
mod_data = filter( dfilt.df2sos( sosMatrix), data);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -0,0 +1,200 @@
|
||||
function cep_mean= comp_cep(cleanFile, enhdFile)

% ----------------------------------------------------------------------
%          Cepstrum Distance Objective Speech Quality Measure
%
%   This function implements the cepstrum distance measure used
%   in [1]
%
%   Usage:  CEP=comp_cep(cleanFile.wav, enhancedFile.wav)
%
%         cleanFile.wav  - clean input file in .wav format
%         enhancedFile   - enhanced output file in .wav format
%         CEP            - computed cepstrum distance measure
%
%         Note that the cepstrum measure is limited in the range [0, 10].
%
%   The files are read with audioread; their sampling rates and bit
%   depths must match. The result is the mean of the lowest 95% of the
%   per-frame distances.
%
%  Example call:  CEP =comp_cep('sp04.wav','enhanced.wav')
%
%  References:
%
%     [1]  Kitawaki, N., Nagabuchi, H., and Itoh, K. (1988). Objective quality
%          evaluation for low bit-rate speech coding systems. IEEE J. Select.
%          Areas in Comm., 6(2), 262-273.
%
%  Author: Philipos C. Loizou
%  (LPC routines were written by Bryan Pellom & John Hansen)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
% ----------------------------------------------------------------------
if nargin~=2
    fprintf('USAGE: CEP=comp_cep(cleanFile.wav, enhancedFile.wav)\n');
    fprintf('For more help, type: help comp_cep\n\n');
    return;
end

alpha = 0.95;   % fraction of the lowest frame distances that is averaged

% wavread no longer exists in current MATLAB releases; audioread returns
% the samples and rate, audioinfo supplies the bit depth.
[data1, Srate1] = audioread(cleanFile);
[data2, Srate2] = audioread(enhdFile);
info1 = audioinfo(cleanFile);
info2 = audioinfo(enhdFile);
Nbits1 = info1.BitsPerSample;
Nbits2 = info2.BitsPerSample;

if (Srate1 ~= Srate2) || (Nbits1 ~= Nbits2)
    error('The two files do not match!');
end

% truncate both signals to the common length; eps avoids all-zero frames
len = min( length( data1), length( data2));
data1 = data1( 1: len) + eps;
data2 = data2( 1: len) + eps;

IS_dist = cepstrum( data1, data2, Srate1);

% discard the largest (1-alpha) share of the frame distances
IS_len = round( length( IS_dist) * alpha);
IS = sort( IS_dist);

cep_mean = mean( IS( 1: IS_len));
|
||||
|
||||
|
||||
|
||||
|
||||
function distortion = cepstrum(clean_speech, processed_speech,sample_rate)
% CEPSTRUM  Frame-by-frame cepstral distance between two signals.
%
%   clean_speech     - reference samples
%   processed_speech - test samples, same length as clean_speech
%   sample_rate      - sampling rate in Hz
%   distortion       - row vector with one distance per frame, each
%                      limited to at most 10
%
% Frames are 30 ms long with a hop of a quarter frame. Raises an error if
% the signals differ in length or are too short for a single frame.

% ----------------------------------------------------------------------
% Check the length of the clean and processed speech.  Must be the same.
% ----------------------------------------------------------------------

clean_length = length(clean_speech);
processed_length = length(processed_speech);

if (clean_length ~= processed_length)
    % Previously this only printed a message and returned, which left the
    % output unassigned and produced a confusing secondary error.
    error('cepstrum:lengthMismatch', ...
        'Error: Both Speech Files must be same length.');
end

% Work on column vectors so that frame.*window below stays element-wise.
clean_speech = clean_speech(:);
processed_speech = processed_speech(:);

% ----------------------------------------------------------------------
% Optional scaling / DC removal (disabled in this version)
% ----------------------------------------------------------------------

%clean_speech     = clean_speech     - mean(clean_speech);
%processed_speech = processed_speech - mean(processed_speech);
%processed_speech = processed_speech.*(max(abs(clean_speech))/ max(abs(processed_speech)));

% ----------------------------------------------------------------------
% Analysis parameters
% ----------------------------------------------------------------------

winlength = round(30*sample_rate/1000);   % window length in samples
skiprate = floor(winlength/4);            % window skip in samples
if sample_rate < 10000
    P = 10;                               % LPC analysis order
else
    P = 16;                               % this could vary depending on sampling frequency.
end
C = 10*sqrt(2)/log(10);                   % scale factor applied to the cepstral norm

% ----------------------------------------------------------------------
% For each frame of input speech, calculate the cepstral distance
% ----------------------------------------------------------------------

num_frames = clean_length/skiprate-(winlength/skiprate);   % number of frames
frame_ids = 1:num_frames;
if isempty(frame_ids)
    error('cepstrum:signalTooShort', ...
        'Signal of %d samples is too short for a single %d-sample frame.', ...
        clean_length, winlength);
end

start = 1;                                % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));

distortion = zeros(1, numel(frame_ids));

for frame_count = frame_ids

    % ----------------------------------------------------------
    % (1) Get the frames for the test and reference speech and
    %     multiply by the Hanning window.
    % ----------------------------------------------------------

    clean_frame = clean_speech(start:start+winlength-1).*window;
    processed_frame = processed_speech(start:start+winlength-1).*window;

    % ----------------------------------------------------------
    % (2) LPC parameters of both frames, converted to cepstra
    % ----------------------------------------------------------

    [~, ~, A_clean] = lpcoeff(clean_frame, P);
    [~, ~, A_processed] = lpcoeff(processed_frame, P);

    C_clean = lpc2cep(A_clean);
    C_processed = lpc2cep(A_processed);

    % ----------------------------------------------------------
    % (3) Compute the cepstrum-distance measure (capped at 10)
    % ----------------------------------------------------------

    distortion(frame_count) = min(10,C*norm(C_clean-C_processed,2));

    start = start + skiprate;

end
|
||||
|
||||
|
||||
|
||||
function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order)
% LPCOEFF  Autocorrelation lags, reflection coefficients and LPC
% polynomial of one frame.
%
%   speech_frame - frame samples
%   model_order  - LPC order
%   acorr        - autocorrelation lags 0 .. model_order (row vector)
%   refcoeff     - reflection coefficients, one per recursion step
%   lpparams     - [1 -a], where a holds the predictor coefficients

% ----------------------------------------------------------
% (1) Autocorrelation lags
% ----------------------------------------------------------

nsamp = max(size(speech_frame));
acorr = zeros(1, model_order+1);
for lag = 0:model_order
    acorr(lag+1) = sum(speech_frame(1:nsamp-lag) ...
        .*speech_frame(lag+1:nsamp));
end

% ----------------------------------------------------------
% (2) Levinson-Durbin recursion
% ----------------------------------------------------------

a = ones(1,model_order);
E = zeros(1,model_order+1);      % prediction error after each step
E(1) = acorr(1);
for i = 1:model_order
    prev = a(1:i-1);             % coefficients from the previous step
    k_i = (acorr(i+1) - sum(prev.*acorr(i:-1:2))) / E(i);
    rcoeff(i) = k_i;
    a(i) = k_i;
    a(1:i-1) = prev - k_i.*prev(i-1:-1:1);
    E(i+1) = (1-k_i*k_i)*E(i);
end

refcoeff = rcoeff;
lpparams = [1 -a];
|
||||
|
||||
%----------------------------------------------
|
||||
function [cep]=lpc2cep(a)
%
% Converts prediction coefficients to cepstrum coefficients.
%
%   a   - LPC polynomial; a(1) is not read, a(2:end) are the coefficients
%   cep - row vector of length(a)-1 cepstrum coefficients
%
% Author: Philipos C. Loizou

ncep = length(a) - 1;
cep = zeros(1,ncep);

cep(1) = -a(2);

% each further coefficient combines a(k+1) with the earlier cepstra
for k = 2:ncep
    idx = 1:k-1;
    acc = sum(cep(idx).*a(k:-1:2).*idx);
    cep(k) = -(a(k+1) + acc/k);
end
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -0,0 +1,259 @@
|
||||
function fwseg_dist= comp_fwseg(cleanFile, enhancedFile)

% ----------------------------------------------------------------------
%
%     Frequency weighted SNRseg Objective Speech Quality Measure
%
%     This function implements the frequency-weighted SNRseg measure [1]
%     using a different weighting function, the clean spectrum.
%
%   Usage:  fwSNRseg=comp_fwseg(cleanFile.wav, enhancedFile.wav)
%
%         cleanFile.wav  - clean input file in .wav format
%         enhancedFile   - enhanced output file in .wav format
%         fwSNRseg       - computed frequency weighted SNRseg in dB
%
%         Note that large numbers of fwSNRseg are better.
%
%   The files are read with audioread; their sampling rates and bit
%   depths must match.
%
%  Example call:  fwSNRseg =comp_fwseg('sp04.wav','enhanced.wav')
%
%  References:
%     [1]  Tribolet, J., Noll, P., McDermott, B., and Crochiere, R. E. (1978).
%          A study of complexity and quality of speech waveform coders. Proc.
%          IEEE Int. Conf. Acoust. , Speech, Signal Processing, 586-590.
%
%   Author: Philipos C. Loizou
%  (critical-band filtering routines were written by Bryan Pellom & John Hansen)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
% ----------------------------------------------------------------------

if nargin~=2
    fprintf('USAGE: fwSNRseg=comp_fwseg(cleanFile.wav, enhancedFile.wav)\n');
    fprintf('For more help, type: help comp_fwseg\n\n');
    return;
end

% wavread no longer exists in current MATLAB releases; audioread returns
% the samples and rate, audioinfo supplies the bit depth.
[data1, Srate1] = audioread(cleanFile);
[data2, Srate2] = audioread(enhancedFile);
info1 = audioinfo(cleanFile);
info2 = audioinfo(enhancedFile);
Nbits1 = info1.BitsPerSample;
Nbits2 = info2.BitsPerSample;

if (Srate1 ~= Srate2) || (Nbits1 ~= Nbits2)
    error('The two files do not match!');
end

% truncate both signals to the common length; eps avoids all-zero frames
len = min( length( data1), length( data2));
data1 = data1( 1: len) + eps;
data2 = data2( 1: len) + eps;

% per-frame measure, averaged over all frames
wss_dist_vec = fwseg( data1, data2, Srate1);
fwseg_dist = mean( wss_dist_vec);
|
||||
|
||||
|
||||
% ----------------------------------------------------------------------
|
||||
|
||||
function distortion = fwseg(clean_speech, processed_speech,sample_rate)
% FWSEG  Frame-by-frame frequency-weighted segmental SNR.
%
%   clean_speech     - reference samples
%   processed_speech - test samples, same length as clean_speech
%   sample_rate      - sampling rate in Hz
%   distortion       - row vector with one value per frame, limited to
%                      the range [-10, 35] dB
%
% Frames are 30 ms long with a hop of a quarter frame. Each frame's
% magnitude spectrum is normalised to unit area and passed through a bank
% of Gaussian-shaped critical-band filters; the per-band SNR is weighted
% by the clean band energy raised to the power gamma. Raises an error if
% the signals differ in length or are too short for a single frame.

% ----------------------------------------------------------------------
% Check the length of the clean and processed speech.  Must be the same.
% ----------------------------------------------------------------------

clean_length = length(clean_speech);
processed_length = length(processed_speech);

if (clean_length ~= processed_length)
    % Previously this only printed a message and returned, which left the
    % output unassigned and produced a confusing secondary error.
    error('fwseg:lengthMismatch', 'Error: Files must have same length.');
end

% Work on column vectors so that frame.*window below stays element-wise.
clean_speech = clean_speech(:);
processed_speech = processed_speech(:);

% ----------------------------------------------------------------------
% Analysis parameters
% ----------------------------------------------------------------------

winlength = round(30*sample_rate/1000);   % window length in samples
skiprate = floor(winlength/4);            % window skip in samples
max_freq = sample_rate/2;                 % maximum bandwidth
num_crit = 25;                            % number of critical bands
USE_25 = 1;                               % 1: 25 bands, 0: lump into 13 bands
n_fft = 2^nextpow2(2*winlength);
n_fftby2 = n_fft/2;                       % FFT size/2
gamma = 0.2;                              % power exponent

% ----------------------------------------------------------------------
% Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz)
% ----------------------------------------------------------------------

cent_freq = [ 50.0000  120.000  190.000  260.000  330.000 ...
             400.000  470.000  540.000  617.372  703.378 ...
             798.717  904.128  1020.38  1148.30  1288.72 ...
             1442.54  1610.70  1794.16  1993.93  2211.08 ...
             2446.71  2701.97  2978.04  3276.17  3597.63];

bandwidth = [70.0000  70.0000  70.0000  70.0000  70.0000 ...
             70.0000  70.0000  77.3724  86.0056  95.3398 ...
             105.411  116.256  127.914  140.423  153.823 ...
             168.154  183.457  199.776  217.153  235.631 ...
             255.255  276.072  298.126  321.465  346.136];

% Articulation index weights. Not used by this variant (the weighting is
% derived from the clean spectrum instead); retained for reference.
W = [0.003 0.003 0.003 0.007 0.010 0.016 0.016 0.017 0.017 0.022 ...
     0.027 0.028 0.030 0.032 0.034 0.035 0.037 0.036 0.036 0.033 ...
     0.030 0.029 0.027 0.026 0.026]; %#ok<NASGU>

if USE_25==0   % use 13 bands
    % ----- lump adjacent filters together ----------------
    k = 2;
    cent_freq2(1) = cent_freq(1);
    bandwidth2(1) = bandwidth(1)+bandwidth(2);
    for i = 2:13
        cent_freq2(i) = cent_freq2(i-1)+bandwidth2(i-1);
        bandwidth2(i) = bandwidth(k)+bandwidth(k+1);
        k = k+2;
    end
    cent_freq = cent_freq2;
    bandwidth = bandwidth2;
    num_crit = length(cent_freq);
end
bw_min = bandwidth(1);                    % minimum critical bandwidth

% ----------------------------------------------------------------------
% Set up the critical band filters.  Note here that Gaussianly shaped
% filters are used.  Also, the sum of the filter weights are equivalent
% for each critical band filter.  Filter less than -30 dB and set to
% zero.
% ----------------------------------------------------------------------

min_factor = exp (-30.0 / (2.0 * 2.303));   % -30 dB point of filter
bins = 0:1:n_fftby2-1;
crit_filter = zeros(num_crit, n_fftby2);

for i = 1:num_crit
    f0 = (cent_freq(i) / max_freq) * (n_fftby2);     % centre, in bins
    bw = (bandwidth(i) / max_freq) * (n_fftby2);     % width, in bins
    norm_factor = log(bw_min) - log(bandwidth(i));
    shape = exp (-11 *(((bins - floor(f0)) ./bw).^2) + norm_factor);
    crit_filter(i,:) = shape.*(shape > min_factor);
end

% ----------------------------------------------------------------------
% Frame loop
% ----------------------------------------------------------------------

num_frames = clean_length/skiprate-(winlength/skiprate);   % number of frames
frame_ids = 1:num_frames;
if isempty(frame_ids)
    error('fwseg:signalTooShort', ...
        'Signal of %d samples is too short for a single %d-sample frame.', ...
        clean_length, winlength);
end

start = 1;                                % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));

distortion = zeros(1, numel(frame_ids));

for frame_count = frame_ids

    % ----------------------------------------------------------
    % (1) Get the frames for the test and reference speech and
    %     multiply by the Hanning window.
    % ----------------------------------------------------------

    clean_frame = clean_speech(start:start+winlength-1).*window;
    processed_frame = processed_speech(start:start+winlength-1).*window;

    % ----------------------------------------------------------
    % (2) Magnitude spectra, normalised to have area of one
    % ----------------------------------------------------------

    clean_spec = abs(fft(clean_frame,n_fft));
    processed_spec = abs(fft(processed_frame,n_fft));

    clean_spec = clean_spec/sum(clean_spec(1:n_fftby2));
    processed_spec = processed_spec/sum(processed_spec(1:n_fftby2));

    % ----------------------------------------------------------
    % (3) Compute filterbank output energies
    % ----------------------------------------------------------

    clean_energy = zeros(1,num_crit);
    processed_energy = zeros(1,num_crit);

    for i = 1:num_crit
        clean_energy(i) = sum(clean_spec(1:n_fftby2) ...
            .*crit_filter(i,:)');
        processed_energy(i) = sum(processed_spec(1:n_fftby2) ...
            .*crit_filter(i,:)');
    end

    % eps keeps the ratio finite when both energies coincide
    error_energy = max((clean_energy-processed_energy).^2,eps);
    W_freq = clean_energy.^gamma;

    SNRlog = 10*log10((clean_energy.^2)./error_energy);

    fwSNR = sum(W_freq.*SNRlog)/sum(W_freq);

    distortion(frame_count) = min(max(fwSNR,-10),35);

    start = start + skiprate;

end
|
||||
|
@ -0,0 +1,493 @@
|
||||
function [SIG,BAK,OVL]= comp_fwseg_mars(cleanFile, enhancedFile)

% ----------------------------------------------------------------------
%
%     MARS Frequency-variant fwSNRseg objective speech quality measure
%
%     This function implements the frequency-variant fwSNRseg measure based
%     on MARS analysis (see Chap. 10, Sec. 10.5.4)
%
%   Usage:  [sig,bak,ovl]=comp_fwseg_mars(cleanFile.wav, enhancedFile.wav)
%
%         cleanFile.wav  - clean input file in .wav format
%         enhancedFile   - enhanced output file in .wav format
%         sig            - predicted rating [1-5] of speech distortion
%         bak            - predicted rating [1-5] of noise distortion
%         ovl            - predicted rating [1-5] of overall quality
%
%   The files are read with audioread; their sampling rates and bit
%   depths must match.
%
%  Example call:  [s,b,o] =comp_fwseg_mars('sp04.wav','enhanced.wav')
%
%  References:
%     [1]  Chapter 10, Sec 10.5.4,
%     [2]  Chapter 11
%
%   Authors: Yi Hu and Philipos C. Loizou
%  (critical-band filtering routines were written by Bryan Pellom & John Hansen)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
% ----------------------------------------------------------------------

if nargin~=2
    fprintf('USAGE: [sig,bak,ovl]=comp_fwseg_mars(cleanFile.wav, enhancedFile.wav)\n');
    fprintf('For more help, type: help comp_fwseg_mars\n\n');
    return;
end

% wavread no longer exists in current MATLAB releases; audioread returns
% the samples and rate, audioinfo supplies the bit depth.
[data1, Srate1] = audioread(cleanFile);
[data2, Srate2] = audioread(enhancedFile);
info1 = audioinfo(cleanFile);
info2 = audioinfo(enhancedFile);
Nbits1 = info1.BitsPerSample;
Nbits2 = info2.BitsPerSample;

if (Srate1 ~= Srate2) || (Nbits1 ~= Nbits2)
    error('The two files do not match!');
end

% truncate both signals to the common length; eps avoids all-zero frames
len = min( length( data1), length( data2));
data1 = data1( 1: len) + eps;
data2 = data2( 1: len) + eps;

% mean of the fwseg output; the first 25 entries of the mean are the
% inputs of the three MARS models below
wss_dist_matrix = fwseg( data1, data2, Srate1);
wss_dist = mean( wss_dist_matrix);

% pass the 25 values as 25 separate arguments
band_args = num2cell( wss_dist( 1: 25));

SIG = sig_mars( band_args{:});
SIG = max(1,SIG);  SIG = min(5, SIG);   % limit values to [1, 5]

BAK = bak_mars( band_args{:});
BAK = max(1,BAK);  BAK = min(5, BAK);   % limit values to [1, 5]

OVL = ovl_mars( band_args{:});
OVL = max(1,OVL);  OVL = min(5, OVL);   % limit values to [1, 5]
|
||||
|
||||
|
||||
%-------------------------------------------------
|
||||
function Y= bak_mars( FWSEG_VA, V5, V6, V7, V8, V9, V10, V11, V12, ...
                      V13, V14, V15, V16, V17, V18, V19, V20, ...
                      V21, V22, V23, V24, V25, V26, V27, V28)
% BAK_MARS  Piecewise-linear regression model producing the BAK score.
%
%   Takes 25 scalar predictors (in this file the caller passes the 25
%   per-band means of the fwseg output, in band order) and returns Y, the
%   unclamped score; the caller limits it to [1, 5].
%
%   Every BFn term is a hinge, max(0, x - knot) or max(0, knot - x),
%   optionally multiplied by an earlier hinge to form an interaction term.
%   Y is a constant plus a weighted sum of the hinges.
%   NOTE(review): the name suggests a fitted MARS model; knots and weights
%   are treated as opaque fitted constants and must not be edited by hand.

hinge = @(x) max(0, x);     % positive part, the only non-linearity used

% ---- basis functions (knots copied verbatim from the fitted model) ----
BF1   = hinge(V21 - 0.282);
BF2   = hinge(FWSEG_VA + 9.094);
BF3   = hinge(-9.094 - FWSEG_VA);
BF5   = hinge(10.089 - V11);
BF7   = hinge(3.624 - V26) * BF3;
BF8   = hinge(V24 - 5.584) * BF5;
BF9   = hinge(5.584 - V24) * BF5;
BF10  = hinge(V19 - 8.030) * BF1;
BF11  = hinge(8.030 - V19) * BF1;
BF12  = hinge(V27 - 4.858) * BF1;
BF13  = hinge(4.858 - V27) * BF1;
BF14  = hinge(FWSEG_VA + 7.282) * BF1;
BF15  = hinge(-7.282 - FWSEG_VA) * BF1;
BF17  = hinge(9.458 - V16) * BF10;
BF18  = hinge(V27 - 10.431) * BF11;
BF19  = hinge(10.431 - V27) * BF11;
BF21  = hinge(11.059 - V22) * BF1;
BF22  = hinge(V26 - 8.675) * BF1;
BF23  = hinge(8.675 - V26) * BF1;
BF25  = hinge(11.195 - V6) * BF10;
BF26  = hinge(V8 - 7.138) * BF1;
BF27  = hinge(7.138 - V8) * BF1;
BF29  = hinge(9.006 - V10) * BF26;
BF30  = hinge(V14 - 8.210) * BF15;
BF35  = hinge(7.026 - V19) * BF15;
BF36  = hinge(V11 - 3.424) * BF27;
BF39  = hinge(5.418 - V17) * BF23;
BF40  = hinge(V28 - 6.813);
BF41  = hinge(6.813 - V28);
BF42  = hinge(V26 - 5.998) * BF14;
BF43  = hinge(5.998 - V26) * BF14;
BF44  = hinge(V5 + 0.206) * BF41;
BF45  = hinge(-0.206 - V5) * BF41;
BF46  = hinge(V22 - 7.901) * BF45;
BF49  = hinge(7.496 - V8) * BF44;
BF51  = hinge(7.904 - V11) * BF45;
BF52  = hinge(V26 - 10.938) * BF27;
BF54  = hinge(V9 - 4.507) * BF26;
BF56  = hinge(V28 - 0.549) * BF15;
BF57  = hinge(0.549 - V28) * BF15;
BF58  = hinge(V25 - 3.252) * BF41;
BF59  = hinge(3.252 - V25) * BF41;
BF60  = hinge(V23 - 7.650) * BF58;
BF61  = hinge(7.650 - V23) * BF58;
BF62  = hinge(V25 - 9.931) * BF44;
BF63  = hinge(9.931 - V25) * BF44;
BF64  = hinge(V25 - 4.923) * BF21;
BF65  = hinge(4.923 - V25) * BF21;
BF67  = hinge(3.746 - V28) * BF10;
BF68  = hinge(V11 - 5.346) * BF41;
BF69  = hinge(5.346 - V11) * BF41;
BF70  = hinge(V12 - 9.026) * BF68;
BF71  = hinge(9.026 - V12) * BF68;
BF73  = hinge(-2.668 - V28) * BF21;
BF74  = hinge(V24 - 7.028) * BF41;
BF75  = hinge(7.028 - V24) * BF41;
BF77  = hinge(-0.224 - V6) * BF74;
BF78  = hinge(V5 - 3.884);
BF79  = hinge(3.884 - V5);
BF80  = hinge(V15 - 5.019) * BF78;
BF83  = hinge(-1.880 - V28) * BF13;
BF84  = hinge(V7 - 3.067) * BF12;
BF85  = hinge(3.067 - V7) * BF12;
BF87  = hinge(5.353 - V6);
BF88  = hinge(V13 - 3.405) * BF9;
BF89  = hinge(3.405 - V13) * BF9;
BF91  = hinge(5.599 - V13) * BF45;
BF92  = hinge(V15 - 9.821) * BF8;
BF94  = hinge(V14 + 2.594) * BF79;
BF97  = hinge(8.635 - V23) * BF94;
BF99  = hinge(1.332 - V24) * BF45;
BF100 = hinge(V7 - 0.209) * BF1;

% ---- weighted sum; term order kept so floating-point results match ----
Y = 2.751 + 0.135 * BF1 - 0.037 * BF2 + 0.328 * BF3 - 0.098 * BF5 ...
    + 0.988 * BF7 + 0.014 * BF8 - 0.034 * BF11 - 0.011 * BF12 ...
    - 0.013 * BF13 - 0.002 * BF17 + 0.014 * BF18 ...
    + 0.004 * BF19 - 0.007 * BF21 - 0.017 * BF22 ...
    - .895791E-03 * BF25 + 0.011 * BF26 - 0.009 * BF27 ...
    - 0.007 * BF29 + 0.052 * BF30 + 0.022 * BF35 ...
    - 0.002 * BF36 - 0.005 * BF39 - 0.059 * BF40 ...
    - 0.050 * BF41 + 0.001 * BF42 + .743730E-03 * BF43 ...
    + 0.011 * BF44 + 0.022 * BF45 + 0.009 * BF46 ...
    + 0.004 * BF49 - 0.005 * BF51 + 0.010 * BF52 ...
    - 0.001 * BF54 - 0.005 * BF56 - 0.015 * BF57 ...
    - 0.032 * BF59 + 0.009 * BF60 - 0.002 * BF61 ...
    - 0.009 * BF62 - 0.001 * BF63 + .819374E-03 * BF64 ...
    + 0.002 * BF65 + 0.003 * BF67 + 0.024 * BF69 ...
    - 0.011 * BF70 - 0.004 * BF71 + 0.013 * BF73 ...
    - 0.026 * BF74 + 0.005 * BF75 + 0.253 * BF77 ...
    - 0.065 * BF78 + 0.014 * BF80 - 0.010 * BF83 ...
    + 0.001 * BF84 + 0.018 * BF85 - 0.050 * BF87 ...
    - 0.002 * BF88 - 0.020 * BF89 + 0.003 * BF91 ...
    - 0.043 * BF92 + .707581E-03 * BF97 - 0.015 * BF99 ...
    - 0.005 * BF100;
|
||||
|
||||
|
||||
function Y= sig_mars( FWSEG_VA, V5, V6, V7, V8, V9, V10, V11, V12, ...
                      V13, V14, V15, V16, V17, V18, V19, V20, ...
                      V21, V22, V23, V24, V25, V26, V27, V28)
% SIG_MARS  Piecewise-linear regression model producing the SIG score.
%
%   Takes 25 scalar predictors (in this file the caller passes the 25
%   per-band means of the fwseg output, in band order) and returns Y, the
%   unclamped score; the caller limits it to [1, 5].
%
%   Every BFn term is a hinge, max(0, x - knot) or max(0, knot - x),
%   optionally multiplied by an earlier hinge to form an interaction term.
%   Y is a constant plus a weighted sum of the hinges.
%   NOTE(review): the name suggests a fitted MARS model; knots and weights
%   are treated as opaque fitted constants and must not be edited by hand.

hinge = @(x) max(0, x);     % positive part, the only non-linearity used

% ---- basis functions (knots copied verbatim from the fitted model) ----
BF1  = hinge(V7 - 9.535);
BF2  = hinge(9.535 - V7);
BF3  = hinge(V27 - 1.578);
BF5  = hinge(V6 - 5.422);
BF6  = hinge(5.422 - V6);
BF8  = hinge(11.333 - V19);
BF10 = hinge(-6.774 - FWSEG_VA);
BF11 = hinge(V10 - 6.255) * BF8;
BF12 = hinge(6.255 - V10) * BF8;
BF13 = hinge(V24 - 3.894);
BF15 = hinge(V5 - 3.884);
BF16 = hinge(3.884 - V5);
BF17 = hinge(V28 - 7.918);
BF18 = hinge(7.918 - V28);
BF19 = hinge(V13 - 6.077) * BF18;
BF20 = hinge(6.077 - V13) * BF18;
BF22 = hinge(6.614 - V20) * BF10;
BF23 = hinge(FWSEG_VA + 0.936) * BF8;
BF25 = hinge(V23 - 5.039);
BF26 = hinge(5.039 - V23);
BF28 = hinge(9.007 - V20) * BF25;
BF29 = hinge(V25 - 7.582);
BF30 = hinge(7.582 - V25);
BF31 = hinge(V11 + 3.336) * BF16;
BF32 = hinge(V26 - 1.877);
BF35 = hinge(-5.749 - FWSEG_VA) * BF6;
BF36 = hinge(V7 - 4.451) * BF29;
BF37 = hinge(4.451 - V7) * BF29;
BF38 = hinge(V14 - 10.158);
BF39 = hinge(10.158 - V14);
BF41 = hinge(7.172 - V17) * BF39;
BF43 = hinge(7.810 - V24) * BF26;
BF44 = hinge(V8 + 1.636) * BF3;
BF45 = hinge(FWSEG_VA - 10.068) * BF39;
BF47 = hinge(V23 - 4.721) * BF30;
BF48 = hinge(4.721 - V23) * BF30;
BF50 = hinge(-2.397 - V24) * BF16;
BF51 = hinge(V14 - 1.428) * BF17;
BF53 = hinge(V16 + 1.940) * BF18;
BF54 = hinge(V10 - 9.442) * BF18;
BF56 = hinge(V10 + 2.144) * BF16;
BF58 = hinge(1.969 - V26) * BF2;
BF59 = hinge(V19 - 6.089) * BF16;
BF62 = hinge(8.952 - V21) * BF15;
BF63 = hinge(V24 - 7.371) * BF3;
BF65 = hinge(V22 - 8.908) * BF6;
BF66 = hinge(8.908 - V22) * BF6;
BF67 = hinge(V27 - 9.485) * BF30;
BF69 = hinge(V18 - 8.608) * BF10;
BF71 = hinge(V13 - 3.374) * BF25;
BF73 = hinge(V14 - 3.616) * BF13;
BF75 = hinge(V18 - 10.321) * BF32;
BF76 = hinge(10.321 - V18) * BF32;
BF78 = hinge(3.972 - V15) * BF26;
BF79 = hinge(V14 - 7.105) * BF26;
BF80 = hinge(7.105 - V14) * BF26;

% ---- weighted sum; term order kept so floating-point results match ----
Y = 2.638 - 0.089 * BF1 + 0.083 * BF5 - 0.162 * BF6 - 0.037 * BF8 ...
    - 0.241 * BF10 + 0.018 * BF11 - 0.008 * BF12 ...
    + 0.059 * BF13 - 0.144 * BF17 - 0.116 * BF18 ...
    + 0.010 * BF19 - 0.012 * BF20 + 0.085 * BF22 ...
    + 0.011 * BF23 + 0.049 * BF25 - 0.159 * BF26 ...
    - 0.016 * BF28 - 0.138 * BF29 + 0.010 * BF31 ...
    + 0.016 * BF35 + 0.018 * BF36 + 0.246 * BF37 ...
    - 0.417 * BF38 + 0.052 * BF39 - 0.005 * BF41 ...
    + 0.021 * BF43 + 0.006 * BF44 - 0.047 * BF45 ...
    - 0.051 * BF47 - 0.014 * BF48 - 0.113 * BF50 ...
    + 0.019 * BF51 + 0.007 * BF53 + 0.017 * BF54 ...
    - 0.007 * BF56 - 0.098 * BF58 + 0.011 * BF59 ...
    - 0.016 * BF62 - 0.012 * BF63 + 0.113 * BF65 ...
    + 0.016 * BF66 + 0.040 * BF67 - 0.065 * BF69 ...
    - 0.018 * BF71 + 0.014 * BF73 - 0.009 * BF75 ...
    - 0.008 * BF76 - 0.032 * BF78 + 0.032 * BF79 ...
    + 0.011 * BF80;
|
||||
|
||||
|
||||
function Y= ovl_mars( FWSEG_VA, V5, V6, V7, V8, V9, V10, V11, V12, ...
                      V13, V14, V15, V16, V17, V18, V19, V20, ...
                      V21, V22, V23, V24, V25, V26, V27, V28)
% OVL_MARS  Piecewise-linear regression model producing the OVL score.
%
%   Takes 25 scalar predictors (in this file the caller passes the 25
%   per-band means of the fwseg output, in band order) and returns Y, the
%   unclamped score; the caller limits it to [1, 5].
%
%   Every BFn term is a hinge, max(0, x - knot) or max(0, knot - x),
%   optionally multiplied by an earlier hinge to form an interaction term.
%   Y is a constant plus a weighted sum of the hinges.
%   NOTE(review): the name suggests a fitted MARS model; knots and weights
%   are treated as opaque fitted constants and must not be edited by hand.

hinge = @(x) max(0, x);     % positive part, the only non-linearity used

% ---- basis functions (knots copied verbatim from the fitted model) ----
BF1   = hinge(V21 - 4.671);
BF3   = hinge(V6 - 5.396);
BF4   = hinge(5.396 - V6);
BF7   = hinge(V11 - 7.884);
BF8   = hinge(7.884 - V11);
BF9   = hinge(FWSEG_VA + 7.229) * BF1;
BF10  = hinge(-7.229 - FWSEG_VA) * BF1;
BF11  = hinge(V19 - 8.128) * BF1;
BF12  = hinge(8.128 - V19) * BF1;
BF13  = hinge(V28 - 7.918);
BF14  = hinge(7.918 - V28);
BF15  = hinge(V5 + 2.888) * BF14;
BF16  = hinge(-2.888 - V5) * BF14;
BF17  = hinge(V24 - 2.924) * BF8;
BF18  = hinge(2.924 - V24) * BF8;
BF20  = hinge(9.071 - V16) * BF15;
BF21  = hinge(V10 - 6.286) * BF14;
BF22  = hinge(6.286 - V10) * BF14;
BF24  = hinge(V23 - 5.173);
BF25  = hinge(5.173 - V23);
BF26  = hinge(V26 - 8.987);
BF29  = hinge(12.216 - V27) * BF3;
BF30  = hinge(V8 - 4.306) * BF16;
BF34  = hinge(V23 - 7.630) * BF21;
BF35  = hinge(7.630 - V23) * BF21;
BF37  = hinge(3.638 - V7) * BF1;
BF39  = hinge(8.337 - V21) * BF17;
BF41  = hinge(1.590 - V5) * BF11;
BF43  = hinge(13.993 - V8) * BF11;
BF44  = hinge(V14 - 5.993) * BF25;
BF45  = hinge(5.993 - V14) * BF25;
BF46  = hinge(V24 - 1.035);
BF47  = hinge(1.035 - V24);
BF49  = hinge(8.915 - V23) * BF12;
BF51  = hinge(-0.004 - FWSEG_VA);
BF52  = hinge(V27 - 6.520) * BF24;
BF53  = hinge(6.520 - V27) * BF24;
BF54  = hinge(V7 - 11.484) * BF8;
BF55  = hinge(11.484 - V7) * BF8;
BF57  = hinge(5.742 - V17) * BF25;
BF58  = hinge(V12 - 6.949) * BF12;
BF59  = hinge(6.949 - V12) * BF12;
BF60  = hinge(V25 - 9.203) * BF45;
BF63  = hinge(1.887 - V13) * BF7;
BF65  = hinge(9.498 - V26) * BF15;
BF66  = hinge(V5 - 6.566) * BF22;
BF71  = hinge(13.239 - V19) * BF46;
BF72  = hinge(V19 - 9.925) * BF55;
BF77  = hinge(3.430 - V22) * BF18;
BF78  = hinge(V27 - 6.513) * BF45;
BF79  = hinge(6.513 - V27) * BF45;
BF81  = hinge(12.511 - V18);
BF82  = hinge(V11 - 6.777) * BF81;
BF83  = hinge(6.777 - V11) * BF81;
BF85  = hinge(3.433 - V5) * BF47;
BF87  = hinge(-3.524 - FWSEG_VA) * BF47;
BF88  = hinge(V27 - 11.604) * BF9;
BF91  = hinge(8.845 - V26) * BF52;
BF92  = hinge(V14 - 5.931) * BF82;
BF93  = hinge(5.931 - V14) * BF82;
BF94  = hinge(V21 - 7.245) * BF25;
BF95  = hinge(7.245 - V21) * BF25;
BF96  = hinge(V14 - 5.323) * BF7;
BF98  = hinge(V10 - 6.248) * BF71;
BF100 = hinge(V18 - 0.602) * BF95;

% ---- weighted sum; term order kept so floating-point results match ----
Y = 2.936 + 0.047 * BF1 + 0.061 * BF3 - 0.084 * BF4 - 0.139 * BF8 ...
    - 0.064 * BF10 - 0.030 * BF12 - 0.103 * BF13 ...
    - 0.039 * BF14 + 0.020 * BF17 - 0.002 * BF20 ...
    - 0.005 * BF22 - 0.114 * BF25 - 0.090 * BF26 ...
    - 0.011 * BF29 + 0.010 * BF30 + 0.009 * BF34 ...
    + 0.002 * BF35 + 0.079 * BF37 - 0.006 * BF39 ...
    + 0.007 * BF41 - 0.003 * BF43 + 0.017 * BF44 ...
    + 0.076 * BF47 + 0.009 * BF49 + 0.016 * BF51 ...
    - 0.042 * BF53 - 0.079 * BF54 - 0.030 * BF57 ...
    - 0.018 * BF58 - 0.009 * BF59 - 0.119 * BF60 ...
    - 0.210 * BF63 - .456802E-03 * BF65 + 0.028 * BF66 ...
    + 0.020 * BF72 + 0.011 * BF77 + 0.005 * BF78 ...
    + 0.003 * BF79 - 0.049 * BF81 + 0.012 * BF83 ...
    - 0.030 * BF85 + 0.070 * BF87 + 0.008 * BF88 ...
    - 0.008 * BF91 + 0.010 * BF92 + 0.003 * BF93 ...
    + 0.022 * BF94 - 0.038 * BF96 + .933766E-03 * BF98 ...
    + 0.002 * BF100;
|
||||
|
||||
|
||||
|
||||
function distortion = fwseg(clean_speech, processed_speech,sample_rate)
% FWSEG  Per-frame, per-critical-band SNR (dB) between two signals.
%
%   distortion = fwseg(clean_speech, processed_speech, sample_rate)
%
%   clean_speech, processed_speech - signals of identical length. Each
%       frame is multiplied element-wise by a column-vector window, so
%       column vectors are presumably expected (TODO confirm with callers).
%   sample_rate - sampling frequency in Hz; it sets the 30 ms frame length
%       and maps the critical-band centre frequencies onto FFT bins.
%   distortion  - num_frames-by-25 matrix; entry (f, b) is
%       10*log10(clean_energy^2 / error_energy) for frame f and band b,
%       limited to the range [-10, 35].
%
%   Raises an error when the two inputs differ in length.

% ----------------------------------------------------------------------
% Check the length of the clean and processed speech. Must be the same.
% ----------------------------------------------------------------------

clean_length = length(clean_speech);
processed_length = length(processed_speech);

if (clean_length ~= processed_length)
    % Fail explicitly: returning here would leave 'distortion' unassigned
    % and surface as an unrelated "output not assigned" error.
    error('fwseg:lengthMismatch', 'Files must have same length.');
end

% ----------------------------------------------------------------------
% Analysis parameters
% ----------------------------------------------------------------------

winlength = round(30*sample_rate/1000);   % window length in samples (30 ms)
skiprate = floor(winlength/4);            % window skip in samples
max_freq = sample_rate/2;                 % maximum bandwidth
num_crit = 25;                            % number of critical bands

n_fft = 2^nextpow2(2*winlength);
n_fftby2 = n_fft/2;                       % FFT size/2

% ----------------------------------------------------------------------
% Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz)
% ----------------------------------------------------------------------

cent_freq = [ 50.0000  120.000  190.000  260.000  330.000 ...
              400.000  470.000  540.000  617.372  703.378 ...
              798.717  904.128  1020.38  1148.30  1288.72 ...
              1442.54  1610.70  1794.16  1993.93  2211.08 ...
              2446.71  2701.97  2978.04  3276.17  3597.63 ];

bandwidth = [ 70.0000  70.0000  70.0000  70.0000  70.0000 ...
              70.0000  70.0000  77.3724  86.0056  95.3398 ...
              105.411  116.256  127.914  140.423  153.823 ...
              168.154  183.457  199.776  217.153  235.631 ...
              255.255  276.072  298.126  321.465  346.136 ];

bw_min = bandwidth (1);                   % minimum critical bandwidth

% ----------------------------------------------------------------------
% Set up the critical band filters. Gaussian-shaped filters are used,
% scaled by bw_min/bandwidth(i) through norm_factor. Filter weights at
% or below min_factor (the -30 dB point) are set to zero.
% ----------------------------------------------------------------------

min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter

crit_filter = zeros(num_crit, n_fftby2);  % one row of bin weights per band
j = 0:1:n_fftby2-1;                       % FFT bin indices (loop invariant)
for i = 1:num_crit
    f0 = (cent_freq (i) / max_freq) * (n_fftby2);   % centre, in bins
    bw = (bandwidth (i) / max_freq) * (n_fftby2);   % bandwidth, in bins
    norm_factor = log(bw_min) - log(bandwidth(i));
    crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor);
    crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor);
end

% ----------------------------------------------------------------------
% For each frame of input speech, calculate the per-band SNR
% ----------------------------------------------------------------------

num_frames = floor(clean_length/skiprate-(winlength/skiprate)); % number of frames
start = 1;                                % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));

distortion=zeros(num_frames,num_crit);
for frame_count = 1:num_frames

    % ----------------------------------------------------------
    % (1) Get the Frames for the test and reference speech.
    %     Multiply by Hanning Window.
    % ----------------------------------------------------------

    clean_frame = clean_speech(start:start+winlength-1);
    processed_frame = processed_speech(start:start+winlength-1);
    clean_frame = clean_frame.*window;
    processed_frame = processed_frame.*window;

    % ----------------------------------------------------------
    % (2) Compute the magnitude Spectrum of Clean and Processed
    % ----------------------------------------------------------

    clean_spec = abs(fft(clean_frame,n_fft));
    processed_spec = abs(fft(processed_frame,n_fft));

    % normalize so that spectra have unit area ----
    clean_spec=clean_spec/sum(clean_spec(1:n_fftby2));
    processed_spec=processed_spec/sum(processed_spec(1:n_fftby2));

    % ----------------------------------------------------------
    % (3) Compute Filterbank Output Energies
    % ----------------------------------------------------------

    clean_energy=zeros(1,num_crit);
    processed_energy=zeros(1,num_crit);
    error_energy=zeros(1,num_crit);

    for i = 1:num_crit
        clean_energy(i) = sum(clean_spec(1:n_fftby2) ...
            .*crit_filter(i,:)');
        processed_energy(i) = sum(processed_spec(1:n_fftby2) ...
            .*crit_filter(i,:)');
        % floor at eps so the ratio below never divides by zero
        error_energy(i)=max((clean_energy(i)-processed_energy(i))^2,eps);
    end

    SNRlog=10*log10((clean_energy.^2)./error_energy);

    % limit each band's value to [-10, 35]
    distortion(frame_count,:)=min(max(SNRlog,-10),35);

    start = start + skiprate;

end
|
||||
|
@ -0,0 +1,221 @@
|
||||
function [SIG,BAK,OVL]= comp_fwseg_variant(cleanFile, enhancedFile)

% ----------------------------------------------------------------------
%    Frequency-variant fwSNRseg Objective Speech Quality Measure
%
%   This function implements the frequency-variant fwSNRseg measure [1]
%   (see also Chap. 10, Eq. 10.24)
%
%
%   Usage:  [sig,bak,ovl]=comp_fwseg_variant(cleanFile.wav, enhancedFile.wav)
%
%         cleanFile.wav - clean input file in .wav format
%         enhancedFile  - enhanced output file in .wav format
%         sig           - predicted rating [1-5] of speech distortion
%         bak           - predicted rating [1-5] of noise distortion
%         ovl           - predicted rating [1-5] of overall quality
%
%   Both files must have the same sampling rate and bits per sample,
%   otherwise an error is raised. The longer file is truncated to the
%   length of the shorter one before comparison.
%
%   Example call:  [s,b,o] =comp_fwseg_variant('sp04.wav','enhanced.wav')
%
%
%  References:
%     [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements,
%	    Objective Measures of Speech Quality.  Prentice Hall
%	    Advanced Reference Series, Englewood Cliffs, NJ, 1988,
%	    ISBN: 0-13-629056-6.
%
%  Author: Philipos C. Loizou
%  (critical-band filtering routines were written by Bryan Pellom & John Hansen)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
% ----------------------------------------------------------------------

if nargin~=2
    fprintf('USAGE: [sig,bak,ovl]=comp_fwseg_variant(cleanFile.wav, enhancedFile.wav)\n');
    fprintf('For more help, type: help comp_fwseg_variant\n\n');
    return;
end

% wavread is no longer available in current MATLAB; audioread returns the
% samples and rate, audioinfo supplies the bit depth.
[data1, Srate1]= audioread(cleanFile);
[data2, Srate2]= audioread(enhancedFile);
info1= audioinfo(cleanFile);
info2= audioinfo(enhancedFile);
Nbits1= info1.BitsPerSample;
Nbits2= info2.BitsPerSample;

if ( Srate1~= Srate2) || ( Nbits1~= Nbits2)
    % single-argument error() does not expand escapes, so no trailing '\n'
    error( 'The two files do not match!');
end

% Compare only the overlapping part; eps keeps all-zero frames from
% producing a zero spectrum sum inside fwseg.
len= min( length( data1), length( data2));
data1= data1( 1: len)+eps;
data2= data2( 1: len)+eps;

wss_dist_matrix= fwseg( data1, data2,Srate1);   % frames x 25 bands
wss_dist=mean(wss_dist_matrix);                 % per-band mean over frames

% initialize coefficients obtained from multiple linear
% regression analysis
%
b_sig=[0.021,-0.028,0.088,-0.031,0.048,-0.049,0.065,0.009,0.011,0.033,...
    -0.040,-0.002,0.041,-0.007,0.033,0.018,-0.007,0.044,-0.001,0.021,...
    -0.002,0.017,-0.03,0.073,0.043];
b_ovl=[-0.003,-0.026,0.066,-0.036,0.038,-0.023,0.037,0.022,0.014,0.009,...
    -0.03,0.004,0.044,-0.005,0.017,0.018,-0.001,0.051,0.009,0.011,...
    0.011,-0.002,-0.021,0.043,0.031];
b_bak=[-0.03,-0.022,0.03,-0.048,0.034,0.002,0.006,0.037,0.017,-0.016,-0.008,...
    0.019,0.024,-0.002,0.01,0.03,-0.018,0.046,0.022,0.005,0.03,-0.028,...
    -0.028,0.019,0.005];

SIG=0.567+sum(b_sig.*wss_dist);
SIG=max(1,SIG); SIG=min(5, SIG); % limit values to [1, 5]

BAK=1.013+sum(b_bak.*wss_dist);
BAK=max(1,BAK); BAK=min(5, BAK); % limit values to [1, 5]

OVL=0.446+sum(b_ovl.*wss_dist);
OVL=max(1,OVL); OVL=min(5, OVL); % limit values to [1, 5]
|
||||
|
||||
|
||||
% ----------------------------------------------------------------------
|
||||
|
||||
function distortion = fwseg(clean_speech, processed_speech,sample_rate)
% FWSEG  Per-frame, per-critical-band SNR (dB) between two signals.
%
%   distortion = fwseg(clean_speech, processed_speech, sample_rate)
%
%   clean_speech, processed_speech - signals of identical length. Each
%       frame is multiplied element-wise by a column-vector window, so
%       column vectors are presumably expected (TODO confirm with callers).
%   sample_rate - sampling frequency in Hz; it sets the 30 ms frame length
%       and maps the critical-band centre frequencies onto FFT bins.
%   distortion  - num_frames-by-25 matrix; entry (f, b) is
%       10*log10(clean_energy^2 / error_energy) for frame f and band b,
%       limited to the range [-10, 35].
%
%   Raises an error when the two inputs differ in length.

% ----------------------------------------------------------------------
% Check the length of the clean and processed speech. Must be the same.
% ----------------------------------------------------------------------

clean_length = length(clean_speech);
processed_length = length(processed_speech);

if (clean_length ~= processed_length)
    % Fail explicitly: returning here would leave 'distortion' unassigned
    % and surface as an unrelated "output not assigned" error.
    error('fwseg:lengthMismatch', 'Files must have same length.');
end

% ----------------------------------------------------------------------
% Analysis parameters
% ----------------------------------------------------------------------

winlength = round(30*sample_rate/1000);   % window length in samples (30 ms)
skiprate = floor(winlength/4);            % window skip in samples
max_freq = sample_rate/2;                 % maximum bandwidth
num_crit = 25;                            % number of critical bands

n_fft = 2^nextpow2(2*winlength);
n_fftby2 = n_fft/2;                       % FFT size/2

% ----------------------------------------------------------------------
% Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz)
% ----------------------------------------------------------------------

cent_freq = [ 50.0000  120.000  190.000  260.000  330.000 ...
              400.000  470.000  540.000  617.372  703.378 ...
              798.717  904.128  1020.38  1148.30  1288.72 ...
              1442.54  1610.70  1794.16  1993.93  2211.08 ...
              2446.71  2701.97  2978.04  3276.17  3597.63 ];

bandwidth = [ 70.0000  70.0000  70.0000  70.0000  70.0000 ...
              70.0000  70.0000  77.3724  86.0056  95.3398 ...
              105.411  116.256  127.914  140.423  153.823 ...
              168.154  183.457  199.776  217.153  235.631 ...
              255.255  276.072  298.126  321.465  346.136 ];

bw_min = bandwidth (1);                   % minimum critical bandwidth

% ----------------------------------------------------------------------
% Set up the critical band filters. Gaussian-shaped filters are used,
% scaled by bw_min/bandwidth(i) through norm_factor. Filter weights at
% or below min_factor (the -30 dB point) are set to zero.
% ----------------------------------------------------------------------

min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter

crit_filter = zeros(num_crit, n_fftby2);  % one row of bin weights per band
j = 0:1:n_fftby2-1;                       % FFT bin indices (loop invariant)
for i = 1:num_crit
    f0 = (cent_freq (i) / max_freq) * (n_fftby2);   % centre, in bins
    bw = (bandwidth (i) / max_freq) * (n_fftby2);   % bandwidth, in bins
    norm_factor = log(bw_min) - log(bandwidth(i));
    crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor);
    crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor);
end

% ----------------------------------------------------------------------
% For each frame of input speech, calculate the per-band SNR
% ----------------------------------------------------------------------

num_frames = floor(clean_length/skiprate-(winlength/skiprate)); % number of frames
start = 1;                                % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));

distortion=zeros(num_frames,num_crit);
for frame_count = 1:num_frames

    % ----------------------------------------------------------
    % (1) Get the Frames for the test and reference speech.
    %     Multiply by Hanning Window.
    % ----------------------------------------------------------

    clean_frame = clean_speech(start:start+winlength-1);
    processed_frame = processed_speech(start:start+winlength-1);
    clean_frame = clean_frame.*window;
    processed_frame = processed_frame.*window;

    % ----------------------------------------------------------
    % (2) Compute the magnitude Spectrum of Clean and Processed
    % ----------------------------------------------------------

    clean_spec = abs(fft(clean_frame,n_fft));
    processed_spec = abs(fft(processed_frame,n_fft));

    % normalize so that spectra have unit area ----
    clean_spec=clean_spec/sum(clean_spec(1:n_fftby2));
    processed_spec=processed_spec/sum(processed_spec(1:n_fftby2));

    % ----------------------------------------------------------
    % (3) Compute Filterbank Output Energies
    % ----------------------------------------------------------

    clean_energy=zeros(1,num_crit);
    processed_energy=zeros(1,num_crit);
    error_energy=zeros(1,num_crit);

    for i = 1:num_crit
        clean_energy(i) = sum(clean_spec(1:n_fftby2) ...
            .*crit_filter(i,:)');
        processed_energy(i) = sum(processed_spec(1:n_fftby2) ...
            .*crit_filter(i,:)');
        % floor at eps so the ratio below never divides by zero
        error_energy(i)=max((clean_energy(i)-processed_energy(i))^2,eps);
    end

    SNRlog=10*log10((clean_energy.^2)./error_energy);

    % limit each band's value to [-10, 35]
    distortion(frame_count,:)=min(max(SNRlog,-10),35);

    start = start + skiprate;

end
|
||||
|
@ -0,0 +1,188 @@
|
||||
function is_mean= comp_is(cleanFile, enhdFile)
% ----------------------------------------------------------------------
%          Itakura-Saito (IS) Objective Speech Quality Measure
%
%   This function implements the Itakura-Saito distance measure
%   defined on page 50 of [1] (see Equation 2.26).  See also
%   Equation 12 (page 1480) of [2].
%
%   Usage:  IS=comp_is(cleanFile.wav, enhancedFile.wav)
%
%         cleanFile.wav - clean input file in .wav format
%         enhancedFile  - enhanced output file in .wav format
%         IS            - computed Itakura Saito measure
%
%         Note that the IS measure is limited in the range [0, 100].
%
%   Both files must have the same sampling rate and bits per sample,
%   otherwise an error is raised. The longer file is truncated to the
%   length of the shorter one. The result is the mean of the smallest
%   95% of the per-frame IS values (alpha = 0.95).
%
%  Example call:  IS =comp_is('sp04.wav','enhanced.wav')
%
%
%  References:
%
%     [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements,
%	    Objective Measures of Speech Quality.  Prentice Hall
%	    Advanced Reference Series, Englewood Cliffs, NJ, 1988,
%	    ISBN: 0-13-629056-6.
%
%     [2] B.-H. Juang, "On Using the Itakura-Saito Measures for
%	    Speech Coder Performance Evaluation", AT&T Bell
%	    Laboratories Technical Journal, Vol. 63, No. 8,
%	    October 1984, pp. 1477-1498.
%
%  Authors: Bryan L. Pellom and John H. L. Hansen (July 1998)
%  Modified by: Philipos C. Loizou  (Oct 2006) - limited IS to be in [0,100]
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $

% ----------------------------------------------------------------------

if nargin~=2
    fprintf('USAGE: IS=comp_is(cleanFile.wav, enhancedFile.wav)\n');
    fprintf('For more help, type: help comp_is\n\n');
    return;
end

alpha=0.95;   % fraction of frames (lowest distortion first) kept in the mean

% wavread is no longer available in current MATLAB; audioread returns the
% samples and rate, audioinfo supplies the bit depth.
[data1, Srate1]= audioread(cleanFile);
[data2, Srate2]= audioread(enhdFile);
info1= audioinfo(cleanFile);
info2= audioinfo(enhdFile);
Nbits1= info1.BitsPerSample;
Nbits2= info2.BitsPerSample;

if ( Srate1~= Srate2) || ( Nbits1~= Nbits2)
    % single-argument error() does not expand escapes, so no trailing '\n'
    error( 'The two files do not match!');
end

% Compare only the overlapping part; eps avoids exactly-zero samples.
len= min( length( data1), length( data2));
data1= data1( 1: len)+eps;
data2= data2( 1: len)+eps;

IS_dist= is( data1, data2,Srate1);        % one IS value per frame

% Discard the largest (1-alpha) share of frame values before averaging.
IS_len= round( length( IS_dist)* alpha);
IS= sort( IS_dist);

is_mean= mean( IS( 1: IS_len));
|
||||
|
||||
|
||||
|
||||
function distortion = is(clean_speech, processed_speech,sample_rate)
% IS  Per-frame Itakura-Saito distortion between two equal-length signals.
%
%   distortion = is(clean_speech, processed_speech, sample_rate)
%
%   clean_speech, processed_speech - signals of identical length. Each
%       frame is multiplied element-wise by a column-vector window, so
%       column vectors are presumably expected (TODO confirm with callers).
%   sample_rate - sampling frequency in Hz; it sets the 30 ms frame length
%       and the LPC order (10 below 10 kHz, otherwise 16).
%   distortion  - 1-by-num_frames row vector of IS values, each capped
%       at 100. Empty when the signal is too short for a single frame.
%
%   Raises an error when the two inputs differ in length.

% ----------------------------------------------------------------------
% Check the length of the clean and processed speech.  Must be the same.
% ----------------------------------------------------------------------

clean_length      = length(clean_speech);
processed_length  = length(processed_speech);

if (clean_length ~= processed_length)
    % Fail explicitly: returning here would leave 'distortion' unassigned
    % and surface as an unrelated "output not assigned" error.
    error('is:lengthMismatch', 'Both Speech Files must be same length.');
end

% ----------------------------------------------------------------------
% Dynamic-range scaling and DC removal are currently DISABLED; the
% original statements are kept below for reference.
% ----------------------------------------------------------------------

%clean_speech     = clean_speech     - mean(clean_speech);
%processed_speech = processed_speech - mean(processed_speech);

%processed_speech = processed_speech.*(max(abs(clean_speech))/ max(abs(processed_speech)));

% ----------------------------------------------------------------------
% Analysis parameters
% ----------------------------------------------------------------------

%sample_rate = 8000;				% default sample rate
winlength = round(30*sample_rate/1000);   % window length in samples (30 ms)
skiprate  = floor(winlength/4);           % window skip in samples
if sample_rate<10000
    P = 10;                               % LPC Analysis Order
else
    P = 16;                               % this could vary depending on sampling frequency.
end

% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Itakura-Saito Measure
% ----------------------------------------------------------------------

% floor() gives the same iteration count the colon operator used to derive
% from the fractional value, and allows preallocation of the result.
num_frames = floor(clean_length/skiprate-(winlength/skiprate)); % number of frames
start      = 1;                           % starting sample
window     = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));

distortion = zeros(1, max(num_frames, 0)); % defined even with zero frames

for frame_count = 1:num_frames

    % ----------------------------------------------------------
    % (1) Get the Frames for the test and reference speech.
    %     Multiply by Hanning Window.
    % ----------------------------------------------------------

    clean_frame = clean_speech(start:start+winlength-1);
    processed_frame = processed_speech(start:start+winlength-1);
    clean_frame = clean_frame.*window;
    processed_frame = processed_frame.*window;

    % ----------------------------------------------------------
    % (2) Get the autocorrelation lags and LPC parameters used
    %     to compute the IS measure. The reflection coefficients
    %     returned by lpcoeff are not needed here.
    % ----------------------------------------------------------

    [R_clean, ~, A_clean] = ...
        lpcoeff(clean_frame, P);
    [R_processed, ~, A_processed] = ...
        lpcoeff(processed_frame, P);

    % ----------------------------------------------------------
    % (3) Compute the IS measure
    % ----------------------------------------------------------

    numerator      = A_processed*toeplitz(R_clean)*A_processed';
    denominator    = max(A_clean*toeplitz(R_clean)*A_clean',eps);
    gain_clean     = max(R_clean*A_clean',eps);         % this is gain
    gain_processed = max(R_processed*A_processed',eps); % squared (sigma^2)

    ISvalue=(gain_clean/gain_processed)*(numerator/denominator) + ...
        log(gain_processed/gain_clean)-1;

    distortion(frame_count) = min(ISvalue,100);         % cap at 100
    start = start + skiprate;

end
|
||||
|
||||
|
||||
|
||||
function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order)
% LPCOEFF  Autocorrelation-method linear prediction for a single frame.
%
%   [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order)
%
%   speech_frame - vector holding the (already windowed) samples
%   model_order  - order P of the linear predictor
%
%   acorr    - 1 x (P+1) row of autocorrelation sums for lags 0..P
%   refcoeff - 1 x P row of reflection coefficients
%   lpparams - 1 x (P+1) row [1, -a(1), ..., -a(P)]

% ----------------------------------------------------------
% (1) Autocorrelation: lag-shifted products summed over the frame
% ----------------------------------------------------------

n_samples = max(size(speech_frame));
for lag = 0:model_order
    head = speech_frame(1:n_samples-lag);
    tail = speech_frame(lag+1:n_samples);
    acorr(lag+1) = sum(head.*tail);
end

% ----------------------------------------------------------
% (2) Levinson-Durbin recursion
% ----------------------------------------------------------

pred = ones(1, model_order);     % every entry is overwritten below
err = acorr(1);                  % prediction error energy, order 0
for m = 1:model_order
    prev = pred(1:m-1);          % predictor of order m-1
    k = (acorr(m+1) - sum(prev.*acorr(m:-1:2))) / err;
    refcoeff(m) = k;
    pred(m) = k;
    pred(1:m-1) = prev - k.*prev(m-1:-1:1);
    err = (1 - k*k)*err;         % error energy after order m
end

lpparams = [1 -pred];
|
||||
|
||||
|
||||
|
@ -0,0 +1,162 @@
|
||||
function llr_mean = comp_llr(cleanFile, enhancedFile)
% ----------------------------------------------------------------------
%
%      Log Likelihood Ratio (LLR) Objective Speech Quality Measure
%
%   This function implements the Log Likelihood Ratio Measure
%   defined on page 48 of [1] (see Equation 2.18).
%
%   Usage:  llr=comp_llr(cleanFile.wav, enhancedFile.wav)
%
%         cleanFile.wav - clean input file in .wav format
%         enhancedFile  - enhanced output file in .wav format
%         llr           - mean of the lowest 95% of the per-frame LLR
%                         values
%
%   The per-frame LLR values are capped at 2 before averaging.
%
%   The files are read with audioread/audioinfo (wavread is no longer
%   available in current MATLAB and Octave releases).  Both files must
%   have the same sampling rate and bit depth.
%
%  Example call:  llr =comp_llr('sp04.wav','enhanced.wav')
%
%  References:
%
%     [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements,
%         Objective Measures of Speech Quality.  Prentice Hall
%         Advanced Reference Series, Englewood Cliffs, NJ, 1988,
%         ISBN: 0-13-629056-6.
%
%  Authors: Bryan L. Pellom and John H. L. Hansen (July 1998)
%  Modified by: Philipos C. Loizou  (Oct 2006) - limited LLR to be in [0,2]
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
% ----------------------------------------------------------------------

if nargin~=2
    fprintf('USAGE: LLR=comp_llr(cleanFile.wav, enhancedFile.wav)\n');
    fprintf('For more help, type: help comp_llr\n\n');
    return;
end

alpha = 0.95;   % fraction of (lowest) frame scores kept for the mean

% audioread returns samples normalised to [-1, 1] like wavread did;
% the bit depth now comes from audioinfo.
[data1, Srate1] = audioread(cleanFile);
[data2, Srate2] = audioread(enhancedFile);
info1 = audioinfo(cleanFile);
info2 = audioinfo(enhancedFile);
Nbits1 = info1.BitsPerSample;
Nbits2 = info2.BitsPerSample;

if (Srate1 ~= Srate2) || (Nbits1 ~= Nbits2)
    error('comp_llr:fileMismatch', 'The two files do not match!');
end

% Truncate to the common length; eps keeps all-zero frames from
% producing a zero autocorrelation in the LPC analysis.
len = min(length(data1), length(data2));
data1 = data1(1:len) + eps;
data2 = data2(1:len) + eps;

llr_dist = llr(data1, data2, Srate1);

% Discard the largest 5% of the frame scores before averaging.
n_keep = round(length(llr_dist)*alpha);
llr_sorted = sort(llr_dist);

llr_mean = mean(llr_sorted(1:n_keep));
|
||||
|
||||
|
||||
|
||||
function distortion = llr(clean_speech, processed_speech,sample_rate)
% LLR  Per-frame Log Likelihood Ratio between two speech signals.
%
%   distortion = llr(clean_speech, processed_speech, sample_rate)
%
%   clean_speech, processed_speech - vectors of equal length
%   sample_rate                    - sampling rate in Hz
%
%   distortion - 1 x num_frames row of LLR values, each capped at 2.
%                Empty when the signals are shorter than one 30 ms
%                analysis window.
%
%   Raises an error when the two signals differ in length.

% Row-vector input would otherwise be broadcast against the column
% window into a matrix, so bring vectors into column orientation.
if size(clean_speech,1) == 1
    clean_speech = clean_speech.';
end
if size(processed_speech,1) == 1
    processed_speech = processed_speech.';
end

% ----------------------------------------------------------------------
% Check the length of the clean and processed speech.  Must be the same.
% ----------------------------------------------------------------------

clean_length = length(clean_speech);
processed_length = length(processed_speech);

if (clean_length ~= processed_length)
    error('llr:lengthMismatch', ...
        'Both speech signals must be the same length (%d vs %d).', ...
        clean_length, processed_length);
end

% ----------------------------------------------------------------------
% Analysis parameters
% ----------------------------------------------------------------------

winlength = round(30*sample_rate/1000);   % window length in samples (30 ms)
skiprate = floor(winlength/4);            % window skip in samples
if sample_rate<10000
    P = 10;                               % LPC analysis order
else
    P = 16;                               % higher order for wider bandwidth
end

% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Log Likelihood Ratio
% ----------------------------------------------------------------------

% Same expression as before, floored explicitly so the frame count is an
% integer that can be used for preallocation.
num_frames = floor(clean_length/skiprate-(winlength/skiprate));
start = 1;                                % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));

distortion = zeros(1, max(num_frames, 0));

for frame_count = 1:num_frames

    % (1) Extract the frames and apply the Hanning window.
    clean_frame = clean_speech(start:start+winlength-1).*window;
    processed_frame = processed_speech(start:start+winlength-1).*window;

    % (2) Autocorrelation lags and LPC parameters of both frames.
    [R_clean, Ref_clean, A_clean] = lpcoeff(clean_frame, P);
    [R_processed, Ref_processed, A_processed] = lpcoeff(processed_frame, P);

    % (3) LLR: both predictors are evaluated against the autocorrelation
    %     matrix of the clean frame.
    R_mat = toeplitz(R_clean);
    numerator = A_processed*R_mat*A_processed';
    denominator = A_clean*R_mat*A_clean';
    distortion(frame_count) = min(2,log(numerator/denominator));

    start = start + skiprate;

end
|
||||
|
||||
|
||||
function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order)
% LPCOEFF  Autocorrelation-method linear prediction for a single frame.
%
%   [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order)
%
%   speech_frame - vector holding the (already windowed) samples
%   model_order  - order P of the linear predictor
%
%   acorr    - 1 x (P+1) row of autocorrelation sums for lags 0..P
%   refcoeff - 1 x P row of reflection coefficients
%   lpparams - 1 x (P+1) row [1, -a(1), ..., -a(P)]

% ----------------------------------------------------------
% (1) Autocorrelation: lag-shifted products summed over the frame
% ----------------------------------------------------------

n_samples = max(size(speech_frame));
for lag = 0:model_order
    head = speech_frame(1:n_samples-lag);
    tail = speech_frame(lag+1:n_samples);
    acorr(lag+1) = sum(head.*tail);
end

% ----------------------------------------------------------
% (2) Levinson-Durbin recursion
% ----------------------------------------------------------

pred = ones(1, model_order);     % every entry is overwritten below
err = acorr(1);                  % prediction error energy, order 0
for m = 1:model_order
    prev = pred(1:m-1);          % predictor of order m-1
    k = (acorr(m+1) - sum(prev.*acorr(m:-1:2))) / err;
    refcoeff(m) = k;
    pred(m) = k;
    pred(1:m-1) = prev - k.*prev(m-1:-1:1);
    err = (1 - k*k)*err;         % error energy after order m
end

lpparams = [1 -pred];
|
||||
|
||||
|
||||
|
@ -0,0 +1,132 @@
|
||||
function [snr_mean, segsnr_mean] = comp_SNR(cleanFile, enhdFile)
%
%   Segmental Signal-to-Noise Ratio Objective Speech Quality Measure
%
%     This function implements the segmental signal-to-noise ratio
%     as defined in [1, p. 45] (see Equation 2.12).
%
%   Usage:  [SNRovl, SNRseg]=comp_SNR(cleanFile.wav, enhancedFile.wav)
%
%         cleanFile.wav - clean input file in .wav format
%         enhancedFile  - enhanced output file in .wav format
%         SNRovl        - overall SNR (dB)
%         SNRseg        - segmental SNR (dB), mean over all frames
%
%     This function returns 2 parameters.  The first item is the
%     overall SNR for the two speech signals.  The second value
%     is the mean of the per-frame segmental signal-to-noise ratios.
%     Each per-frame value is clamped to the range between 35dB and
%     -10dB (see suggestions in [2]).
%
%     The files are read with audioread/audioinfo (wavread is no longer
%     available in current MATLAB and Octave releases).  Both files must
%     have the same sampling rate and bit depth.
%
%   Example call:  [SNRovl,SNRseg]=comp_SNR('sp04.wav','enhanced.wav')
%
%   References:
%
%     [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements,
%         Objective Measures of Speech Quality.  Prentice Hall
%         Advanced Reference Series, Englewood Cliffs, NJ, 1988,
%         ISBN: 0-13-629056-6.
%
%     [2] P. E. Papamichalis, Practical Approaches to Speech
%         Coding, Prentice-Hall, Englewood Cliffs, NJ, 1987.
%         ISBN: 0-13-689019-9. (see pages 179-181).
%
%  Authors: Bryan L. Pellom and John H. L. Hansen (July 1998)
%  Modified by: Philipos C. Loizou  (Oct 2006)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
%-------------------------------------------------------------------------

if nargin ~=2
    fprintf('USAGE: [snr_mean, segsnr_mean]= comp_SNR(cleanFile, enhdFile) \n');
    return;
end

% audioread returns samples normalised to [-1, 1] like wavread did;
% the bit depth now comes from audioinfo.
[data1, Srate1] = audioread(cleanFile);
[data2, Srate2] = audioread(enhdFile);
info1 = audioinfo(cleanFile);
info2 = audioinfo(enhdFile);
Nbits1 = info1.BitsPerSample;
Nbits2 = info2.BitsPerSample;

if (Srate1 ~= Srate2) || (Nbits1 ~= Nbits2)
    error('comp_SNR:fileMismatch', 'The two files do not match!');
end

% Compare only the part both files have in common.
len = min(length(data1), length(data2));
data1 = data1(1:len);
data2 = data2(1:len);

[snr_dist, segsnr_dist] = snr(data1, data2, Srate1);

snr_mean = snr_dist;
segsnr_mean = mean(segsnr_dist);
|
||||
|
||||
|
||||
% =========================================================================
|
||||
function [overall_snr, segmental_snr] = snr(clean_speech, processed_speech,sample_rate)
% SNR  Overall and per-frame segmental SNR between two signals.
%
%   [overall_snr, segmental_snr] = snr(clean_speech, processed_speech, sample_rate)
%
%   clean_speech, processed_speech - vectors of equal length
%   sample_rate                    - sampling rate in Hz
%
%   overall_snr   - 10*log10 of clean energy over error energy (dB),
%                   computed on the whole signal
%   segmental_snr - 1 x num_frames row of per-frame SNR values in dB,
%                   each clamped to [-10, 35].  Empty when the signals
%                   are shorter than one 30 ms analysis window.
%
%   The signals are used as given: no DC removal or level scaling is
%   applied.  Raises an error when the two signals differ in length.

% Row-vector input would otherwise be broadcast against the column
% window into a matrix, so bring vectors into column orientation.
if size(clean_speech,1) == 1
    clean_speech = clean_speech.';
end
if size(processed_speech,1) == 1
    processed_speech = processed_speech.';
end

% ----------------------------------------------------------------------
% Check the length of the clean and processed speech.  Must be the same.
% ----------------------------------------------------------------------

clean_length = length(clean_speech);
processed_length = length(processed_speech);

if (clean_length ~= processed_length)
    error('snr:lengthMismatch', ...
        'Both speech signals must be the same length (%d vs %d).', ...
        clean_length, processed_length);
end

overall_snr = 10* log10( sum(clean_speech.^2)/sum((clean_speech-processed_speech).^2));

% ----------------------------------------------------------------------
% Analysis parameters
% ----------------------------------------------------------------------

winlength = round(30*sample_rate/1000);   % window length in samples (30 ms)
skiprate = floor(winlength/4);            % window skip in samples
MIN_SNR = -10;                            % minimum SNR in dB
MAX_SNR = 35;                             % maximum SNR in dB

% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Segmental SNR
% ----------------------------------------------------------------------

% Same expression as before, floored explicitly so the frame count is an
% integer that can be used for preallocation.
num_frames = floor(clean_length/skiprate-(winlength/skiprate));
start = 1;                                % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));

segmental_snr = zeros(1, max(num_frames, 0));

for frame_count = 1:num_frames

    % (1) Extract the frames and apply the Hanning window.
    clean_frame = clean_speech(start:start+winlength-1).*window;
    processed_frame = processed_speech(start:start+winlength-1).*window;

    % (2) Frame SNR; eps guards the division and the logarithm.
    signal_energy = sum(clean_frame.^2);
    noise_energy = sum((clean_frame-processed_frame).^2);
    frame_snr = 10*log10(signal_energy/(noise_energy+eps)+eps);
    frame_snr = max(frame_snr,MIN_SNR);
    segmental_snr(frame_count) = min(frame_snr,MAX_SNR);

    start = start + skiprate;

end
|
||||
|
@ -0,0 +1,299 @@
|
||||
function wss_dist = comp_wss(cleanFile, enhancedFile)
% ----------------------------------------------------------------------
%
%     Weighted Spectral Slope (WSS) Objective Speech Quality Measure
%
%     This function implements the Weighted Spectral Slope (WSS)
%     distance measure originally proposed in [1].  The algorithm
%     works by first decomposing the speech signal into a set of
%     frequency bands (this is done for both the test and reference
%     frame).  The intensities within each critical band are
%     measured.  Then, weighted distances between the measured
%     slopes of the log-critical band spectra are computed.
%     This measure is also described in Section 2.2.9 (pages 56-58)
%     of [2].
%
%     Whereas Klatt's original measure used 36 critical-band
%     filters to estimate the smoothed short-time spectrum, this
%     implementation considers a bank of 25 filters spanning
%     the 4 kHz bandwidth.
%
%   Usage:  wss_dist=comp_wss(cleanFile.wav, enhancedFile.wav)
%
%         cleanFile.wav - clean input file in .wav format
%         enhancedFile  - enhanced output file in .wav format
%         wss_dist      - mean of the lowest 95% of the per-frame
%                         spectral slope distances
%
%     The files are read with audioread/audioinfo (wavread is no longer
%     available in current MATLAB and Octave releases).  Both files must
%     have the same sampling rate and bit depth.
%
%  Example call:  ws =comp_wss('sp04.wav','enhanced.wav')
%
%  References:
%
%     [1] D. H. Klatt, "Prediction of Perceived Phonetic Distance
%         from Critical-Band Spectra: A First Step", Proc. IEEE
%         ICASSP'82, Volume 2, pp. 1278-1281, May, 1982.
%
%     [2] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements,
%         Objective Measures of Speech Quality.  Prentice Hall
%         Advanced Reference Series, Englewood Cliffs, NJ, 1988,
%         ISBN: 0-13-629056-6.
%
%  Authors: Bryan L. Pellom and John H. L. Hansen (July 1998)
%  Modified by: Philipos C. Loizou  (Oct 2006)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
%
% ----------------------------------------------------------------------
if nargin~=2
    fprintf('USAGE: WSS=comp_wss(cleanFile.wav, enhancedFile.wav)\n');
    fprintf('For more help, type: help comp_wss\n\n');
    return;
end

alpha = 0.95;   % fraction of (lowest) frame scores kept for the mean

% audioread returns samples normalised to [-1, 1] like wavread did;
% the bit depth now comes from audioinfo.
[data1, Srate1] = audioread(cleanFile);
[data2, Srate2] = audioread(enhancedFile);
info1 = audioinfo(cleanFile);
info2 = audioinfo(enhancedFile);
Nbits1 = info1.BitsPerSample;
Nbits2 = info2.BitsPerSample;

if (Srate1 ~= Srate2) || (Nbits1 ~= Nbits2)
    error('comp_wss:fileMismatch', 'The two files do not match!');
end

% Truncate to the common length and offset by eps so that no frame is
% exactly zero.
len = min(length(data1), length(data2));
data1 = data1(1:len) + eps;
data2 = data2(1:len) + eps;

wss_dist_vec = wss(data1, data2, Srate1);
wss_dist_vec = sort(wss_dist_vec);

% Discard the largest 5% of the frame scores before averaging.
n_keep = round(length(wss_dist_vec)*alpha);
wss_dist = mean(wss_dist_vec(1:n_keep));
|
||||
|
||||
|
||||
|
||||
function distortion = wss(clean_speech, processed_speech,sample_rate)
% WSS  Per-frame Weighted Spectral Slope distance between two signals.
%
%   distortion = wss(clean_speech, processed_speech, sample_rate)
%
%   clean_speech, processed_speech - vectors of equal length
%   sample_rate                    - sampling rate in Hz
%
%   distortion - 1 x num_frames row of WSS values (weighted squared
%                slope differences, normalised by the sum of weights).
%                Empty when the signals are shorter than one 30 ms
%                analysis window.
%
%   Raises an error when the two signals differ in length.

% Row-vector input would otherwise be broadcast against the column
% window into a matrix, so bring vectors into column orientation.
if size(clean_speech,1) == 1
    clean_speech = clean_speech.';
end
if size(processed_speech,1) == 1
    processed_speech = processed_speech.';
end

% ----------------------------------------------------------------------
% Check the length of the clean and processed speech.  Must be the same.
% ----------------------------------------------------------------------

clean_length = length(clean_speech);
processed_length = length(processed_speech);

if (clean_length ~= processed_length)
    error('wss:lengthMismatch', ...
        'Clean and processed signals must have the same length (%d vs %d).', ...
        clean_length, processed_length);
end

% ----------------------------------------------------------------------
% Analysis parameters
% ----------------------------------------------------------------------

winlength = round(30*sample_rate/1000);   % window length in samples (30 ms)
skiprate = floor(winlength/4);            % window skip in samples
max_freq = sample_rate/2;                 % maximum bandwidth
num_crit = 25;                            % number of critical bands

USE_FFT_SPECTRUM = 1;     % 1: FFT power spectrum (used here);
                          % 0: 10th order LP spectrum (needs lpc)
n_fft = 2^nextpow2(2*winlength);
n_fftby2 = n_fft/2;       % FFT size/2
Kmax = 20;                % value suggested by Klatt, pg 1280
Klocmax = 1;              % value suggested by Klatt, pg 1280

% ----------------------------------------------------------------------
% Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz)
% ----------------------------------------------------------------------

cent_freq = [50.0000 120.000 190.000 260.000 330.000 400.000 470.000 ...
             540.000 617.372 703.378 798.717 904.128 1020.38 1148.30 ...
             1288.72 1442.54 1610.70 1794.16 1993.93 2211.08 2446.71 ...
             2701.97 2978.04 3276.17 3597.63];
bandwidth = [70.0000 70.0000 70.0000 70.0000 70.0000 70.0000 70.0000 ...
             77.3724 86.0056 95.3398 105.411 116.256 127.914 140.423 ...
             153.823 168.154 183.457 199.776 217.153 235.631 255.255 ...
             276.072 298.126 321.465 346.136];

bw_min = bandwidth(1);    % minimum critical bandwidth

% ----------------------------------------------------------------------
% Set up the critical band filters.  Gaussian-shaped filters are used,
% scaled by bw_min/bandwidth(i) through norm_factor.  Filter values
% below the -30 dB point are set to zero.
% ----------------------------------------------------------------------

min_factor = exp(-30.0 / (2.0 * 2.303));  % -30 dB point of filter

crit_filter = zeros(num_crit, n_fftby2);
bins = 0:1:n_fftby2-1;
for i = 1:num_crit
    f0 = (cent_freq(i) / max_freq) * (n_fftby2);   % centre, in FFT bins
    bw = (bandwidth(i) / max_freq) * (n_fftby2);   % width, in FFT bins
    norm_factor = log(bw_min) - log(bandwidth(i));
    shape = exp(-11 *(((bins - floor(f0)) ./bw).^2) + norm_factor);
    crit_filter(i,:) = shape.*(shape > min_factor);
end

% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Weighted Spectral
% Slope Measure
% ----------------------------------------------------------------------

% Same expression as before, floored explicitly so the frame count is an
% integer that can be used for preallocation.
num_frames = floor(clean_length/skiprate-(winlength/skiprate));
start = 1;                                % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));

distortion = zeros(1, max(num_frames, 0));

for frame_count = 1:num_frames

    % ----------------------------------------------------------
    % (1) Get the frames for the test and reference speech and
    %     multiply by the Hanning window.
    % ----------------------------------------------------------

    clean_frame = clean_speech(start:start+winlength-1).*window;
    processed_frame = processed_speech(start:start+winlength-1).*window;

    % ----------------------------------------------------------
    % (2) Compute the power spectrum of clean and processed
    % ----------------------------------------------------------

    if (USE_FFT_SPECTRUM)
        clean_spec = (abs(fft(clean_frame,n_fft)).^2);
        processed_spec = (abs(fft(processed_frame,n_fft)).^2);
    else
        % LP spectrum 1/|A(f)|^2.  The reciprocal has to be
        % element-wise (./); a plain "/" would be a matrix division.
        a_vec = zeros(1,n_fft);
        a_vec(1:11) = lpc(clean_frame,10);
        clean_spec = 1.0./(abs(fft(a_vec,n_fft)).^2)';

        a_vec = zeros(1,n_fft);
        a_vec(1:11) = lpc(processed_frame,10);
        processed_spec = 1.0./(abs(fft(a_vec,n_fft)).^2)';
    end

    % ----------------------------------------------------------
    % (3) Compute filterbank output energies (in dB scale)
    % ----------------------------------------------------------

    clean_energy = zeros(1, num_crit);
    processed_energy = zeros(1, num_crit);
    for i = 1:num_crit
        clean_energy(i) = sum(clean_spec(1:n_fftby2) ...
            .*crit_filter(i,:)');
        processed_energy(i) = sum(processed_spec(1:n_fftby2) ...
            .*crit_filter(i,:)');
    end
    clean_energy = 10*log10(max(clean_energy,1E-10));
    processed_energy = 10*log10(max(processed_energy,1E-10));

    % ----------------------------------------------------------
    % (4) Compute spectral slope (dB[i+1]-dB[i])
    % ----------------------------------------------------------

    clean_slope = clean_energy(2:num_crit) - ...
        clean_energy(1:num_crit-1);
    processed_slope = processed_energy(2:num_crit) - ...
        processed_energy(1:num_crit-1);

    % ----------------------------------------------------------
    % (5) Find the nearest peak locations in the spectra to
    %     each critical band.
    % ----------------------------------------------------------

    clean_loc_peak = wss_nearest_peak(clean_slope, clean_energy, num_crit);
    processed_loc_peak = wss_nearest_peak(processed_slope, processed_energy, num_crit);

    % ----------------------------------------------------------
    % (6) Compute the WSS measure for this frame.  This
    %     includes determination of the weighting function.
    % ----------------------------------------------------------

    dBMax_clean = max(clean_energy);
    dBMax_processed = max(processed_energy);

    % The weights are calculated by averaging individual
    % weighting factors from the clean and processed frame.
    % They place more emphasis on spectral peaks and less on
    % slope differences in spectral valleys (page 1280 of
    % Klatt's 1982 ICASSP paper).

    Wmax_clean = Kmax ./ (Kmax + dBMax_clean - ...
        clean_energy(1:num_crit-1));
    Wlocmax_clean = Klocmax ./ ( Klocmax + clean_loc_peak - ...
        clean_energy(1:num_crit-1));
    W_clean = Wmax_clean .* Wlocmax_clean;

    Wmax_processed = Kmax ./ (Kmax + dBMax_processed - ...
        processed_energy(1:num_crit-1));
    Wlocmax_processed = Klocmax ./ ( Klocmax + processed_loc_peak - ...
        processed_energy(1:num_crit-1));
    W_processed = Wmax_processed .* Wlocmax_processed;

    W = (W_clean + W_processed)./2.0;

    frame_dist = sum(W.*(clean_slope(1:num_crit-1) - ...
        processed_slope(1:num_crit-1)).^2);

    % This normalization is not part of Klatt's paper, but helps
    % to normalize the measure: scale by the sum of the weights.

    distortion(frame_count) = frame_dist/sum(W);

    start = start + skiprate;

end


function loc_peak = wss_nearest_peak(slope, energy, num_crit)
% WSS_NEAREST_PEAK  Band energy at the local peak nearest to each band.
%   slope  - 1 x (num_crit-1) slopes, slope(i) = energy(i+1)-energy(i)
%   energy - 1 x num_crit band energies in dB
%   For a positive slope the search walks right while the slope stays
%   positive; otherwise it walks left while the slope stays <= 0.
%
%   NOTE(review): the rightward search stores energy(n-1), which is one
%   band before the point where the slope stops being positive.  Kept
%   as-is so the WSS values are unchanged - confirm whether intended.

loc_peak = zeros(1, num_crit-1);
for i = 1:num_crit-1
    n = i;
    if (slope(i) > 0)                 % search to the right
        while ((n < num_crit) && (slope(n) > 0))
            n = n+1;
        end
        loc_peak(i) = energy(n-1);
    else                              % search to the left
        while ((n > 0) && (slope(n) <= 0))
            n = n-1;
        end
        loc_peak(i) = energy(n+1);
    end
end
|
||||
|
@ -0,0 +1,496 @@
|
||||
function [Csig,Cbak,Covl] = composite(cleanFile, enhancedFile)
% ----------------------------------------------------------------------
%          Composite Objective Speech Quality Measure
%
%   This function implements the composite objective measure proposed in
%   [1].
%
%   Usage:  [sig,bak,ovl]=composite(cleanFile.wav, enhancedFile.wav)
%
%         cleanFile.wav - clean input file in .wav format
%         enhancedFile  - enhanced output file in .wav format
%         sig           - predicted rating [1-5] of speech distortion
%         bak           - predicted rating [1-5] of noise distortion
%         ovl           - predicted rating [1-5] of overall quality
%
%   In addition to returning the above ratings (sig, bak, & ovl) it
%   prints the individual values of the LLR, SNRseg, WSS and PESQ
%   measures.
%
%   The files are read with audioread/audioinfo (wavread is no longer
%   available in current MATLAB and Octave releases).  Both files must
%   have the same sampling rate and bit depth.
%
%  Example call:  [sig,bak,ovl] =composite('sp04.wav','enhanced.wav')
%
%  References:
%
%     [1] Hu, Y. and Loizou, P. (2006). Evaluation of objective measures
%         for speech enhancement. Proc. Interspeech, Pittsburg, PA.
%
%   Authors: Yi Hu and Philipos C. Loizou
%   (the LLR, SNRseg and WSS measures were based on Bryan Pellom and John
%     Hansen's implementations)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
% ----------------------------------------------------------------------

if nargin~=2
    fprintf('USAGE: [sig,bak,ovl]=composite(cleanFile.wav, enhancedFile.wav)\n');
    fprintf('For more help, type: help composite\n\n');
    return;
end

alpha = 0.95;   % fraction of (lowest) frame scores kept for WSS and LLR

% audioread returns samples normalised to [-1, 1] like wavread did;
% the bit depth now comes from audioinfo.
[data1, Srate1] = audioread(cleanFile);
[data2, Srate2] = audioread(enhancedFile);
info1 = audioinfo(cleanFile);
info2 = audioinfo(enhancedFile);
Nbits1 = info1.BitsPerSample;
Nbits2 = info2.BitsPerSample;

if (Srate1 ~= Srate2) || (Nbits1 ~= Nbits2)
    error('composite:fileMismatch', 'The two files do not match!');
end

% Truncate to the common length and offset by eps so that no frame is
% exactly zero.
len = min(length(data1), length(data2));
data1 = data1(1:len) + eps;
data2 = data2(1:len) + eps;

% -- compute the WSS measure ---
%
wss_dist_vec = wss(data1, data2, Srate1);
wss_dist_vec = sort(wss_dist_vec);
wss_dist = mean(wss_dist_vec(1:round(length(wss_dist_vec)*alpha)));

% --- compute the LLR measure ---------
%
LLR_dist = llr(data1, data2, Srate1);
LLRs = sort(LLR_dist);
LLR_len = round(length(LLR_dist)*alpha);
llr_mean = mean(LLRs(1:LLR_len));

% --- compute the SNRseg ----------------
%
[snr_dist, segsnr_dist] = snr(data1, data2, Srate1);
snr_mean = snr_dist;
segSNR = mean(segsnr_dist);

% -- compute the pesq ----
[pesq_mos] = pesq(cleanFile, enhancedFile);

% --- now compute the composite measures ------------------
%
Csig = 3.093 - 1.029*llr_mean + 0.603*pesq_mos-0.009*wss_dist;
Csig = max(1,Csig);  Csig = min(5, Csig);   % limit values to [1, 5]
Cbak = 1.634 + 0.478 *pesq_mos - 0.007*wss_dist + 0.063*segSNR;
Cbak = max(1, Cbak); Cbak = min(5,Cbak);    % limit values to [1, 5]
Covl = 1.594 + 0.805*pesq_mos - 0.512*llr_mean - 0.007*wss_dist;
Covl = max(1, Covl); Covl = min(5, Covl);   % limit values to [1, 5]

fprintf('\n LLR=%f   SNRseg=%f   WSS=%f   PESQ=%f\n',llr_mean,segSNR,wss_dist,pesq_mos);

return; %=================================================================
|
||||
|
||||
|
||||
function distortion = wss(clean_speech, processed_speech,sample_rate)
% WSS  Weighted spectral slope distance, one value per analysis frame.
%
%   distortion = wss(clean_speech, processed_speech, sample_rate)
%
%   clean_speech      - reference signal (vector)
%   processed_speech  - signal under test; must have the same length
%   sample_rate       - sampling frequency in Hz
%
%   distortion        - 1-by-num_frames row vector of per-frame WSS values.
%                       It is empty when the input is too short to yield
%                       a frame.
%
%   The signals are cut into 30 ms frames with a hop of a quarter frame,
%   windowed, transformed to a power spectrum, and reduced to the dB
%   energies of 25 Gaussian-shaped critical-band filters.  The squared
%   differences between the band-to-band slopes of both signals are then
%   combined with the weights described on page 1280 of Klatt's 1982
%   ICASSP paper and normalized by the sum of the weights.
%
%   An error is raised when the two signals differ in length.

% Row vectors are turned into columns so that the products with the
% (column) analysis window below stay element-wise instead of expanding
% into a matrix.
if size(clean_speech, 1) == 1
    clean_speech = clean_speech(:);
end
if size(processed_speech, 1) == 1
    processed_speech = processed_speech(:);
end

% ----------------------------------------------------------------------
% Check the length of the clean and processed speech.  Must be the same.
% ----------------------------------------------------------------------

clean_length     = length(clean_speech);
processed_length = length(processed_speech);

if (clean_length ~= processed_length)
    % Returning without assigning the output would only produce a
    % confusing "output argument not assigned" failure in the caller.
    error('Error: Files must have same length.');
end

% ----------------------------------------------------------------------
% Analysis parameters
% ----------------------------------------------------------------------

winlength = round(30*sample_rate/1000);   % window length in samples
skiprate  = floor(winlength/4);           % window skip in samples
max_freq  = sample_rate/2;                % maximum bandwidth
num_crit  = 25;                           % number of critical bands

USE_FFT_SPECTRUM = 1;                     % 0 selects a 10th order LP spectrum
n_fft    = 2^nextpow2(2*winlength);
n_fftby2 = n_fft/2;                       % FFT size/2
Kmax     = 20;                            % value suggested by Klatt, pg 1280
Klocmax  = 1;                             % value suggested by Klatt, pg 1280

% ----------------------------------------------------------------------
% Critical band filter definitions (center frequencies and bandwidths, Hz)
% ----------------------------------------------------------------------

cent_freq = [  50.0000  120.000  190.000  260.000  330.000 ...
              400.000   470.000  540.000  617.372  703.378 ...
              798.717   904.128  1020.38  1148.30  1288.72 ...
              1442.54   1610.70  1794.16  1993.93  2211.08 ...
              2446.71   2701.97  2978.04  3276.17  3597.63 ];

bandwidth = [ 70.0000  70.0000  70.0000  70.0000  70.0000 ...
              70.0000  70.0000  77.3724  86.0056  95.3398 ...
              105.411  116.256  127.914  140.423  153.823 ...
              168.154  183.457  199.776  217.153  235.631 ...
              255.255  276.072  298.126  321.465  346.136 ];

bw_min = bandwidth(1);                    % minimum critical bandwidth

% ----------------------------------------------------------------------
% Set up the critical band filters.  Gaussian-shaped filters are used and
% the norm_factor term scales each one by bw_min/bandwidth(i).  Filter
% values at or below the -30 dB point are set to zero.
% ----------------------------------------------------------------------

min_factor  = exp(-30.0 / (2.0 * 2.303));   % -30 dB point of filter
bin_index   = 0:1:n_fftby2-1;
crit_filter = zeros(num_crit, n_fftby2);

for i = 1:num_crit
    f0 = (cent_freq(i) / max_freq) * (n_fftby2);
    bw = (bandwidth(i) / max_freq) * (n_fftby2);
    norm_factor = log(bw_min) - log(bandwidth(i));
    response = exp(-11 * (((bin_index - floor(f0)) ./ bw).^2) + norm_factor);
    crit_filter(i,:) = response .* (response > min_factor);
end

% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Weighted Spectral
% Slope Measure
% ----------------------------------------------------------------------

% floor() makes explicit what the colon operator did implicitly with the
% fractional frame count.
num_frames = floor(clean_length/skiprate - (winlength/skiprate));
start      = 1;                                   % starting sample
window     = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));

% A non-positive frame count yields an empty result instead of leaving
% the output unassigned.
distortion       = zeros(1, num_frames);
clean_energy     = zeros(1, num_crit);
processed_energy = zeros(1, num_crit);

for frame_count = 1:num_frames

    % ----------------------------------------------------------
    % (1) Get the frames for the test and reference speech and
    %     multiply them by the Hanning window.
    % ----------------------------------------------------------

    clean_frame     = clean_speech(start:start+winlength-1) .* window;
    processed_frame = processed_speech(start:start+winlength-1) .* window;

    % ----------------------------------------------------------
    % (2) Compute the power spectrum of clean and processed
    % ----------------------------------------------------------

    if (USE_FFT_SPECTRUM)
        clean_spec     = (abs(fft(clean_frame,n_fft)).^2);
        processed_spec = (abs(fft(processed_frame,n_fft)).^2);
    else
        % The reciprocal has to be element-wise ("./"); a plain "/"
        % would perform a matrix right-division.
        a_vec = zeros(1,n_fft);
        a_vec(1:11) = lpc(clean_frame,10);
        clean_spec = 1.0 ./ (abs(fft(a_vec,n_fft)).^2)';

        a_vec = zeros(1,n_fft);
        a_vec(1:11) = lpc(processed_frame,10);
        processed_spec = 1.0 ./ (abs(fft(a_vec,n_fft)).^2)';
    end

    % ----------------------------------------------------------
    % (3) Compute filterbank output energies (in dB scale)
    % ----------------------------------------------------------

    for i = 1:num_crit
        clean_energy(i)     = sum(clean_spec(1:n_fftby2) ...
                                  .* crit_filter(i,:)');
        processed_energy(i) = sum(processed_spec(1:n_fftby2) ...
                                  .* crit_filter(i,:)');
    end
    clean_energy     = 10*log10(max(clean_energy,1E-10));
    processed_energy = 10*log10(max(processed_energy,1E-10));

    % ----------------------------------------------------------
    % (4) Compute spectral slope (dB[i+1]-dB[i])
    % ----------------------------------------------------------

    clean_slope     = clean_energy(2:num_crit) - ...
                      clean_energy(1:num_crit-1);
    processed_slope = processed_energy(2:num_crit) - ...
                      processed_energy(1:num_crit-1);

    % ----------------------------------------------------------
    % (5) Find the nearest peak locations in the spectra to
    %     each critical band.
    % ----------------------------------------------------------

    clean_loc_peak     = wss_nearest_peak(clean_energy, clean_slope, num_crit);
    processed_loc_peak = wss_nearest_peak(processed_energy, processed_slope, num_crit);

    % ----------------------------------------------------------
    % (6) Compute the WSS measure for this frame.  This
    %     includes determination of the weighting function.
    % ----------------------------------------------------------

    dBMax_clean     = max(clean_energy);
    dBMax_processed = max(processed_energy);

    % The weights are calculated by averaging individual weighting
    % factors from the clean and processed frame.  They place more
    % emphasis on spectral peaks and less emphasis on slope differences
    % in spectral valleys (page 1280 of Klatt's 1982 ICASSP paper).

    Wmax_clean    = Kmax ./ (Kmax + dBMax_clean - ...
                             clean_energy(1:num_crit-1));
    Wlocmax_clean = Klocmax ./ (Klocmax + clean_loc_peak - ...
                                clean_energy(1:num_crit-1));
    W_clean       = Wmax_clean .* Wlocmax_clean;

    Wmax_processed    = Kmax ./ (Kmax + dBMax_processed - ...
                                 processed_energy(1:num_crit-1));
    Wlocmax_processed = Klocmax ./ (Klocmax + processed_loc_peak - ...
                                    processed_energy(1:num_crit-1));
    W_processed       = Wmax_processed .* Wlocmax_processed;

    W = (W_clean + W_processed)./2.0;

    distortion(frame_count) = sum(W.*(clean_slope(1:num_crit-1) - ...
                                      processed_slope(1:num_crit-1)).^2);

    % This normalization is not part of Klatt's paper, but helps to
    % normalize the measure: scale by the sum of the weights.
    distortion(frame_count) = distortion(frame_count)/sum(W);

    start = start + skiprate;

end


function loc_peak = wss_nearest_peak(energy, slope, num_crit)
% WSS_NEAREST_PEAK  Band energy at the end of the slope run through each band.
%
%   For every band i in 1..num_crit-1 the slope sequence is followed to
%   the right while it stays positive, or to the left while it stays
%   non-positive.  The energy stored is energy(n-1) after a right search
%   and energy(n+1) after a left search, where n is the index at which
%   the walk stopped.

loc_peak = zeros(1, num_crit-1);

for i = 1:num_crit-1
    n = i;
    if (slope(i) > 0)                 % search to the right
        % "&&" guarantees slope(n) is never read once n leaves the
        % valid index range.
        while ((n < num_crit) && (slope(n) > 0))
            n = n+1;
        end
        loc_peak(i) = energy(n-1);
    else                              % search to the left
        while ((n > 0) && (slope(n) <= 0))
            n = n-1;
        end
        loc_peak(i) = energy(n+1);
    end
end
|
||||
|
||||
%-----------------------------------------------
|
||||
function distortion = llr(clean_speech, processed_speech,sample_rate)
% LLR  Log-likelihood ratio distance, one value per analysis frame.
%
%   distortion = llr(clean_speech, processed_speech, sample_rate)
%
%   clean_speech      - reference signal (vector)
%   processed_speech  - signal under test; must have the same length
%   sample_rate       - sampling frequency in Hz
%
%   distortion        - 1-by-num_frames row vector of per-frame LLR values.
%                       It is empty when the input is too short to yield
%                       a frame.
%
%   The signals are cut into 30 ms frames with a hop of a quarter frame
%   and windowed.  For every frame the LPC vectors of both signals are
%   obtained from lpcoeff, and the log of the ratio of the two quadratic
%   forms built on the clean autocorrelation matrix is returned.
%
%   An error is raised when the two signals differ in length.

% Row vectors are turned into columns so that the products with the
% (column) analysis window below stay element-wise instead of expanding
% into a matrix.
if size(clean_speech, 1) == 1
    clean_speech = clean_speech(:);
end
if size(processed_speech, 1) == 1
    processed_speech = processed_speech(:);
end

% ----------------------------------------------------------------------
% Check the length of the clean and processed speech.  Must be the same.
% ----------------------------------------------------------------------

clean_length     = length(clean_speech);
processed_length = length(processed_speech);

if (clean_length ~= processed_length)
    % Returning without assigning the output would only produce a
    % confusing "output argument not assigned" failure in the caller.
    error('Error: Both Speech Files must be same length.');
end

% ----------------------------------------------------------------------
% Analysis parameters
% ----------------------------------------------------------------------

winlength = round(30*sample_rate/1000);   % window length in samples
skiprate  = floor(winlength/4);           % window skip in samples
if sample_rate < 10000
    P = 10;                               % LPC analysis order
else
    P = 16;                               % this could vary depending on sampling frequency
end

% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Log Likelihood Ratio
% ----------------------------------------------------------------------

% floor() makes explicit what the colon operator did implicitly with the
% fractional frame count.
num_frames = floor(clean_length/skiprate - (winlength/skiprate));
start      = 1;                                   % starting sample
window     = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));

% A non-positive frame count yields an empty result instead of leaving
% the output unassigned.
distortion = zeros(1, num_frames);

for frame_count = 1:num_frames

    % ----------------------------------------------------------
    % (1) Get the frames for the test and reference speech and
    %     multiply them by the Hanning window.
    % ----------------------------------------------------------

    clean_frame     = clean_speech(start:start+winlength-1) .* window;
    processed_frame = processed_speech(start:start+winlength-1) .* window;

    % ----------------------------------------------------------
    % (2) Get the autocorrelation lags and LPC parameters used
    %     to compute the LLR measure.  Only the clean
    %     autocorrelation and the two LPC vectors are needed.
    % ----------------------------------------------------------

    [R_clean, Ref_clean, A_clean] = lpcoeff(clean_frame, P);
    [R_processed, Ref_processed, A_processed] = lpcoeff(processed_frame, P);

    % ----------------------------------------------------------
    % (3) Compute the LLR measure
    % ----------------------------------------------------------

    R_matrix    = toeplitz(R_clean);      % shared by both quadratic forms
    numerator   = A_processed*R_matrix*A_processed';
    denominator = A_clean*R_matrix*A_clean';
    distortion(frame_count) = log(numerator/denominator);

    start = start + skiprate;

end
|
||||
|
||||
%---------------------------------------------
|
||||
function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order)
% LPCOEFF  Autocorrelation lags, reflection coefficients and LPC vector.
%
%   [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order)
%
%   speech_frame - one (windowed) frame of samples
%   model_order  - prediction order
%
%   acorr        - autocorrelation lags 0..model_order
%   refcoeff     - reflection coefficients from the recursion
%   lpparams     - [1 -a], where a holds the predictor coefficients

% ----------------------------------------------------------
% (1) Autocorrelation lags: lag (k-1) is the sum of products of the
%     frame with itself shifted by k-1 samples.
% ----------------------------------------------------------

n_samples = max(size(speech_frame));
for lag = 1:model_order+1
    leading  = speech_frame(1:n_samples-lag+1);
    trailing = speech_frame(lag:n_samples);
    R(lag) = sum(leading .* trailing);
end

% ----------------------------------------------------------
% (2) Levinson-Durbin recursion
% ----------------------------------------------------------

a = ones(1,model_order);
E(1) = R(1);                              % prediction error of order 0
for i = 1:model_order
    previous = a(1:i-1);                  % coefficients of order i-1
    k_i = (R(i+1) - sum(previous .* R(i:-1:2))) / E(i);
    rcoeff(i) = k_i;
    a(i) = k_i;
    a(1:i-1) = previous - k_i .* previous(i-1:-1:1);
    E(i+1) = (1 - k_i*k_i) * E(i);        % error of order i
end

acorr    = R;
refcoeff = rcoeff;
lpparams = [1 -a];
|
||||
|
||||
|
||||
% ----------------------------------------------------------------------
|
||||
|
||||
function [overall_snr, segmental_snr] = snr(clean_speech, processed_speech,sample_rate)
% SNR  Overall and frame-wise (segmental) signal-to-noise ratio in dB.
%
%   [overall_snr, segmental_snr] = snr(clean_speech, processed_speech, sample_rate)
%
%   clean_speech      - reference signal (vector)
%   processed_speech  - signal under test; must have the same length
%   sample_rate       - sampling frequency in Hz
%
%   overall_snr       - 10*log10 of the clean energy over the energy of
%                       the difference signal, computed on the whole input
%   segmental_snr     - 1-by-num_frames row vector of per-frame SNR values
%                       limited to the range [-10, 35] dB.  It is empty
%                       when the input is too short to yield a frame.
%
%   Frames are 30 ms long with a hop of a quarter frame and are windowed
%   before the energies are computed.
%
%   An error is raised when the two signals differ in length.

% Row vectors are turned into columns so that the products with the
% (column) analysis window below stay element-wise instead of expanding
% into a matrix.
if size(clean_speech, 1) == 1
    clean_speech = clean_speech(:);
end
if size(processed_speech, 1) == 1
    processed_speech = processed_speech(:);
end

% ----------------------------------------------------------------------
% Check the length of the clean and processed speech.  Must be the same.
% ----------------------------------------------------------------------

clean_length     = length(clean_speech);
processed_length = length(processed_speech);

if (clean_length ~= processed_length)
    % Returning without assigning the outputs would only produce a
    % confusing "output argument not assigned" failure in the caller.
    error('Error: Both Speech Files must be same length.');
end

% ----------------------------------------------------------------------
% DC removal and dynamic-range matching are intentionally disabled; the
% signals are compared exactly as they were passed in.
% ----------------------------------------------------------------------

%clean_speech     = clean_speech - mean(clean_speech);
%processed_speech = processed_speech - mean(processed_speech);
%processed_speech = processed_speech.*(max(abs(clean_speech))/ max(abs(processed_speech)));

overall_snr = 10* log10( sum(clean_speech.^2)/sum((clean_speech-processed_speech).^2));

% ----------------------------------------------------------------------
% Analysis parameters
% ----------------------------------------------------------------------

winlength = round(30*sample_rate/1000);   % window length in samples
skiprate  = floor(winlength/4);           % window skip in samples
MIN_SNR   = -10;                          % minimum SNR in dB
MAX_SNR   = 35;                           % maximum SNR in dB

% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Segmental SNR
% ----------------------------------------------------------------------

% floor() makes explicit what the colon operator did implicitly with the
% fractional frame count.
num_frames = floor(clean_length/skiprate - (winlength/skiprate));
start      = 1;                                   % starting sample
window     = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));

% A non-positive frame count yields an empty result instead of leaving
% the output unassigned.
segmental_snr = zeros(1, num_frames);

for frame_count = 1:num_frames

    % ----------------------------------------------------------
    % (1) Get the frames for the test and reference speech and
    %     multiply them by the Hanning window.
    % ----------------------------------------------------------

    clean_frame     = clean_speech(start:start+winlength-1) .* window;
    processed_frame = processed_speech(start:start+winlength-1) .* window;

    % ----------------------------------------------------------
    % (2) Compute the Segmental SNR; eps keeps both the division
    %     and the logarithm finite.
    % ----------------------------------------------------------

    signal_energy = sum(clean_frame.^2);
    noise_energy  = sum((clean_frame-processed_frame).^2);
    frame_snr     = 10*log10(signal_energy/(noise_energy+eps)+eps);
    frame_snr     = max(frame_snr, MIN_SNR);
    segmental_snr(frame_count) = min(frame_snr, MAX_SNR);

    start = start + skiprate;

end
|
||||
|
||||
|
||||
|
@ -0,0 +1,84 @@
|
||||
function crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, ...
    deg_Nsamples, Utt_id)
% CRUDE_ALIGN  Coarse delay estimate from the cross-correlation of two
% log-VAD envelopes.  Results are written to global variables.
%
%   ref_logVAD, deg_logVAD     - envelopes handed on to FFTNXCorr
%   ref_Nsamples, deg_Nsamples - signal lengths in samples
%   Utt_id                     - WHOLE_SIGNAL, MAXNUTTERANCES, or the
%                                index of an utterance search window
%
%   Depending on Utt_id the function updates
%       Crude_DelayEst and Crude_DelayConf    (Utt_id == WHOLE_SIGNAL)
%       Utt_Delay(MAXNUTTERANCES)             (Utt_id == MAXNUTTERANCES)
%       Utt_DelayEst(Utt_id)                  (any other value)

global Downsample
global Nutterances Largest_uttsize Nsurf_samples Crude_DelayEst
global Crude_DelayConf UttSearch_Start UttSearch_End Utt_DelayEst
global Utt_Delay Utt_DelayConf Utt_Start Utt_End
global MAXNUTTERANCES WHOLE_SIGNAL
global pesq_mos subj_mos cond_nr

% ---- select the reference / degraded segments to correlate ----------

if (Utt_id == WHOLE_SIGNAL)

    startr = 1;
    startd = 1;
    nr = floor( ref_Nsamples / Downsample);
    nd = floor( deg_Nsamples / Downsample);

elseif Utt_id == MAXNUTTERANCES

    est_shift = Utt_DelayEst(MAXNUTTERANCES) / Downsample;
    startr = UttSearch_Start(MAXNUTTERANCES);
    startd = startr + est_shift;
    if ( startd < 0 )
        % the degraded start would fall before the signal: shift the
        % reference start instead
        startr = 1 - est_shift;
        startd = 1;
    end

    nr = UttSearch_End(MAXNUTTERANCES) - startr;
    nd = nr;

    deg_frames = floor( deg_Nsamples / Downsample);
    if ( startd + nd > deg_frames )
        nd = deg_frames - startd;
    end

else

    est_shift = Crude_DelayEst / Downsample;
    startr = UttSearch_Start(Utt_id);
    startd = startr + est_shift;
    if ( startd < 0 )
        startr = 1 - est_shift;
        startd = 1;
    end

    nr = UttSearch_End(Utt_id) - startr;
    nd = nr;

    deg_frames = floor( deg_Nsamples / Downsample);
    if ( startd + nd > deg_frames + 1 )
        nd = deg_frames - startd + 1;
    end

end

% ---- locate the correlation maximum ----------------------------------

% Defaults used when the segments are too short or no positive
% correlation exists: an index of nr gives a zero offset below.
max_Y   = 0.0;
I_max_Y = nr;
if ( (nr > 1) && (nd > 1) )
    Y = FFTNXCorr( ref_logVAD, startr, nr, deg_logVAD, startd, nd);
    [max_Y, I_max_Y] = max( Y);
    if (max_Y <= 0)
        max_Y   = 0;
        I_max_Y = nr;
    end
end

% ---- store the delay in the matching global --------------------------

offset = (I_max_Y - nr) * Downsample;

if ( Utt_id == WHOLE_SIGNAL )
    Crude_DelayEst  = offset;
    Crude_DelayConf = 0.0;
elseif ( Utt_id == MAXNUTTERANCES )
    Utt_Delay(MAXNUTTERANCES) = offset + Utt_DelayEst(MAXNUTTERANCES);
else
    Utt_DelayEst(Utt_id) = offset + Crude_DelayEst;
end
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -0,0 +1,21 @@
|
||||
function mod_data= fix_power_level( data, data_Nsamples, maxNsamples)
% FIX_POWER_LEVEL  Level normalization: scale data so that the power
% measured on a band-limited copy equals a preset target.
%
%   data          - input signal
%   data_Nsamples - number of samples attributed to data
%   maxNsamples   - larger of the reference / degraded sample counts
%   mod_data      - data multiplied by one global gain factor
%
%   Side effect: the global TARGET_AVG_POWER is set to 1e7.

global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs
global TARGET_AVG_POWER

TARGET_AVG_POWER = 1e7;

% Filter table: column 1 is frequency in Hz, column 2 the gain in dB.
% 350 Hz .. 3250 Hz pass unchanged; every other entry is set to -500 dB.
filter_hz = [   0   50  100  125  160  200  250  300 ...
              350  400  500  600  630  800 1000 1250 1600 2000 2500 3000 3250 ...
             3500 4000 5000 6300 8000];
filter_dB = [-500 * ones(1, 8), zeros(1, 13), -500 * ones(1, 5)];
align_filter_dB = [filter_hz(:), filter_dB(:)];

align_filtered = apply_filter( data, data_Nsamples, align_filter_dB);

% Power is measured by pow_of over the range that starts one sample after
% the leading search buffer.
first_sample = SEARCHBUFFER* Downsample+ 1;
last_sample  = data_Nsamples- SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000);
norm_length  = maxNsamples- 2* SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000);
power_above_300Hz = pow_of( align_filtered, first_sample, last_sample, norm_length);

global_scale = sqrt( TARGET_AVG_POWER/ power_above_300Hz);
mod_data = data* global_scale;
|
@ -0,0 +1,68 @@
|
||||
function id_searchwindows( ref_VAD, ref_Nsamples, deg_VAD, deg_Nsamples)
% ID_SEARCHWINDOWS  Derive utterance search windows from the reference VAD.
%
%   ref_VAD       - reference VAD sequence; values > 0 mark speech
%   ref_Nsamples  - number of reference samples
%   deg_VAD       - not used by this function
%   deg_Nsamples  - number of degraded samples
%
%   Results are written to the globals UttSearch_Start, UttSearch_End
%   (window limits widened by SEARCHBUFFER) and Nutterances (number of
%   accepted windows).

global MINUTTLENGTH Downsample SEARCHBUFFER
global Crude_DelayEst Nutterances UttSearch_Start UttSearch_End

VAD_length = floor( ref_Nsamples/ Downsample);

% Limits, on the VAD index axis, used to accept or reject a segment.
del_deg_start = MINUTTLENGTH- Crude_DelayEst/ Downsample;
del_deg_end   = floor((deg_Nsamples- Crude_DelayEst)/ Downsample)- MINUTTLENGTH;

Utt_num   = 1;
in_speech = 0;

for count = 1: VAD_length
    VAD_value = ref_VAD(count);

    % speech onset: open a window, clamped at 0
    if ( (in_speech == 0) && (VAD_value > 0) )
        in_speech  = 1;
        this_start = count;
        UttSearch_Start(Utt_num) = max( count- SEARCHBUFFER, 0);
    end

    % speech offset (or next-to-last VAD index): close the window
    if ( (in_speech == 1) && ...
            ((VAD_value == 0) || (count == (VAD_length-1))) )
        in_speech = 0;
        UttSearch_End(Utt_num) = min( count + SEARCHBUFFER, VAD_length - 1);

        % Keep the window only if the segment is long enough and lies
        % inside the limits computed above; otherwise its slot is reused.
        long_enough = (count - this_start) >= MINUTTLENGTH;
        if ( long_enough && (this_start < del_deg_end) && ...
                (count > del_deg_start) )
            Utt_num = Utt_num + 1;
        end
    end
end

% Utt_num points at the next free slot, so one less were accepted.
Nutterances = Utt_num- 1;
|
||||
|
||||
% fprintf( 1, 'Nutterances is %d\n', Nutterances);
|
||||
|
||||
% fid= fopen( 'mat_utt.txt', 'wt');
|
||||
% fprintf( fid, '%d\n', UttSearch_Start( 1: Nutterances));
|
||||
% fprintf( fid, '\n');
|
||||
% fprintf( fid, '%d\n', UttSearch_End( 1: Nutterances));
|
||||
% fclose(fid);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -0,0 +1,85 @@
|
||||
function id_utterances( ref_Nsamples, ref_VAD, deg_Nsamples)
% ID_UTTERANCES  Set utterance boundaries from the reference VAD and the
% per-utterance delays.
%
%   ref_Nsamples - number of reference samples
%   ref_VAD      - reference VAD sequence; values > 0 mark speech
%   deg_Nsamples - number of degraded samples
%
%   Reads the globals Nutterances and Utt_Delay, and writes Utt_Start,
%   Utt_End and Largest_uttsize.

global Largest_uttsize MINUTTLENGTH Crude_DelayEst
global Downsample SEARCHBUFFER Nutterances Utt_Start
global Utt_End Utt_Delay

VAD_length = floor( ref_Nsamples / Downsample);

% Limits, on the VAD index axis, used to accept or reject a segment.
del_deg_start = MINUTTLENGTH - Crude_DelayEst / Downsample;
del_deg_end   = floor((deg_Nsamples- Crude_DelayEst)/ Downsample) ...
    - MINUTTLENGTH;

% ---- pass 1: raw start / end of each speech segment ------------------

Utt_num   = 1;
in_speech = 0;

for count = 1: VAD_length
    VAD_value = ref_VAD(count);

    if ( (in_speech == 0) && (VAD_value > 0.0) )
        in_speech  = 1;
        this_start = count;
        Utt_Start(Utt_num) = count;
    end

    if ( (in_speech == 1) && ...
            ((VAD_value == 0) || (count == VAD_length)) )
        in_speech = 0;
        Utt_End(Utt_num) = count;

        long_enough = (count - this_start) >= MINUTTLENGTH;
        if ( long_enough && (this_start < del_deg_end) && ...
                (count > del_deg_start) )
            Utt_num = Utt_num + 1;
        end
    end
end

% ---- pass 2: fix the outer limits, meet neighbours half way ----------

Utt_Start(1)         = SEARCHBUFFER+ 1;
Utt_End(Nutterances) = VAD_length - SEARCHBUFFER+ 1;

for Utt_num = 2: Nutterances
    gap_right = Utt_Start(Utt_num)- 1;
    gap_left  = Utt_End(Utt_num - 1)- 1;
    midpoint  = floor( (gap_right + gap_left) / 2);
    Utt_Start(Utt_num)   = midpoint+ 1;
    Utt_End(Utt_num - 1) = midpoint+ 1;
end

% ---- pass 3: adjust the first start / last end using the delays ------

first_pos = (Utt_Start(1)- 1) * Downsample + Utt_Delay(1);
if ( first_pos < (SEARCHBUFFER * Downsample) )
    Utt_Start(1) = SEARCHBUFFER + floor( ...
        (Downsample - 1 - Utt_Delay(1)) / Downsample) + 1;
end

last_pos = (Utt_End(Nutterances)- 1) * Downsample + 1 + ...
    Utt_Delay(Nutterances);
if ( last_pos > (deg_Nsamples - SEARCHBUFFER * Downsample+ 1) )
    Utt_End(Nutterances) = floor( (deg_Nsamples - Utt_Delay(Nutterances)) / Downsample) ...
        - SEARCHBUFFER + 1;
end

% ---- pass 4: resolve overlaps between neighbours after the delays ----

for Utt_num = 2: Nutterances
    cur_start = (Utt_Start(Utt_num)- 1) * Downsample + Utt_Delay(Utt_num);
    prev_end  = (Utt_End(Utt_num - 1)- 1) * Downsample + Utt_Delay(Utt_num - 1);
    if ( cur_start < prev_end )
        midpoint  = floor( (cur_start + prev_end) / 2);
        cur_start = floor( (Downsample- 1+ midpoint- Utt_Delay(Utt_num)) ...
            / Downsample);
        prev_end  = floor( (midpoint - Utt_Delay(Utt_num - 1)) ...
            / Downsample);
        Utt_Start(Utt_num)  = cur_start+ 1;
        Utt_End(Utt_num- 1) = prev_end+ 1;
    end
end

Largest_uttsize = max( Utt_End- Utt_Start);
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -0,0 +1,9 @@
|
||||
function [mod_ref_data, mod_deg_data]= input_filter( ref_data, ref_Nsamples, ...
    deg_data, deg_Nsamples)
% INPUT_FILTER  Run DC_block followed by apply_filters on both signals.
%
%   ref_data, deg_data         - reference and degraded signals
%   ref_Nsamples, deg_Nsamples - their sample counts
%   mod_ref_data, mod_deg_data - the filtered signals

% stage 1: DC_block on each signal
ref_stage = DC_block( ref_data, ref_Nsamples);
deg_stage = DC_block( deg_data, deg_Nsamples);

% stage 2: apply_filters on each stage-1 result
mod_ref_data = apply_filters( ref_stage, ref_Nsamples);
mod_deg_data = apply_filters( deg_stage, deg_Nsamples);
|
||||
|
@ -0,0 +1,127 @@
|
||||
function [pesq_mos]= pesq(ref_wav, deg_wav)

% ----------------------------------------------------------------------
%            PESQ objective speech quality measure
%
%   This function implements the PESQ measure based on the ITU standard
%   P.862 [1].
%
%
%   Usage:  pval=pesq(cleanFile.wav, enhancedFile.wav)
%
%         cleanFile.wav - clean input file in .wav format
%         enhancedFile  - enhanced output file in .wav format
%         pval          - PESQ value
%
%    Note that the PESQ routine only supports sampling rates of 8 kHz and
%    16 kHz [1].  Both files must be single-channel and must share the
%    same sampling rate; an error is raised otherwise.
%
%  Example call:  pval = pesq ('sp04.wav','enhanced.wav')
%
%
%  References:
%   [1] ITU (2000). Perceptual evaluation of speech quality (PESQ), and
%       objective method for end-to-end speech quality assessment of
%       narrowband telephone networks and speech codecs. ITU-T
%       Recommendation P. 862
%
%   Authors: Yi Hu and Philipos C. Loizou
%
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
% ----------------------------------------------------------------------

if nargin<2
    fprintf('Usage: [pesq_mos]=pesq(cleanfile.wav,enhanced.wav) \n');
    return;
end

global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs WHOLE_SIGNAL
global Align_Nfft Window

% ---- read and validate both files -------------------------------------

[ref_data,sampling_rate]= audioread( ref_wav);
if sampling_rate~=8000 && sampling_rate~=16000
    error('Sampling frequency needs to be either 8000 or 16000 Hz');
end

[deg_data,deg_sampling_rate]= audioread( deg_wav);
if deg_sampling_rate~=sampling_rate
    % All later processing is configured from the reference rate only, so
    % a degraded file at another rate would be processed as if it were
    % sampled at the reference rate.
    error('The reference and degraded files must have the same sampling frequency');
end

if (size( ref_data, 2)> 1) || (size( deg_data, 2)> 1)
    % The zero padding below concatenates single-row vectors and cannot
    % be applied to multi-channel data.
    error('Only single-channel (mono) files are supported');
end

% NOTE(review): setup_global presumably initializes the globals declared
% above (Downsample, Fs, Align_Nfft, ...) for this rate - confirm there.
setup_global( sampling_rate);

% Hanning window of length Align_Nfft, stored in the global Window
TWOPI= 6.28318530717959;
count=0:Align_Nfft- 1;
Window= 0.5 * (1.0 - cos((TWOPI * count) / Align_Nfft));

% ---- scale by 32768 and zero-pad both signals -------------------------

guard_samples= SEARCHBUFFER* Downsample;

ref_data= ref_data';
ref_data= ref_data* 32768;
ref_Nsamples= length( ref_data)+ 2* SEARCHBUFFER* Downsample;
ref_data= [zeros( 1, guard_samples), ref_data, ...
    zeros( 1, DATAPADDING_MSECS* (Fs/ 1000)+ guard_samples)];

deg_data= deg_data';
deg_data= deg_data* 32768;
deg_Nsamples= length( deg_data)+ 2* SEARCHBUFFER* Downsample;
deg_data= [zeros( 1, guard_samples), deg_data, ...
    zeros( 1, DATAPADDING_MSECS* (Fs/ 1000)+ guard_samples)];

maxNsamples= max( ref_Nsamples, deg_Nsamples);

% ---- level alignment and IRS filtering --------------------------------

ref_data= fix_power_level( ref_data, ref_Nsamples, maxNsamples);
deg_data= fix_power_level( deg_data, deg_Nsamples, maxNsamples);

% Filter table: column 1 is frequency in Hz, column 2 the gain in dB.
standard_IRS_filter_dB= [0, -200; 50, -40; 100, -20; 125, -12; 160, -6; 200, 0;...
    250, 4; 300, 6; 350, 8; 400, 10; 500, 11; 600, 12; 700, 12; 800, 12;...
    1000, 12; 1300, 12; 1600, 12; 2000, 12; 2500, 12; 3000, 12; 3250, 12;...
    3500, 4; 4000, -200; 5000, -200; 6300, -200; 8000, -200];

ref_data= apply_filter( ref_data, ref_Nsamples, standard_IRS_filter_dB);
deg_data= apply_filter( deg_data, deg_Nsamples, standard_IRS_filter_dB);

% keep the IRS-filtered signals for later use in the psychoacoustical model
model_ref= ref_data;
model_deg= deg_data;

% ---- time alignment (results are left in global variables) ------------

[ref_data, deg_data]= input_filter( ref_data, ref_Nsamples, deg_data, ...
    deg_Nsamples);

[ref_VAD, ref_logVAD]= apply_VAD( ref_data, ref_Nsamples);
[deg_VAD, deg_logVAD]= apply_VAD( deg_data, deg_Nsamples);

crude_align (ref_logVAD, ref_Nsamples, deg_logVAD, deg_Nsamples,...
    WHOLE_SIGNAL);

utterance_locate (ref_data, ref_Nsamples, ref_VAD, ref_logVAD,...
    deg_data, deg_Nsamples, deg_VAD, deg_logVAD);

% ---- psychoacoustic model on the IRS-filtered signals -----------------

ref_data= model_ref;
deg_data= model_deg;

% make ref_data and deg_data equal length: assigning a zero at the last
% index zero-extends the shorter vector
if (ref_Nsamples< deg_Nsamples)
    newlen= deg_Nsamples+ DATAPADDING_MSECS* (Fs/ 1000);
    ref_data( newlen)= 0;
elseif (ref_Nsamples> deg_Nsamples)
    newlen= ref_Nsamples+ DATAPADDING_MSECS* (Fs/ 1000);
    deg_data( newlen)= 0;
end

pesq_mos= pesq_psychoacoustic_model (ref_data, ref_Nsamples, deg_data, ...
    deg_Nsamples );
|
||||
|
||||
|
||||
|
||||
|
@ -0,0 +1,920 @@
|
||||
function pesq_mos= pesq_psychoacoustic_model (ref_data, ref_Nsamples, deg_data, ...
|
||||
deg_Nsamples )
% PESQ_PSYCHOACOUSTIC_MODEL  Perceptual-model stage that produces the score.
%   ref_data, deg_data         - reference / degraded sample vectors; speech
%                                is addressed with an offset of
%                                SEARCHBUFFER*Downsample samples
%                                (assumed to be row vectors - TODO confirm)
%   ref_Nsamples, deg_Nsamples - sample counts; only their maximum is used
%   pesq_mos                   - 4.5 - D_WEIGHT*d_indicator - A_WEIGHT*a_indicator
%
%   Outline: frame both signals (Hann window, hop Nf/2), map each frame to
%   Nb band powers, compensate reference frequency response and degraded
%   gain, map to loudness, take the dead-zoned difference as disturbance,
%   optionally re-align "bad" intervals, then aggregate with Lpq_weight.
%   Reads many globals (Nb, Sl, Sp, Utt_*, Nutterances, Downsample,
%   SEARCHBUFFER, DATAPADDING_MSECS, Fs, Plot_Frame, CALIBRATE, ...) that
%   must have been set before this function is called.
|
||||
|
||||
global CALIBRATE Nfmax Nb Sl Sp
|
||||
global nr_of_hz_bands_per_bark_band centre_of_band_bark
|
||||
global width_of_band_hz centre_of_band_hz width_of_band_bark
|
||||
global pow_dens_correction_factor abs_thresh_power
|
||||
global Downsample SEARCHBUFFER DATAPADDING_MSECS Fs Nutterances
|
||||
global Utt_Start Utt_End Utt_Delay NUMBER_OF_PSQM_FRAMES_PER_SYLLABE
|
||||
global Fs Plot_Frame
|
||||
|
||||
% Plot_Frame= 75; % this is the frame whose spectrum will be plotted
|
||||
|
||||
FALSE= 0;
|
||||
TRUE= 1;
|
||||
% Written to a global because Lpq_weight reads it from there.
NUMBER_OF_PSQM_FRAMES_PER_SYLLABE= 20;
|
||||
|
||||
maxNsamples = max (ref_Nsamples, deg_Nsamples);
|
||||
% Frame length in samples; consecutive frames start Nf/2 samples apart.
Nf = Downsample * 8;
|
||||
MAX_NUMBER_OF_BAD_INTERVALS = 1000;
|
||||
|
||||
start_frame_of_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
|
||||
stop_frame_of_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
|
||||
start_sample_of_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
|
||||
stop_sample_of_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
|
||||
number_of_samples_in_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
|
||||
delay_in_samples_in_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
|
||||
number_of_bad_intervals= 0;
|
||||
there_is_a_bad_frame= FALSE;
|
||||
|
||||
Whanning= hann( Nf, 'periodic');
|
||||
Whanning= Whanning';
|
||||
|
||||
% Exponents for the band (F), syllable (S) and time (T) norms of the
% symmetric (D) and asymmetric (A) disturbance, plus the two weights used
% in the final score.
D_POW_F = 2;
|
||||
D_POW_S = 6;
|
||||
D_POW_T = 2;
|
||||
A_POW_F = 1;
|
||||
A_POW_S = 6;
|
||||
A_POW_T = 2;
|
||||
D_WEIGHT= 0.1;
|
||||
A_WEIGHT= 0.0309;
|
||||
|
||||
% Leading skip: advance one sample at a time until 5 consecutive |ref|
% samples sum to at least the threshold, or half the signal was scanned.
CRITERIUM_FOR_SILENCE_OF_5_SAMPLES = 500;
|
||||
samples_to_skip_at_start = 0;
|
||||
sum_of_5_samples= 0;
|
||||
while ((sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES) ...
|
||||
&& (samples_to_skip_at_start < maxNsamples / 2))
|
||||
sum_of_5_samples= sum( abs( ref_data( samples_to_skip_at_start...
|
||||
+ SEARCHBUFFER * Downsample + 1: samples_to_skip_at_start...
|
||||
+ SEARCHBUFFER * Downsample + 5)));
|
||||
|
||||
if (sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES)
|
||||
samples_to_skip_at_start = samples_to_skip_at_start+ 1;
|
||||
end
|
||||
end
|
||||
% fprintf( 'samples_to_skip_at_start is %d\n', samples_to_skip_at_start);
|
||||
|
||||
% Trailing skip: same test, scanning backwards from the end of the data.
samples_to_skip_at_end = 0;
|
||||
sum_of_5_samples= 0;
|
||||
while ((sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES) ...
|
||||
&& (samples_to_skip_at_end < maxNsamples / 2))
|
||||
sum_of_5_samples= sum( abs( ref_data( maxNsamples - ...
|
||||
SEARCHBUFFER* Downsample + DATAPADDING_MSECS* (Fs/ 1000) ...
|
||||
- samples_to_skip_at_end - 4: maxNsamples - ...
|
||||
SEARCHBUFFER* Downsample + DATAPADDING_MSECS* (Fs/ 1000) ...
|
||||
- samples_to_skip_at_end)));
|
||||
if (sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES)
|
||||
samples_to_skip_at_end = samples_to_skip_at_end+ 1;
|
||||
end
|
||||
end
|
||||
% fprintf( 'samples_to_skip_at_end is %d\n', samples_to_skip_at_end);
|
||||
|
||||
% Convert the skipped sample counts into 0-based first/last frame indices.
start_frame = floor( samples_to_skip_at_start/ (Nf/ 2));
|
||||
stop_frame = floor( (maxNsamples- 2* SEARCHBUFFER* Downsample ...
|
||||
+ DATAPADDING_MSECS* (Fs/ 1000)- samples_to_skip_at_end) ...
|
||||
/ (Nf/ 2))- 1;
|
||||
% number of frames in speech data plus DATAPADDING_MSECS
|
||||
% fprintf( 'start/end frame is %d/%d\n', start_frame, stop_frame);
|
||||
|
||||
D_disturbance= zeros( stop_frame+ 1, Nb);
|
||||
DA_disturbance= zeros( stop_frame+ 1, Nb);
|
||||
|
||||
% NOTE(review): power_ref / power_deg are not read again in this function.
power_ref = pow_of (ref_data, SEARCHBUFFER* Downsample, ...
|
||||
maxNsamples- SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000),...
|
||||
maxNsamples- 2* SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000));
|
||||
power_deg = pow_of (deg_data, SEARCHBUFFER * Downsample, ...
|
||||
maxNsamples- SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000),...
|
||||
maxNsamples- 2* SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000));
|
||||
% fprintf( 'ref/deg power is %f/%f\n', power_ref, power_deg);
|
||||
|
||||
hz_spectrum_ref = zeros( 1, Nf/ 2);
|
||||
hz_spectrum_deg = zeros( 1, Nf/ 2);
|
||||
frame_is_bad = zeros( 1, stop_frame + 1);
|
||||
smeared_frame_is_bad = zeros( 1, stop_frame + 1);
|
||||
silent = zeros( 1, stop_frame + 1);
|
||||
|
||||
pitch_pow_dens_ref = zeros( stop_frame + 1, Nb);
|
||||
pitch_pow_dens_deg = zeros( stop_frame + 1, Nb);
|
||||
|
||||
frame_was_skipped = zeros( 1, stop_frame + 1);
|
||||
frame_disturbance = zeros( 1, stop_frame + 1);
|
||||
frame_disturbance_asym_add = zeros( 1, stop_frame + 1);
|
||||
|
||||
avg_pitch_pow_dens_ref = zeros( 1, Nb);
|
||||
avg_pitch_pow_dens_deg = zeros( 1, Nb);
|
||||
loudness_dens_ref = zeros( 1, Nb);
|
||||
loudness_dens_deg = zeros( 1, Nb);
|
||||
deadzone = zeros( 1, Nb);
|
||||
disturbance_dens = zeros( 1, Nb);
|
||||
disturbance_dens_asym_add = zeros( 1, Nb);
|
||||
|
||||
time_weight = zeros( 1, stop_frame + 1);
|
||||
total_power_ref = zeros( 1, stop_frame + 1);
|
||||
|
||||
% fid= fopen( 'tmp_mat.txt', 'wt');
|
||||
|
||||
% Pass 1: band spectra of both signals for every frame, plus a silence flag
% derived from the reference frame.
for frame = 0: stop_frame
|
||||
start_sample_ref = 1+ SEARCHBUFFER * Downsample + frame* (Nf/ 2);
|
||||
hz_spectrum_ref= short_term_fft (Nf, ref_data, Whanning, ...
|
||||
start_sample_ref);
|
||||
|
||||
% Walk back to the last utterance that starts at or before this frame;
% its delay positions the degraded frame.
utt = Nutterances;
|
||||
while ((utt >= 1) && ((Utt_Start(utt)- 1)* Downsample+ 1 ...
|
||||
> start_sample_ref))
|
||||
utt= utt - 1;
|
||||
end
|
||||
|
||||
if (utt >= 1)
|
||||
delay = Utt_Delay(utt);
|
||||
else
|
||||
delay = Utt_Delay(1);
|
||||
end
|
||||
|
||||
start_sample_deg = start_sample_ref + delay;
|
||||
|
||||
% Transform the degraded frame only when the delayed window lies inside
% the data; otherwise use an all-zero spectrum.
if ((start_sample_deg > 0) && (start_sample_deg + Nf- 1 < ...
|
||||
maxNsamples+ DATAPADDING_MSECS* (Fs/ 1000)))
|
||||
hz_spectrum_deg= short_term_fft (Nf, deg_data, Whanning, ...
|
||||
start_sample_deg);
|
||||
else
|
||||
hz_spectrum_deg( 1: Nf/ 2)= 0;
|
||||
end
|
||||
|
||||
pitch_pow_dens_ref( frame+ 1, :)= freq_warping (...
|
||||
hz_spectrum_ref, Nb, frame);
|
||||
%peak = maximum_of (pitch_pow_dens_ref, 0, Nb);
|
||||
pitch_pow_dens_deg( frame+ 1, :)= freq_warping (...
|
||||
hz_spectrum_deg, Nb, frame);
|
||||
|
||||
total_audible_pow_ref = total_audible (frame, pitch_pow_dens_ref, 1E2);
|
||||
total_audible_pow_deg = total_audible (frame, pitch_pow_dens_deg, 1E2);
|
||||
silent(frame+ 1) = (total_audible_pow_ref < 1E7);
|
||||
|
||||
|
||||
end
|
||||
% fclose( fid);
|
||||
|
||||
% Per-band power summed over non-silent frames, divided by a fixed frame
% count (see time_avg_audible_of).
avg_pitch_pow_dens_ref= time_avg_audible_of (stop_frame + 1, ...
|
||||
silent, pitch_pow_dens_ref, floor((maxNsamples- 2* SEARCHBUFFER* ...
|
||||
Downsample+ DATAPADDING_MSECS* (Fs/ 1000))/ (Nf / 2))- 1);
|
||||
avg_pitch_pow_dens_deg= time_avg_audible_of (stop_frame + 1, ...
|
||||
silent, pitch_pow_dens_deg, floor((maxNsamples- 2* SEARCHBUFFER* ...
|
||||
Downsample+ DATAPADDING_MSECS* (Fs/ 1000))/ (Nf/ 2))- 1);
|
||||
|
||||
% fid= fopen( 'tmp_mat.txt', 'wt');
|
||||
% fprintf( fid, '%f\n', avg_pitch_pow_dens_deg);
|
||||
% fclose( fid);
|
||||
|
||||
% Unless calibrating, scale each reference band by the clamped ratio of the
% degraded to the reference average (see freq_resp_compensation).
if (CALIBRATE== 0)
|
||||
pitch_pow_dens_ref= freq_resp_compensation (stop_frame + 1, ...
|
||||
pitch_pow_dens_ref, avg_pitch_pow_dens_ref, ...
|
||||
avg_pitch_pow_dens_deg, 1000);
|
||||
if (Plot_Frame>= 0) % plot pitch_pow_dens_ref
|
||||
figure;
|
||||
subplot( 1, 2, 1);
|
||||
plot( centre_of_band_hz, 10* log10( eps+ ...
|
||||
pitch_pow_dens_ref( Plot_Frame+ 1, :)));
|
||||
axis( [0 Fs/2 0 95]); %xlabel( 'Hz'); ylabel( 'Db');
|
||||
title( 'reference signal bark spectrum with frequency compensation');
|
||||
subplot( 1, 2, 2);
|
||||
plot( centre_of_band_hz, 10* log10( eps+ ...
|
||||
pitch_pow_dens_deg( Plot_Frame+ 1, :)));
|
||||
axis( [0 Fs/2 0 95]); %xlabel( 'Hz'); ylabel( 'Db');
|
||||
title( 'degraded signal bark spectrum');
|
||||
end
|
||||
|
||||
end
|
||||
% tmp1= pitch_pow_dens_ref';
|
||||
|
||||
|
||||
MAX_SCALE = 5.0;
|
||||
MIN_SCALE = 3e-4;
|
||||
oldScale = 1;
|
||||
THRESHOLD_BAD_FRAMES = 30;
|
||||
% Pass 2: per-frame gain compensation of the degraded spectrum, loudness
% mapping, and symmetric / asymmetric disturbance.
for frame = 0: stop_frame
|
||||
|
||||
total_audible_pow_ref = total_audible (frame, pitch_pow_dens_ref, 1);
|
||||
total_audible_pow_deg = total_audible (frame, pitch_pow_dens_deg, 1);
|
||||
total_power_ref (1+ frame) = total_audible_pow_ref;
|
||||
|
||||
% Gain that matches degraded to reference audible power, smoothed with
% the previous frame's (unclamped) value and then clamped.
scale = (total_audible_pow_ref + 5e3)/ (total_audible_pow_deg + 5e3);
|
||||
if (frame > 0)
|
||||
scale = 0.2 * oldScale + 0.8 * scale;
|
||||
end
|
||||
oldScale = scale;
|
||||
|
||||
if (scale > MAX_SCALE)
|
||||
scale = MAX_SCALE;
|
||||
elseif (scale < MIN_SCALE)
|
||||
scale = MIN_SCALE;
|
||||
end
|
||||
|
||||
pitch_pow_dens_deg( 1+ frame, :) = ...
|
||||
pitch_pow_dens_deg( 1+ frame, :) * scale;
|
||||
|
||||
if (frame== Plot_Frame)
|
||||
figure;
|
||||
subplot( 1, 2, 1);
|
||||
plot( centre_of_band_hz, 10* log10( eps+ ...
|
||||
pitch_pow_dens_ref( Plot_Frame+ 1, :)));
|
||||
axis( [0 Fs/2 0 95]); %xlabel( 'Hz'); ylabel( 'Db');
|
||||
subplot( 1, 2, 2);
|
||||
plot( centre_of_band_hz, 10* log10( eps+ ...
|
||||
pitch_pow_dens_deg( Plot_Frame+ 1, :)));
|
||||
axis( [0 Fs/2 0 95]); %xlabel( 'Hz'); ylabel( 'Db');
|
||||
end
|
||||
|
||||
loudness_dens_ref = intensity_warping_of (frame, pitch_pow_dens_ref);
|
||||
loudness_dens_deg = intensity_warping_of (frame, pitch_pow_dens_deg);
|
||||
disturbance_dens = loudness_dens_deg - loudness_dens_ref;
|
||||
|
||||
if (frame== Plot_Frame)
|
||||
figure;
|
||||
subplot( 1, 2, 1);
|
||||
plot( centre_of_band_hz, 10* log10( eps+ ...
|
||||
loudness_dens_ref));
|
||||
axis( [0 Fs/2 0 15]); %xlabel( 'Hz'); ylabel( 'Db');
|
||||
title( 'reference signal loudness density');
|
||||
subplot( 1, 2, 2);
|
||||
plot( centre_of_band_hz, 10* log10( eps+ ...
|
||||
loudness_dens_deg));
|
||||
axis( [0 Fs/2 0 15]); %xlabel( 'Hz'); ylabel( 'Db');
|
||||
title( 'degraded signal loudness density');
|
||||
end
|
||||
|
||||
% Dead zone per band: a quarter of the smaller of the two loudness values.
for band =1: Nb
|
||||
deadzone (band) = 0.25* min (loudness_dens_deg (band), ...
|
||||
loudness_dens_ref (band));
|
||||
end
|
||||
|
||||
% Disturbances within +/- dead zone become 0; larger ones are moved
% toward zero by the dead-zone width.
for band = 1: Nb
|
||||
d = disturbance_dens (band);
|
||||
m = deadzone (band);
|
||||
|
||||
if (d > m)
|
||||
disturbance_dens (band) = disturbance_dens (band)- m;
|
||||
% disturbance_dens (band) = d- m;
|
||||
else
|
||||
if (d < -m)
|
||||
disturbance_dens (band) = disturbance_dens (band)+ m;
|
||||
% disturbance_dens (band) = d+ m;
|
||||
else
|
||||
disturbance_dens (band) = 0;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
if (frame== Plot_Frame)
|
||||
figure;
|
||||
subplot( 1, 2, 1);
|
||||
plot( centre_of_band_hz, disturbance_dens);
|
||||
axis( [0 Fs/2 -1 50]); %xlabel( 'Hz'); ylabel( 'Db');
|
||||
title( 'disturbance');
|
||||
end
|
||||
D_disturbance( frame+ 1, :)= disturbance_dens;
|
||||
|
||||
frame_disturbance (1+ frame) = pseudo_Lp (disturbance_dens, D_POW_F);
|
||||
if (frame_disturbance (1+ frame) > THRESHOLD_BAD_FRAMES)
|
||||
there_is_a_bad_frame = TRUE;
|
||||
end
|
||||
|
||||
disturbance_dens= multiply_with_asymmetry_factor (...
|
||||
disturbance_dens, frame, pitch_pow_dens_ref, pitch_pow_dens_deg);
|
||||
|
||||
if (frame== Plot_Frame)
|
||||
subplot( 1, 2, 2);
|
||||
plot( centre_of_band_hz, disturbance_dens);
|
||||
axis( [0 Fs/2 -1 50]); %xlabel( 'Hz'); ylabel( 'Db');
|
||||
title( 'disturbance after asymmetry processing');
|
||||
end
|
||||
DA_disturbance( frame+ 1, :)= disturbance_dens;
|
||||
|
||||
|
||||
frame_disturbance_asym_add (1+ frame) = ...
|
||||
pseudo_Lp (disturbance_dens, A_POW_F);
|
||||
end
|
||||
% fid= fopen( 'tmp_mat.txt', 'wt');
|
||||
% fprintf( fid, '%f\n', frame_disturbance);
|
||||
% fclose( fid);
|
||||
|
||||
frame_was_skipped (1: 1+ stop_frame) = FALSE;
|
||||
|
||||
% Between consecutive utterances: when the delay drops by more than half a
% frame, zero the disturbance of the frames around the boundary.
for utt = 2: Nutterances
|
||||
frame1 = floor (((Utt_Start(utt)- 1- SEARCHBUFFER )* Downsample+ 1+ ...
|
||||
Utt_Delay(utt))/ (Nf/ 2));
|
||||
j = floor( floor(((Utt_End(utt-1)- 1- SEARCHBUFFER)* Downsample+ 1+ ...
|
||||
Utt_Delay(utt-1)))/(Nf/ 2));
|
||||
delay_jump = Utt_Delay(utt) - Utt_Delay(utt-1);
|
||||
if (frame1 > j)
|
||||
frame1 = j;
|
||||
elseif (frame1 < 0)
|
||||
frame1 = 0;
|
||||
end
|
||||
% fprintf( 'frame1, j, delay_jump is %d, %d, %d\n', frame1, ...
|
||||
% j, delay_jump);
|
||||
|
||||
if (delay_jump < -(Nf/ 2))
|
||||
frame2 = floor (((Utt_Start(utt)- 1- SEARCHBUFFER)* Downsample+ 1 ...
|
||||
+ max (0, abs (delay_jump)))/ (Nf/ 2)) + 1;
|
||||
|
||||
for frame = frame1: frame2
|
||||
if (frame < stop_frame)
|
||||
frame_was_skipped (1+ frame) = TRUE;
|
||||
frame_disturbance (1+ frame) = 0;
|
||||
frame_disturbance_asym_add (1+ frame) = 0;
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
% Build a delay-compensated copy of the degraded signal sample by sample,
% clamping the source index to the non-buffer part of the data.
nn = DATAPADDING_MSECS* (Fs/ 1000) + maxNsamples;
|
||||
tweaked_deg = zeros( 1, nn);
|
||||
% fprintf( 'nn is %d\n', nn);
|
||||
|
||||
for i= SEARCHBUFFER* Downsample+ 1: nn- SEARCHBUFFER* Downsample
|
||||
utt = Nutterances;
|
||||
|
||||
while ((utt >= 1) && ((Utt_Start (utt)- 1)* Downsample> i))
|
||||
utt = utt- 1;
|
||||
end
|
||||
if (utt >= 1)
|
||||
delay = Utt_Delay (utt);
|
||||
else
|
||||
delay = Utt_Delay (1);
|
||||
end
|
||||
|
||||
j = i + delay;
|
||||
if (j < SEARCHBUFFER * Downsample+ 1)
|
||||
j = SEARCHBUFFER * Downsample+ 1;
|
||||
end
|
||||
if (j > nn - SEARCHBUFFER * Downsample)
|
||||
j = nn - SEARCHBUFFER * Downsample;
|
||||
end
|
||||
tweaked_deg (i) = deg_data (j);
|
||||
end
|
||||
|
||||
% Bad-frame handling: locate runs of frames whose disturbance exceeded
% THRESHOLD_BAD_FRAMES, estimate a local delay for each run, recompute the
% disturbance there and keep the smaller of the old and new values.
if (there_is_a_bad_frame)
|
||||
|
||||
for frame = 0: stop_frame
|
||||
frame_is_bad (1+ frame) = (frame_disturbance (1+ frame)...
|
||||
> THRESHOLD_BAD_FRAMES);
|
||||
smeared_frame_is_bad (1+ frame) = FALSE;
|
||||
end
|
||||
frame_is_bad (1) = FALSE;
|
||||
SMEAR_RANGE = 2;
|
||||
|
||||
% A frame is marked when a bad frame exists both among itself and the
% SMEAR_RANGE frames before it, and among itself and the SMEAR_RANGE
% frames after it (minimum of the two maxima).
for frame = SMEAR_RANGE: stop_frame- 1- SMEAR_RANGE
|
||||
max_itself_and_left = frame_is_bad (1+ frame);
|
||||
max_itself_and_right = frame_is_bad (1+ frame);
|
||||
|
||||
for i = -SMEAR_RANGE: 0
|
||||
if (max_itself_and_left < frame_is_bad (1+ frame+ i))
|
||||
max_itself_and_left = frame_is_bad (1+ frame+ i);
|
||||
end
|
||||
end
|
||||
|
||||
for i = 0: SMEAR_RANGE
|
||||
if (max_itself_and_right < frame_is_bad (1+ frame + i))
|
||||
max_itself_and_right = frame_is_bad (1+ frame + i);
|
||||
end
|
||||
end
|
||||
|
||||
mini = max_itself_and_left;
|
||||
if (mini > max_itself_and_right)
|
||||
mini = max_itself_and_right;
|
||||
end
|
||||
|
||||
smeared_frame_is_bad (1+ frame) = mini;
|
||||
end
|
||||
|
||||
MINIMUM_NUMBER_OF_BAD_FRAMES_IN_BAD_INTERVAL = 5;
|
||||
number_of_bad_intervals = 0;
|
||||
frame = 0;
|
||||
% Collect runs of marked frames. Bounds are stored 1-based (1+ frame).
% A run is only counted when it ends before stop_frame and spans at
% least MINIMUM_NUMBER_OF_BAD_FRAMES_IN_BAD_INTERVAL frames; otherwise
% its slot is overwritten by the next run.
while (frame <= stop_frame)
|
||||
while ((frame <= stop_frame) && (~smeared_frame_is_bad (1+ frame)))
|
||||
frame= frame+ 1;
|
||||
end
|
||||
|
||||
if (frame <= stop_frame)
|
||||
start_frame_of_bad_interval(1+ number_of_bad_intervals)= ...
|
||||
1+ frame;
|
||||
|
||||
while ((frame <= stop_frame) && (...
|
||||
smeared_frame_is_bad (1+ frame)))
|
||||
frame= frame+ 1;
|
||||
end
|
||||
|
||||
if (frame <= stop_frame)
|
||||
stop_frame_of_bad_interval(1+ number_of_bad_intervals)= ...
|
||||
1+ frame;
|
||||
if (stop_frame_of_bad_interval(1+ number_of_bad_intervals)- ...
|
||||
start_frame_of_bad_interval(1+ number_of_bad_intervals)...
|
||||
>= MINIMUM_NUMBER_OF_BAD_FRAMES_IN_BAD_INTERVAL)
|
||||
number_of_bad_intervals= number_of_bad_intervals+ 1;
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
% Translate the frame bounds of every interval into sample bounds.
for bad_interval = 0: number_of_bad_intervals - 1
|
||||
start_sample_of_bad_interval(1+ bad_interval) = ...
|
||||
(start_frame_of_bad_interval(1+ bad_interval)- 1) * (Nf/ 2) ...
|
||||
+ SEARCHBUFFER * Downsample+ 1;
|
||||
stop_sample_of_bad_interval(1+ bad_interval) = ...
|
||||
(stop_frame_of_bad_interval(1+ bad_interval)- 1) * (Nf/ 2) ...
|
||||
+ Nf + SEARCHBUFFER* Downsample;
|
||||
if (stop_frame_of_bad_interval(1+ bad_interval) > stop_frame+ 1)
|
||||
stop_frame_of_bad_interval(1+ bad_interval) = stop_frame+ 1;
|
||||
end
|
||||
|
||||
number_of_samples_in_bad_interval(1+ bad_interval) = ...
|
||||
stop_sample_of_bad_interval(1+ bad_interval) - ...
|
||||
start_sample_of_bad_interval(1+ bad_interval)+ 1;
|
||||
end
|
||||
% fprintf( 'number of bad intervals %d\n', number_of_bad_intervals);
|
||||
% fprintf( '%d %d\n', number_of_samples_in_bad_interval(1), ...
|
||||
% number_of_samples_in_bad_interval(2));
|
||||
% fprintf( '%d %d\n', start_sample_of_bad_interval(1), ...
|
||||
% start_sample_of_bad_interval(2));
|
||||
|
||||
SEARCH_RANGE_IN_TRANSFORM_LENGTH = 4;
|
||||
search_range_in_samples= SEARCH_RANGE_IN_TRANSFORM_LENGTH * Nf;
|
||||
|
||||
% Per interval: zero-pad the reference segment by the search range on
% both sides, take the matching stretch of tweaked_deg, and estimate
% the delay between them with compute_delay.
for bad_interval= 0: number_of_bad_intervals- 1
|
||||
ref = zeros (1, 2 * search_range_in_samples + ...
|
||||
number_of_samples_in_bad_interval (1+ bad_interval));
|
||||
deg = zeros (1, 2 * search_range_in_samples + ...
|
||||
number_of_samples_in_bad_interval (1+ bad_interval));
|
||||
|
||||
ref(1: search_range_in_samples) = 0;
|
||||
|
||||
ref (search_range_in_samples+ 1: search_range_in_samples+ ...
|
||||
number_of_samples_in_bad_interval (1+ bad_interval)) = ...
|
||||
ref_data (start_sample_of_bad_interval( 1+ bad_interval) + 1: ...
|
||||
start_sample_of_bad_interval( 1+ bad_interval) + ...
|
||||
number_of_samples_in_bad_interval (1+ bad_interval));
|
||||
|
||||
ref (search_range_in_samples + ...
|
||||
number_of_samples_in_bad_interval (1+ bad_interval) + 1: ...
|
||||
search_range_in_samples + ...
|
||||
number_of_samples_in_bad_interval (1+ bad_interval) + ...
|
||||
search_range_in_samples) = 0;
|
||||
|
||||
for i = 0: 2 * search_range_in_samples + ...
|
||||
number_of_samples_in_bad_interval (1+ bad_interval) - 1
|
||||
j = start_sample_of_bad_interval (1+ bad_interval) - ...
|
||||
search_range_in_samples + i;
|
||||
nn = maxNsamples - SEARCHBUFFER * Downsample + ...
|
||||
DATAPADDING_MSECS * (Fs / 1000);
|
||||
if (j <= SEARCHBUFFER * Downsample)
|
||||
j = SEARCHBUFFER * Downsample+ 1;
|
||||
end
|
||||
if (j > nn)
|
||||
j = nn;
|
||||
end
|
||||
deg (1+ i) = tweaked_deg (j);
|
||||
end
|
||||
|
||||
[delay_in_samples, best_correlation]= compute_delay ...
|
||||
(1, 2 * search_range_in_samples + ...
|
||||
number_of_samples_in_bad_interval (1+ bad_interval), ...
|
||||
search_range_in_samples, ref, deg);
|
||||
delay_in_samples_in_bad_interval (1+ bad_interval) = ...
|
||||
delay_in_samples;
|
||||
% fprintf( 'delay_in_samples, best_correlation is \n\t%d, %f\n', ...
|
||||
% delay_in_samples, best_correlation);
|
||||
%
|
||||
% Discard the estimate when the correlation peak is weak.
if (best_correlation < 0.5)
|
||||
delay_in_samples_in_bad_interval (1+ bad_interval) = 0;
|
||||
end
|
||||
end
|
||||
|
||||
% Re-time the bad intervals with their local delay, then redo the
% spectrum / gain / loudness / disturbance steps for those frames only.
if (number_of_bad_intervals > 0)
|
||||
doubly_tweaked_deg = tweaked_deg( 1: maxNsamples + ...
|
||||
DATAPADDING_MSECS * (Fs / 1000));
|
||||
for bad_interval= 0: number_of_bad_intervals- 1
|
||||
delay = delay_in_samples_in_bad_interval (1+ bad_interval);
|
||||
|
||||
for i = start_sample_of_bad_interval (1+ bad_interval): ...
|
||||
stop_sample_of_bad_interval (1+ bad_interval)
|
||||
j = i + delay;
|
||||
if (j < 1)
|
||||
j = 1;
|
||||
end
|
||||
if (j > maxNsamples)
|
||||
j = maxNsamples;
|
||||
end
|
||||
h = tweaked_deg (j);
|
||||
doubly_tweaked_deg (i) = h;
|
||||
end
|
||||
end
|
||||
|
||||
% deg_data is swapped for the re-timed signal only while the bad
% intervals are reprocessed; it is restored further below.
untweaked_deg = deg_data;
|
||||
deg_data = doubly_tweaked_deg;
|
||||
|
||||
for bad_interval= 0: number_of_bad_intervals- 1
|
||||
for frame = start_frame_of_bad_interval (1+ bad_interval): ...
|
||||
stop_frame_of_bad_interval (1+ bad_interval)- 1
|
||||
% Interval bounds are stored 1-based; convert to the 0-based frame
% index used by the rest of the function.
frame= frame- 1;
|
||||
start_sample_ref = SEARCHBUFFER * Downsample + ...
|
||||
frame * Nf / 2+ 1;
|
||||
start_sample_deg = start_sample_ref;
|
||||
hz_spectrum_deg= short_term_fft (Nf, deg_data, ...
|
||||
Whanning, start_sample_deg);
|
||||
pitch_pow_dens_deg( 1+ frame, :)= freq_warping (...
|
||||
hz_spectrum_deg, Nb, frame);
|
||||
end
|
||||
|
||||
oldScale = 1;
|
||||
for frame = start_frame_of_bad_interval (1+ bad_interval): ...
|
||||
stop_frame_of_bad_interval (1+ bad_interval)- 1
|
||||
frame= frame- 1;
|
||||
% see implementation for detail why 1 needed to be
|
||||
% subtracted
|
||||
total_audible_pow_ref = total_audible (frame, ...
|
||||
pitch_pow_dens_ref, 1);
|
||||
total_audible_pow_deg = total_audible (frame, ...
|
||||
pitch_pow_dens_deg, 1);
|
||||
scale = (total_audible_pow_ref + 5e3) / ...
|
||||
(total_audible_pow_deg + 5e3);
|
||||
if (frame > 0)
|
||||
scale = 0.2 * oldScale + 0.8*scale;
|
||||
end
|
||||
oldScale = scale;
|
||||
if (scale > MAX_SCALE)
|
||||
scale = MAX_SCALE;
|
||||
end
|
||||
if (scale < MIN_SCALE)
|
||||
scale = MIN_SCALE;
|
||||
end
|
||||
|
||||
pitch_pow_dens_deg (1+ frame, :) = ...
|
||||
pitch_pow_dens_deg (1+ frame, :)* scale;
|
||||
loudness_dens_ref= intensity_warping_of (frame, ...
|
||||
pitch_pow_dens_ref);
|
||||
loudness_dens_deg= intensity_warping_of (frame, ...
|
||||
pitch_pow_dens_deg);
|
||||
disturbance_dens = loudness_dens_deg - loudness_dens_ref;
|
||||
|
||||
for band = 1: Nb
|
||||
deadzone(band) = min (loudness_dens_deg(band), ...
|
||||
loudness_dens_ref(band));
|
||||
deadzone(band) = deadzone(band)* 0.25;
|
||||
end
|
||||
|
||||
for band = 1: Nb
|
||||
d = disturbance_dens (band);
|
||||
m = deadzone (band);
|
||||
|
||||
if (d > m)
|
||||
disturbance_dens (band) = ...
|
||||
disturbance_dens (band)- m;
|
||||
else
|
||||
if (d < -m)
|
||||
disturbance_dens (band) = ...
|
||||
disturbance_dens (band)+ m;
|
||||
else
|
||||
disturbance_dens (band) = 0;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
% Keep whichever disturbance is smaller: the first-pass value or the
% one obtained after re-alignment.
frame_disturbance( 1+ frame) = min (...
|
||||
frame_disturbance( 1+ frame), pseudo_Lp(...
|
||||
disturbance_dens, D_POW_F));
|
||||
disturbance_dens= multiply_with_asymmetry_factor ...
|
||||
(disturbance_dens, frame, pitch_pow_dens_ref, ...
|
||||
pitch_pow_dens_deg);
|
||||
frame_disturbance_asym_add(1+ frame) = min (...
|
||||
frame_disturbance_asym_add(1+ frame), ...
|
||||
pseudo_Lp (disturbance_dens, A_POW_F));
|
||||
end
|
||||
end
|
||||
deg_data = untweaked_deg;
|
||||
end
|
||||
end
|
||||
|
||||
% Time weights: 1 for every frame unless there are more than 1000 frames,
% in which case the weight is a linear function of the frame index.
for frame = 0: stop_frame
|
||||
h = 1;
|
||||
if (stop_frame + 1 > 1000)
|
||||
n = floor( (maxNsamples - 2 * SEARCHBUFFER * Downsample)...
|
||||
/ (Nf / 2)) - 1;
|
||||
timeWeightFactor = (n - 1000) / 5500;
|
||||
if (timeWeightFactor > 0.5)
|
||||
timeWeightFactor = 0.5;
|
||||
end
|
||||
h = (1.0 - timeWeightFactor) + timeWeightFactor * frame / n;
|
||||
end
|
||||
|
||||
time_weight (1 +frame) = h;
|
||||
end
|
||||
|
||||
% fid= fopen( 'tmp_mat1.txt', 'at');
|
||||
% fprintf( '\n');
|
||||
% Divide both disturbances by a weak power (0.04) of the reference frame
% power and cap the results at 45.
for frame = 0: stop_frame
|
||||
h = ((total_power_ref (1+ frame) + 1e5) / 1e7)^ 0.04;
|
||||
% if (frame== 118)
|
||||
% fprintf( '%f\n', h);
|
||||
% fprintf( '%f\n', frame_disturbance( 1+ frame));
|
||||
% end
|
||||
frame_disturbance( 1+ frame) = frame_disturbance( 1+ frame)/ h;
|
||||
|
||||
% if (frame== 118)
|
||||
% fprintf( '%f\n', frame_disturbance( 1+ frame));
|
||||
% end
|
||||
%
|
||||
frame_disturbance_asym_add( 1+ frame) = ...
|
||||
frame_disturbance_asym_add( 1+ frame)/ h;
|
||||
if (frame_disturbance( 1+ frame) > 45)
|
||||
frame_disturbance( 1+ frame) = 45;
|
||||
end
|
||||
if (frame_disturbance_asym_add( 1+ frame)> 45)
|
||||
frame_disturbance_asym_add( 1+ frame) = 45;
|
||||
end
|
||||
end
|
||||
% fclose ( fid);
|
||||
|
||||
% Aggregate the per-frame disturbances over syllables and time, then map
% the two indicators linearly to the returned score.
d_indicator = Lpq_weight (start_frame, stop_frame, ...
|
||||
D_POW_S, D_POW_T, frame_disturbance, time_weight);
|
||||
a_indicator = Lpq_weight (start_frame, stop_frame, ...
|
||||
A_POW_S, A_POW_T, frame_disturbance_asym_add, time_weight);
|
||||
|
||||
pesq_mos = 4.5 - D_WEIGHT * d_indicator - A_WEIGHT * a_indicator;
|
||||
|
||||
% NOTE(review): this test uses "> 0" while the earlier plot block uses
% ">= 0", so Plot_Frame == 0 shows the per-frame plots but not this mesh.
if (Plot_Frame> 0)
|
||||
figure;
|
||||
subplot( 1, 2, 1);
|
||||
mesh( 0: stop_frame, centre_of_band_hz, D_disturbance');
|
||||
title( 'disturbance');
|
||||
subplot( 1, 2, 2);
|
||||
mesh( 0: stop_frame, centre_of_band_hz, DA_disturbance');
|
||||
title( 'disturbance after asymmetry processing');
|
||||
end
|
||||
|
||||
% fid= fopen( 'tmp_mat.txt', 'wt');
|
||||
% fprintf( fid, 'time_weight\n');
|
||||
% fprintf( fid, '%f\n', time_weight);
|
||||
% fprintf( fid, 'frame_disturbance:\n');
|
||||
% fprintf( fid, '%f\n', frame_disturbance);
|
||||
% fprintf( fid, 'frame_disturbance_asym_add\n');
|
||||
% fprintf( fid, '%f\n', frame_disturbance_asym_add);
|
||||
% fclose( fid);
|
||||
|
||||
function result_time= Lpq_weight(start_frame, stop_frame, ...
    power_syllable, power_time, frame_disturbance, time_weight)
% LPQ_WEIGHT  Two-stage norm of per-frame disturbances.
%   Frames are grouped into overlapping "syllables" of
%   NUMBER_OF_PSQM_FRAMES_PER_SYLLABE frames, stepping by half that length
%   from start_frame to stop_frame (0-based frame indices).  Inside each
%   syllable the disturbances are combined with exponent power_syllable;
%   the syllable values are then combined across time with exponent
%   power_time, weighted by time_weight.
%
%   Frames beyond stop_frame contribute nothing to a syllable's sum but are
%   still counted in its divisor.

global NUMBER_OF_PSQM_FRAMES_PER_SYLLABE

syllable_len = NUMBER_OF_PSQM_FRAMES_PER_SYLLABE;
weighted_sum = 0;
weight_total = 0;

for syl_start = start_frame: syllable_len/2: stop_frame
    syl_frames = syl_start: syl_start + syllable_len - 1;

    % accumulate only the frames that really exist
    acc = 0;
    for k = syl_frames(syl_frames <= stop_frame)
        acc = acc + frame_disturbance(1 + k)^ power_syllable;
    end

    % the divisor is the nominal syllable length, not the number summed
    acc = acc / numel(syl_frames);
    syl_norm = acc^ (1/power_syllable);

    w = time_weight(1 + syl_start - start_frame);
    weighted_sum = weighted_sum + (w * syl_norm)^ power_time;
    weight_total = weight_total + w^ power_time;
end

result_time = weighted_sum / weight_total;
result_time = result_time^ (1/ power_time);
|
||||
|
||||
|
||||
function [best_delay, max_correlation] = compute_delay (...
    start_sample, stop_sample, search_range, ...
    time_series1, time_series2)
% COMPUTE_DELAY  Estimate the lag between the envelopes of two signals.
%   The absolute values of time_series1 and time_series2 over
%   start_sample..stop_sample are zero-padded to a power of two and
%   cross-correlated through the FFT.  Lags -search_range..search_range-1
%   are then scanned for the largest normalised correlation.
%
%   best_delay       - lag of the correlation peak minus 1 (this offset is
%                      kept from the original code), or 0 when either
%                      segment is effectively silent
%   max_correlation  - peak normalised correlation, or 0 when either
%                      segment is effectively silent

n = stop_sample - start_sample+ 1;
power_of_2 = 2^ (ceil( log2( 2 * n)));

power1 = pow_of (time_series1, start_sample, stop_sample, n)* ...
    n/ power_of_2;
power2 = pow_of (time_series2, start_sample, stop_sample, n)* ...
    n/ power_of_2;
normalization = sqrt (power1 * power2);

% With (almost) no power in either segment the normalisation is ~0 and the
% correlation is meaningless: report "no delay, no confidence" and stop.
% The original code set these values but then fell through and overwrote
% them; the early return below is the fix.
if ((power1 <= 1e-6) || (power2 <= 1e-6))
    max_correlation = 0;
    best_delay= 0;
    return;
end

x1 = zeros( 1, power_of_2);
x2 = zeros( 1, power_of_2);
x1( 1: n)= abs( time_series1( start_sample: stop_sample));
x2( 1: n)= abs( time_series2( start_sample: stop_sample));

% circular cross-correlation: ifft( conj(X1) .* X2 )
x1_fft= fft( x1, power_of_2)/ power_of_2;
x2_fft= fft( x2, power_of_2);
y= ifft( conj( x1_fft).* x2_fft, power_of_2);

best_delay = 0;
max_correlation = 0;

% Negative lags live at the end of the circular result, non-negative lags
% at the beginning.  Strict ">" keeps the first peak on ties.
for lag = -search_range: search_range- 1
    if (lag < 0)
        idx = 1+ lag+ power_of_2;
    else
        idx = 1+ lag;
    end
    h = abs (y (idx)) / normalization;
    if (h > max_correlation)
        max_correlation = h;
        best_delay= lag;
    end
end

% offset applied by the original code on the normal path; left unchanged
best_delay= best_delay- 1;
|
||||
|
||||
function mod_disturbance_dens= multiply_with_asymmetry_factor (...
    disturbance_dens, frame, pitch_pow_dens_ref, pitch_pow_dens_deg)
% MULTIPLY_WITH_ASYMMETRY_FACTOR  Scale each band's disturbance by an
%   asymmetry factor derived from the degraded/reference power ratio.
%   frame is a 0-based frame index into the rows of the two power
%   matrices.  The factor is ((deg+50)/(ref+50))^1.2, set to 0 when it is
%   below 3 and limited to 12 from above.

global Nb

row = frame + 1;
band = 1;
while (band <= Nb)
    asym = ((pitch_pow_dens_deg (row, band) + 50) ...
        / (pitch_pow_dens_ref (row, band) + 50))^ 1.2;

    if (asym < 3)
        asym = 0.0;      % small factors are discarded entirely
    elseif (asym > 12)
        asym = 12;       % large factors are capped
    end

    mod_disturbance_dens (band) = disturbance_dens (band) * asym;
    band = band + 1;
end
|
||||
|
||||
|
||||
function loudness_dens = intensity_warping_of (...
    frame, pitch_pow_dens)
% INTENSITY_WARPING_OF  Map one frame of band powers to loudness densities.
%   frame is a 0-based index into the rows of pitch_pow_dens.  Bands whose
%   power does not exceed abs_thresh_power get loudness 0; the others are
%   compressed with a Zwicker-style power law.  The exponent is raised for
%   bands whose centre is below 4 on the centre_of_band_bark scale.  Every
%   result is multiplied by the global scale factor Sl.

global abs_thresh_power Sl Nb centre_of_band_bark
ZWICKER_POWER= 0.23;

row = frame + 1;
for band = 1: Nb
    thr = abs_thresh_power (band);
    band_power = pitch_pow_dens (row, band);
    centre = centre_of_band_bark (band);

    % exponent correction: 6/(centre+2), limited to 2, only below 4
    corr = 1;
    if (centre < 4)
        corr = 6 / (centre + 2);
        if (corr > 2)
            corr = 2;
        end
    end
    zwicker_exp = ZWICKER_POWER * (corr^ 0.15);

    loud = 0;
    if (band_power > thr)
        loud = ((thr / 0.5)^ zwicker_exp)...
            * ((0.5 + 0.5 * band_power / thr)^ zwicker_exp- 1);
    end

    loudness_dens (band) = loud* Sl;
end
|
||||
|
||||
function result= pseudo_Lp (x, p)
% PSEUDO_LP  Width-weighted Lp-style norm over bands 2..Nb.
%   Each |x(band)| is multiplied by width_of_band_bark(band) and raised to
%   the power p.  The sum is divided by the total width, the p-th root is
%   taken, and the result is multiplied by the total width again.
%   Band 1 is not included.

global Nb width_of_band_bark

width_sum = 0;
power_sum = 0;
band = 2;
while (band <= Nb)
    band_width = width_of_band_bark (band);
    power_sum = power_sum + (abs (x (band)) * band_width)^ p;
    width_sum = width_sum + band_width;
    band = band + 1;
end

result = (power_sum/ width_sum)^ (1/p);
result = result* width_sum;
|
||||
|
||||
|
||||
function mod_pitch_pow_dens_ref= freq_resp_compensation (number_of_frames, ...
    pitch_pow_dens_ref, avg_pitch_pow_dens_ref, ...
    avg_pitch_pow_dens_deg, constant)
% FREQ_RESP_COMPENSATION  Scale every band of the reference spectrum.
%   For each band the gain is
%       (avg_pitch_pow_dens_deg + constant) / (avg_pitch_pow_dens_ref + constant)
%   limited to the range [0.01, 100].  The first number_of_frames rows of
%   pitch_pow_dens_ref are multiplied by that gain, band by band.

global Nb

% first pass: one limited gain per band
for band = 1: Nb
    g = (avg_pitch_pow_dens_deg (band) + constant) / ...
        (avg_pitch_pow_dens_ref (band) + constant);
    if (g < 0.01)
        g = 0.01;
    elseif (g > 100.0)
        g = 100.0;
    end
    band_gain (band) = g;
end

% second pass: apply the gains frame by frame
for frame = 1: number_of_frames
    for band = 1: Nb
        mod_pitch_pow_dens_ref(frame, band) = ...
            pitch_pow_dens_ref(frame, band) * band_gain (band);
    end
end
|
||||
|
||||
|
||||
|
||||
function avg_pitch_pow_dens= time_avg_audible_of(number_of_frames, ...
    silent, pitch_pow_dens, total_number_of_frames)
% TIME_AVG_AUDIBLE_OF  Per-band average of clearly audible power.
%   For every band, sums pitch_pow_dens(frame, band) over the first
%   number_of_frames frames that are not flagged in silent and whose power
%   exceeds 100 * abs_thresh_power(band), then divides the sum by
%   total_number_of_frames (not by the number of frames summed).
%
%   Returns a 1-by-Nb row vector.  With number_of_frames == 0 the result is
%   all zeros (the previous version left the output unassigned).

global Nb abs_thresh_power

avg_pitch_pow_dens = zeros( 1, Nb);

for band = 1: Nb
    result = 0;
    for frame = 1: number_of_frames
        if (~silent (frame))
            h = pitch_pow_dens (frame, band);
            if (h > 100 * abs_thresh_power (band))
                result = result + h;
            end
        end
    end

    % Divide once the sum is complete; this statement used to sit inside
    % the frame loop and was recomputed on every frame.
    avg_pitch_pow_dens (band) = result/ total_number_of_frames;
end
|
||||
|
||||
|
||||
|
||||
function hz_spectrum= short_term_fft (Nf, data, Whanning, start_sample)
% SHORT_TERM_FFT  Power spectrum of one windowed frame.
%   Takes Nf samples of data beginning at start_sample (1-based),
%   multiplies them element-wise by Whanning, and returns the squared
%   magnitude of the first Nf/2 FFT bins with the first bin set to 0.

last_sample = start_sample+ Nf-1;
windowed = data( start_sample: last_sample).* Whanning;

spectrum = fft( windowed);
half = spectrum( 1: Nf/ 2);

hz_spectrum = abs( half).^ 2;
hz_spectrum( 1)= 0;     % discard the DC bin
|
||||
|
||||
|
||||
function pitch_pow_dens= freq_warping( hz_spectrum, Nb, frame)
% FREQ_WARPING  Collapse a linear-frequency power spectrum into Nb bands.
%   Consecutive bins of hz_spectrum are summed, taking
%   nr_of_hz_bands_per_bark_band(b) bins for band b.  Each band sum is then
%   multiplied by pow_dens_correction_factor(b) and by the global scale Sp.
%   The frame argument is accepted but not used.

global nr_of_hz_bands_per_bark_band pow_dens_correction_factor
global Sp

next_bin = 1;       % first spectrum bin not yet consumed
bark_band = 1;
while (bark_band <= Nb)
    band_total = 0;
    for k = 1: nr_of_hz_bands_per_bark_band (bark_band)
        band_total = band_total+ hz_spectrum( next_bin);
        next_bin = next_bin+ 1;
    end

    pitch_pow_dens (bark_band) = ...
        band_total* pow_dens_correction_factor (bark_band)* Sp;
    bark_band = bark_band+ 1;
end
|
||||
|
||||
|
||||
function total_audible_pow = total_audible (frame, ...
    pitch_pow_dens, factor)
% TOTAL_AUDIBLE  Sum of the band powers of one frame that exceed a
%   threshold.  frame is a 0-based index into the rows of pitch_pow_dens.
%   A band counts when its power is greater than
%   factor * abs_thresh_power(band).  Band 1 is never included.

global Nb abs_thresh_power

row = frame+ 1;
total_audible_pow = 0;
band = 2;
while (band <= Nb)
    band_power = pitch_pow_dens (row, band);
    if (band_power > factor * abs_thresh_power (band))
        total_audible_pow = total_audible_pow+ band_power;
    end
    band = band+ 1;
end
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -0,0 +1,3 @@
|
||||
function power= pow_of( data, start_point, end_point, divisor)
% POW_OF  Sum of squares of data(start_point:end_point) divided by divisor.
%   The indices are 1-based and inclusive.  divisor is supplied by the
%   caller and need not equal the number of samples in the range.

segment = data( start_point: end_point);
energy = sum( segment.^ 2);
power = energy/ divisor;
|
@ -0,0 +1,301 @@
|
||||
function setup_global( sampling_rate)
% SETUP_GLOBAL  Initialise the global constants and lookup tables.
%
%   sampling_rate  16000 or 8000. For either value the rate-dependent
%                  globals (Downsample, InIIR_Hsos, InIIR_Nsos, Align_Nfft,
%                  Fs, Nb, Sl, Sp and the per-band tables) are set from the
%                  matching table below. For any other value only the
%                  rate-independent constants are set and the function
%                  returns without touching the rate-dependent globals.
%
%   The per-utterance bookkeeping vectors are reset to
%   zeros(1, MAXNUTTERANCES).

global Downsample InIIR_Hsos InIIR_Nsos Align_Nfft
global DATAPADDING_MSECS SEARCHBUFFER Fs MINSPEECHLGTH JOINSPEECHLGTH

global Nutterances Largest_uttsize Nsurf_samples Crude_DelayEst
global Crude_DelayConf UttSearch_Start UttSearch_End Utt_DelayEst
global Utt_Delay Utt_DelayConf Utt_Start Utt_End
global MAXNUTTERANCES WHOLE_SIGNAL
global pesq_mos subj_mos cond_nr MINUTTLENGTH
global CALIBRATE Nfmax Nb Sl Sp
global nr_of_hz_bands_per_bark_band centre_of_band_bark
global width_of_band_hz centre_of_band_hz width_of_band_bark
global pow_dens_correction_factor abs_thresh_power

% ---- rate-independent constants -----------------------------------------
CALIBRATE= 0;
Nfmax= 512;

MAXNUTTERANCES= 50;
MINUTTLENGTH= 50;
WHOLE_SIGNAL= -1;

% Fixed: the original assigned to "UttSearch_Star" (typo), which created a
% local variable and left the global UttSearch_Start un-reset.
UttSearch_Start= zeros( 1, MAXNUTTERANCES);
UttSearch_End= zeros( 1, MAXNUTTERANCES);
Utt_DelayEst= zeros( 1, MAXNUTTERANCES);
Utt_Delay= zeros( 1, MAXNUTTERANCES);
Utt_DelayConf= zeros( 1, MAXNUTTERANCES);
Utt_Start= zeros( 1, MAXNUTTERANCES);
Utt_End= zeros( 1, MAXNUTTERANCES);

DATAPADDING_MSECS= 320;
SEARCHBUFFER= 75;
MINSPEECHLGTH= 4;
JOINSPEECHLGTH= 50;

% ---- 16 kHz scalars and input filter (one row of 5 values per section) --
Sp_16k = 6.910853e-006;
Sl_16k = 1.866055e-001;
fs_16k= 16000;
Downsample_16k = 64;
Align_Nfft_16k = 1024;
InIIR_Nsos_16k = 12;
InIIR_Hsos_16k = [
    0.325631521, -0.086782860, -0.238848661, -1.079416490, 0.434583902;
    0.403961804, -0.556985881, 0.153024077, -0.415115835, 0.696590244;
    4.736162769, 3.287251046, 1.753289019, -1.859599046, 0.876284034;
    0.365373469, 0.000000000, 0.000000000, -0.634626531, 0.000000000;
    0.884811506, 0.000000000, 0.000000000, -0.256725271, 0.141536777;
    0.723593055, -1.447186099, 0.723593044, -1.129587469, 0.657232737;
    1.644910855, -1.817280902, 1.249658063, -1.778403899, 0.801724355;
    0.633692689, -0.284644314, -0.319789663, 0.000000000, 0.000000000;
    1.032763031, 0.268428979, 0.602913323, 0.000000000, 0.000000000;
    1.001616361, -0.823749013, 0.439731942, -0.885778255, 0.000000000;
    0.752472096, -0.375388990, 0.188977609, -0.077258216, 0.247230734;
    1.023700575, 0.001661628, 0.521284240, -0.183867259, 0.354324187
    ];

% ---- 8 kHz scalars and input filter --------------------------------------
Sp_8k = 2.764344e-5;
Sl_8k = 1.866055e-1;
fs_8k= 8000;
Downsample_8k = 32;
Align_Nfft_8k = 512;
InIIR_Nsos_8k = 8;
InIIR_Hsos_8k = [
    0.885535424, -0.885535424, 0.000000000, -0.771070709, 0.000000000;
    0.895092588, 1.292907193, 0.449260174, 1.268869037, 0.442025372;
    4.049527940, -7.865190042, 3.815662102, -1.746859852, 0.786305963;
    0.500002353, -0.500002353, 0.000000000, 0.000000000, 0.000000000;
    0.565002834, -0.241585934, -0.306009671, 0.259688659, 0.249979657;
    2.115237288, 0.919935084, 1.141240051, -1.587313419, 0.665935315;
    0.912224584, -0.224397719, -0.641121413, -0.246029464, -0.556720590;
    0.444617727, -0.307589321, 0.141638062, -0.996391149, 0.502251622
    ];

% ---- 8 kHz per-band tables (42 entries each) -----------------------------
nr_of_hz_bands_per_bark_band_8k = [
    1, 1, 1, 1, 1, 1, 1, 1, 2, 1, ...
    1, 1, 1, 1, 2, 1, 1, 2, 2, 2, ...
    2, 2, 2, 2, 2, 3, 3, 3, 3, 4, ...
    3, 4, 5, 4, 5, 6, 6, 7, 8, 9, ...
    9, 11
    ];

centre_of_band_bark_8k = [
    0.078672, 0.316341, 0.636559, 0.961246, 1.290450, ...
    1.624217, 1.962597, 2.305636, 2.653383, 3.005889, ...
    3.363201, 3.725371, 4.092449, 4.464486, 4.841533, ...
    5.223642, 5.610866, 6.003256, 6.400869, 6.803755, ...
    7.211971, 7.625571, 8.044611, 8.469146, 8.899232, ...
    9.334927, 9.776288, 10.223374, 10.676242, 11.134952,...
    11.599563, 12.070135, 12.546731, 13.029408, 13.518232,...
    14.013264, 14.514566, 15.022202, 15.536238, 16.056736,...
    16.583761, 17.117382
    ];

centre_of_band_hz_8k = [
    7.867213, 31.634144, 63.655895, 96.124611, 129.044968,...
    162.421738, 196.259659, 230.563568, 265.338348, 300.588867,...
    336.320129, 372.537140, 409.244934, 446.448578, 484.568604,...
    526.600586, 570.303833, 619.423340, 672.121643, 728.525696,...
    785.675964, 846.835693, 909.691650, 977.063293, 1049.861694,...
    1129.635986, 1217.257568, 1312.109497, 1412.501465, 1517.999390,...
    1628.894165, 1746.194336, 1871.568848, 2008.776123, 2158.979248,...
    2326.743164, 2513.787109, 2722.488770, 2952.586670, 3205.835449,...
    3492.679932, 3820.219238
    ];

width_of_band_bark_8k = [
    0.157344, 0.317994, 0.322441, 0.326934, 0.331474, ...
    0.336061, 0.340697, 0.345381, 0.350114, 0.354897, ...
    0.359729, 0.364611, 0.369544, 0.374529, 0.379565, ...
    0.384653, 0.389794, 0.394989, 0.400236, 0.405538, ...
    0.410894, 0.416306, 0.421773, 0.427297, 0.432877, ...
    0.438514, 0.444209, 0.449962, 0.455774, 0.461645, ...
    0.467577, 0.473569, 0.479621, 0.485736, 0.491912, ...
    0.498151, 0.504454, 0.510819, 0.517250, 0.523745, ...
    0.530308, 0.536934
    ];

width_of_band_hz_8k = [
    15.734426, 31.799433, 32.244064, 32.693359, 33.147385, ...
    33.606140, 34.069702, 34.538116, 35.011429, 35.489655, ...
    35.972870, 36.461121, 36.954407, 37.452911, 40.269653, ...
    42.311859, 45.992554, 51.348511, 55.040527, 56.775208, ...
    58.699402, 62.445862, 64.820923, 69.195374, 76.745667, ...
    84.016235, 90.825684, 97.931152, 103.348877, 107.801880, ...
    113.552246, 121.490601, 130.420410, 143.431763, 158.486816, ...
    176.872803, 198.314697, 219.549561, 240.600098, 268.702393, ...
    306.060059, 349.937012
    ];

pow_dens_correction_factor_8k = [
    100.000000, 99.999992, 100.000000, 100.000008, 100.000008,...
    100.000015, 99.999992, 99.999969, 50.000027, 100.000000,...
    99.999969, 100.000015, 99.999947, 100.000061, 53.047077, ...
    110.000046, 117.991989, 65.000000, 68.760147, 69.999931, ...
    71.428818, 75.000038, 76.843384, 80.968781, 88.646126, ...
    63.864388, 68.155350, 72.547775, 75.584831, 58.379192,...
    80.950836, 64.135651, 54.384785, 73.821884, 64.437073, ...
    59.176456, 65.521278, 61.399822, 58.144047, 57.004543,...
    64.126297, 59.248363
    ];

abs_thresh_power_8k = [
    51286152, 2454709.500, 70794.593750, ...
    4897.788574, 1174.897705, 389.045166, ...
    104.712860, 45.708820, 17.782795, ...
    9.772372, 4.897789, 3.090296, ...
    1.905461, 1.258925, 0.977237, ...
    0.724436, 0.562341, 0.457088, ...
    0.389045, 0.331131, 0.295121, ...
    0.269153, 0.257040, 0.251189, ...
    0.251189, 0.251189, 0.251189, ...
    0.263027, 0.288403, 0.309030, ...
    0.338844, 0.371535, 0.398107, ...
    0.436516, 0.467735, 0.489779, ...
    0.501187, 0.501187, 0.512861, ...
    0.524807, 0.524807, 0.524807
    ];

% ---- 16 kHz per-band tables (49 entries each) ----------------------------
nr_of_hz_bands_per_bark_band_16k = [
    1, 1, 1, 1, 1, 1, 1, 1, 2, 1, ...
    1, 1, 1, 1, 2, 1, 1, 2, 2, 2, ...
    2, 2, 2, 2, 2, 3, 3, 3, 3, 4, ...
    3, 4, 5, 4, 5, 6, 6, 7, 8, 9, ...
    9, 12, 12, 15, 16, 18, 21, 25, 20
    ];

centre_of_band_bark_16k = [
    0.078672, 0.316341, 0.636559, 0.961246, 1.290450, ...
    1.624217, 1.962597, 2.305636, 2.653383, 3.005889, ...
    3.363201, 3.725371, 4.092449, 4.464486, 4.841533, ...
    5.223642, 5.610866, 6.003256, 6.400869, 6.803755, ...
    7.211971, 7.625571, 8.044611, 8.469146, 8.899232, ...
    9.334927, 9.776288, 10.223374, 10.676242, 11.134952, ...
    11.599563, 12.070135, 12.546731, 13.029408, 13.518232, ...
    14.013264, 14.514566, 15.022202, 15.536238, 16.056736, ...
    16.583761, 17.117382, 17.657663, 18.204674, 18.758478, ...
    19.319147, 19.886751, 20.461355, 21.043034
    ];

centre_of_band_hz_16k = [
    7.867213, 31.634144, 63.655895, 96.124611, 129.044968,...
    162.421738, 196.259659, 230.563568, 265.338348, 300.588867,...
    336.320129, 372.537140, 409.244934, 446.448578, 484.568604,...
    526.600586, 570.303833, 619.423340, 672.121643, 728.525696,...
    785.675964, 846.835693, 909.691650, 977.063293, 1049.861694,...
    1129.635986, 1217.257568, 1312.109497, 1412.501465, 1517.999390,...
    1628.894165, 1746.194336, 1871.568848, 2008.776123, 2158.979248,...
    2326.743164, 2513.787109, 2722.488770, 2952.586670, 3205.835449,...
    3492.679932, 3820.219238, 4193.938477, 4619.846191, 5100.437012,...
    5636.199219, 6234.313477, 6946.734863, 7796.473633
    ];

width_of_band_bark_16k = [
    0.157344, 0.317994, 0.322441, 0.326934, 0.331474,...
    0.336061, 0.340697, 0.345381, 0.350114, 0.354897,...
    0.359729, 0.364611, 0.369544, 0.374529, 0.379565,...
    0.384653, 0.389794, 0.394989, 0.400236, 0.405538,...
    0.410894, 0.416306, 0.421773, 0.427297, 0.432877,...
    0.438514, 0.444209, 0.449962, 0.455774, 0.461645,...
    0.467577, 0.473569, 0.479621, 0.485736, 0.491912,...
    0.498151, 0.504454, 0.510819, 0.517250, 0.523745,...
    0.530308, 0.536934, 0.543629, 0.550390, 0.557220,...
    0.564119, 0.571085, 0.578125, 0.585232
    ];

width_of_band_hz_16k = [
    15.734426, 31.799433, 32.244064, 32.693359, ...
    33.147385, 33.606140, 34.069702, 34.538116, ...
    35.011429, 35.489655, 35.972870, 36.461121, ...
    36.954407, 37.452911, 40.269653, 42.311859, ...
    45.992554, 51.348511, 55.040527, 56.775208, ...
    58.699402, 62.445862, 64.820923, 69.195374, ...
    76.745667, 84.016235, 90.825684, 97.931152, ...
    103.348877, 107.801880, 113.552246, 121.490601, ...
    130.420410, 143.431763, 158.486816, 176.872803, ...
    198.314697, 219.549561, 240.600098, 268.702393, ...
    306.060059, 349.937012, 398.686279, 454.713867, ...
    506.841797, 564.863770, 637.261230, 794.717285, ...
    931.068359
    ];

pow_dens_correction_factor_16k = [
    100.000000, 99.999992, 100.000000, 100.000008,...
    100.000008, 100.000015, 99.999992, 99.999969, ...
    50.000027, 100.000000, 99.999969, 100.000015, ...
    99.999947, 100.000061, 53.047077, 110.000046, ...
    117.991989, 65.000000, 68.760147, 69.999931, ...
    71.428818, 75.000038, 76.843384, 80.968781, ...
    88.646126, 63.864388, 68.155350, 72.547775, ...
    75.584831, 58.379192, 80.950836, 64.135651, ...
    54.384785, 73.821884, 64.437073, 59.176456, ...
    65.521278, 61.399822, 58.144047, 57.004543, ...
    64.126297, 54.311001, 61.114979, 55.077751, ...
    56.849335, 55.628868, 53.137054, 54.985844, ...
    79.546974
    ];

abs_thresh_power_16k = [
    51286152.00, 2454709.500, 70794.593750, ...
    4897.788574, 1174.897705, 389.045166, ...
    104.712860, 45.708820, 17.782795, ...
    9.772372, 4.897789, 3.090296, ...
    1.905461, 1.258925, 0.977237, ...
    0.724436, 0.562341, 0.457088, ...
    0.389045, 0.331131, 0.295121, ...
    0.269153, 0.257040, 0.251189, ...
    0.251189, 0.251189, 0.251189, ...
    0.263027, 0.288403, 0.309030, ...
    0.338844, 0.371535, 0.398107, ...
    0.436516, 0.467735, 0.489779, ...
    0.501187, 0.501187, 0.512861, ...
    0.524807, 0.524807, 0.524807, ...
    0.512861, 0.478630, 0.426580, ...
    0.371535, 0.363078, 0.416869, ...
    0.537032
    ];

% ---- select the tables for the requested sampling rate -------------------
if (sampling_rate== fs_16k)
    Downsample = Downsample_16k;
    InIIR_Hsos = InIIR_Hsos_16k;
    InIIR_Nsos = InIIR_Nsos_16k;
    Align_Nfft = Align_Nfft_16k;
    Fs= fs_16k;

    Nb = 49;
    Sl = Sl_16k;
    Sp = Sp_16k;
    nr_of_hz_bands_per_bark_band = nr_of_hz_bands_per_bark_band_16k;
    centre_of_band_bark = centre_of_band_bark_16k;
    centre_of_band_hz = centre_of_band_hz_16k;
    width_of_band_bark = width_of_band_bark_16k;
    width_of_band_hz = width_of_band_hz_16k;
    pow_dens_correction_factor = pow_dens_correction_factor_16k;
    abs_thresh_power = abs_thresh_power_16k;

    return;
end

if (sampling_rate== fs_8k)
    Downsample = Downsample_8k;
    InIIR_Hsos = InIIR_Hsos_8k;
    InIIR_Nsos = InIIR_Nsos_8k;
    Align_Nfft = Align_Nfft_8k;
    Fs= fs_8k;

    Nb = 42;
    Sl = Sl_8k;
    Sp = Sp_8k;
    nr_of_hz_bands_per_bark_band = nr_of_hz_bands_per_bark_band_8k;
    centre_of_band_bark = centre_of_band_bark_8k;
    centre_of_band_hz = centre_of_band_hz_8k;
    width_of_band_bark = width_of_band_bark_8k;
    width_of_band_hz = width_of_band_hz_8k;
    pow_dens_correction_factor = pow_dens_correction_factor_8k;
    abs_thresh_power = abs_thresh_power_8k;
    return;
end
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -0,0 +1,390 @@
|
||||
function split_align( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ...
    deg_data, deg_Nsamples, deg_VAD, deg_logVAD, ...
    Utt_Start_l, Utt_SpeechStart, Utt_SpeechEnd, Utt_End_l, ...
    Utt_DelayEst_l, Utt_DelayConf_l)
% SPLIT_ALIGN  Search for the best point at which to split one utterance.
%
%   A list of candidate break points is laid out between
%   Utt_SpeechStart + Pad and Utt_SpeechEnd - Pad. For every candidate the
%   part before the break point and the part after it are aligned
%   separately: first coarsely through crude_align, then by accumulating
%   windowed cross-correlation peaks into a delay histogram. A candidate is
%   accepted when the two delays differ by at least Downsample samples,
%   both confidences exceed Utt_DelayConf_l, and the summed confidence
%   beats the best candidate so far.
%
%   Results are returned through globals. Best_DC1 and Best_DC2 are always
%   reset to 0; Best_ED1, Best_D1, Best_ED2, Best_D2 and Best_BP are only
%   written when a candidate is accepted.
%
%   Side effect: slot MAXNUTTERANCES of Utt_DelayEst, UttSearch_Start and
%   UttSearch_End is overwritten as scratch space for the crude_align calls.
%
%   ref_VAD and deg_VAD are accepted for interface compatibility and are
%   not used here.

global MAXNUTTERANCES Align_Nfft Downsample Window
global Utt_DelayEst Utt_Delay UttSearch_Start UttSearch_End
global Best_ED1 Best_D1 Best_DC1 Best_ED2 Best_D2 Best_DC2 Best_BP

Utt_BPs= zeros( 1, 41);
Utt_ED1= zeros( 1, 41);
Utt_ED2= zeros( 1, 41);
Utt_D1= zeros( 1, 41);
Utt_D2= zeros( 1, 41);
Utt_DC1= zeros( 1, 41);
Utt_DC2= zeros( 1, 41);

Utt_Len = Utt_SpeechEnd - Utt_SpeechStart;
Utt_Test = MAXNUTTERANCES;          % scratch utterance slot
Best_DC1 = 0.0;
Best_DC2 = 0.0;
Delta = Align_Nfft / (4 * Downsample);
Step = floor( ((0.801 * Utt_Len + 40 * Delta - 1)/(40 * Delta)));
Step = Step* Delta;
hop = Align_Nfft / 4;               % window advance in samples

Pad = floor( Utt_Len / 10);
if( Pad < 75 )
    Pad = 75;
end

% ---- lay out the candidate break points ---------------------------------
Utt_BPs(1) = Utt_SpeechStart + Pad;
N_BPs = 1;
while( 1)
    N_BPs= N_BPs+ 1;
    Utt_BPs(N_BPs)= Utt_BPs(N_BPs- 1)+ Step;
    if (~((Utt_BPs(N_BPs) <= (Utt_SpeechEnd- Pad)) && (N_BPs <= 40) ))
        break;
    end
end

if( N_BPs <= 1 )
    return;
end

% ---- coarse delay estimate for both halves of every candidate -----------
for bp = 1: N_BPs- 1
    Utt_DelayEst(Utt_Test) = Utt_DelayEst_l;
    UttSearch_Start(Utt_Test) = Utt_Start_l;
    UttSearch_End(Utt_Test) = Utt_BPs(bp);

    crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, ...
        deg_Nsamples, MAXNUTTERANCES);
    Utt_ED1(bp) = Utt_Delay(Utt_Test);

    Utt_DelayEst(Utt_Test) = Utt_DelayEst_l;
    UttSearch_Start(Utt_Test) = Utt_BPs(bp);
    UttSearch_End(Utt_Test) = Utt_End_l;

    crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, ...
        deg_Nsamples, MAXNUTTERANCES);
    Utt_ED2(bp) = Utt_Delay(Utt_Test);
end

% ---- first halves: scan forward from the utterance start ----------------
% -2.0 marks "not yet computed".
Utt_DC1(1: N_BPs-1) = -2.0;
while( 1 )
    bp = 1;
    while( (bp <= N_BPs- 1) && (Utt_DC1(bp) > -2.0) )
        bp = bp+ 1;
    end
    if( bp >= N_BPs )
        break;
    end

    estdelay = Utt_ED1(bp);
    H = zeros( 1, Align_Nfft);
    Hsum = 0.0;

    startr = (Utt_Start_l- 1) * Downsample+ 1;
    startd = startr + estdelay;

    % Fixed: indices are 1-based, so startd == 0 is already out of range.
    % The original tested "startd < 0" and then indexed deg_data(0).
    if ( startd < 1 )
        startr = -estdelay+ 1;
        startd = 1;
    end

    while( ((startd + Align_Nfft) <= 1+ deg_Nsamples) &&...
            ((startr + Align_Nfft) <= (1+ (Utt_BPs(bp)- 1) * Downsample)) )
        [H, Hsum] = split_align_accumulate( H, Hsum, ...
            ref_data, deg_data, startr, startd);
        startr = startr+ hop;
        startd = startd+ hop;
    end

    [Utt_D1(bp), Utt_DC1(bp)] = split_align_peak( H, Hsum, estdelay);

    % Later candidates with the same coarse delay reuse the histogram and
    % simply extend the scan up to their own break point.
    while( bp < (N_BPs - 1) )
        bp = bp + 1;

        if( (Utt_ED1(bp) == estdelay) && (Utt_DC1(bp) <= -2.0) )
            while(((startd+ Align_Nfft)<= 1+ deg_Nsamples) && ...
                    ((startr+ Align_Nfft)<= ...
                    ((Utt_BPs(bp)- 1)* Downsample+ 1) ))
                [H, Hsum] = split_align_accumulate( H, Hsum, ...
                    ref_data, deg_data, startr, startd);
                startr = startr+ hop;
                startd = startd+ hop;
            end

            [Utt_D1(bp), Utt_DC1(bp)] = ...
                split_align_peak( H, Hsum, estdelay);
        end
    end
end

% ---- second halves: only where the first half was confident enough ------
for bp= 1: N_BPs- 1
    if( Utt_DC1(bp) > Utt_DelayConf_l )
        Utt_DC2(bp) = -2.0;     % to be computed
    else
        Utt_DC2(bp) = 0.0;      % skipped
    end
end

% Scan backward from the utterance end.
while( 1 )
    bp = N_BPs- 1;
    while( (bp >= 1) && (Utt_DC2(bp) > -2.0) )
        bp = bp- 1;
    end
    if( bp < 1 )
        break;
    end

    estdelay = Utt_ED2(bp);
    H = zeros( 1, Align_Nfft);
    Hsum = 0.0;

    startr = (Utt_End_l- 1)* Downsample+ 1- Align_Nfft;
    startd = startr + estdelay;

    if ( (startd + Align_Nfft) > deg_Nsamples+ 1 )
        startd = deg_Nsamples - Align_Nfft+ 1;
        startr = startd - estdelay;
    end

    while( (startd>= 1) && (startr>= (Utt_BPs(bp)- 1)* Downsample+ 1) )
        [H, Hsum] = split_align_accumulate( H, Hsum, ...
            ref_data, deg_data, startr, startd);
        startr = startr- hop;
        startd = startd- hop;
    end

    [Utt_D2(bp), Utt_DC2(bp)] = split_align_peak( H, Hsum, estdelay);

    while( bp > 1 )
        bp = bp - 1;
        if( (Utt_ED2(bp) == estdelay) && (Utt_DC2(bp) <= -2.0) )
            while( (startd >= 1) && ...
                    (startr >= (Utt_BPs(bp)- 1) * Downsample+ 1))
                [H, Hsum] = split_align_accumulate( H, Hsum, ...
                    ref_data, deg_data, startr, startd);
                startr = startr- hop;
                startd = startd- hop;
            end

            [Utt_D2(bp), Utt_DC2(bp)] = ...
                split_align_peak( H, Hsum, estdelay);
        end
    end
end

% ---- keep the best acceptable candidate ----------------------------------
for bp = 1: N_BPs- 1
    if( (abs(Utt_D2(bp) - Utt_D1(bp)) >= Downsample) && ...
            ((Utt_DC1(bp)+ Utt_DC2(bp))> (Best_DC1 + Best_DC2)) &&...
            (Utt_DC1(bp) > Utt_DelayConf_l) && ...
            (Utt_DC2(bp) > Utt_DelayConf_l) )
        Best_ED1 = Utt_ED1(bp);
        Best_D1 = Utt_D1(bp);
        Best_DC1 = Utt_DC1(bp);
        Best_ED2 = Utt_ED2(bp);
        Best_D2 = Utt_D2(bp);
        Best_DC2 = Utt_DC2(bp);
        Best_BP = Utt_BPs(bp);
    end
end


function [H, Hsum] = split_align_accumulate( H, Hsum, ...
    ref_data, deg_data, startr, startd)
% Add the cross-correlation peaks of one window pair to the histogram H.
% Every lag whose magnitude exceeds 99% of the window maximum adds a
% triangular kernel (half-width Align_Nfft/64, circular) centred on it.

global Align_Nfft Window

kernel = Align_Nfft / 64;

X1= ref_data( startr: startr+ Align_Nfft- 1).* Window;
X2= deg_data( startd: startd+ Align_Nfft- 1).* Window;

X1_fft_conj= conj( fft( X1, Align_Nfft));
X2_fft= fft( X2, Align_Nfft);
xcorr_mag= abs( ifft( X1_fft_conj.* X2_fft, Align_Nfft));

v_max = max( xcorr_mag)* 0.99;
n_max = (v_max^ 0.125)/ kernel;

taps = 1-kernel: kernel- 1;
weights = n_max* (kernel- abs( taps));

peaks = find( xcorr_mag > v_max);
for p = 1: numel( peaks)
    count = peaks( p)- 1;               % zero-based lag
    Hsum = Hsum+ n_max * kernel;
    % The 2*kernel-1 wrapped indices are distinct, so one vectorised
    % update equals the original element-by-element loop.
    idx = 1+ rem( count+ taps+ Align_Nfft, Align_Nfft);
    H( idx) = H( idx)+ weights;
end


function [delay, conf] = split_align_peak( H, Hsum, estdelay)
% Turn the histogram into a delay (estdelay plus the signed peak lag) and
% a confidence (peak height over Hsum, or 0 when nothing was accumulated).

global Align_Nfft

[v_max, I_max] = max( H);
if( I_max- 1 >= (Align_Nfft/2) )
    I_max = I_max- Align_Nfft;          % upper half wraps to negative lags
end

delay = estdelay + I_max- 1;
if( Hsum > 0.0 )
    conf = v_max / Hsum;
else
    conf = 0.0;
end
|
||||
|
||||
% if (Utt_Len== 236)
|
||||
% fid= fopen( 'matmat.txt', 'wt');
|
||||
% fprintf( fid, 'N_BPs is %d\n', N_BPs);
|
||||
% fprintf( fid, 'Utt_DelayConf is %f\n', Utt_DelayConf_l);
|
||||
% fprintf( fid, 'ED2\t ED1\t D2\t D1\t DC2\t DC1\t BPs\n');
|
||||
% for bp= 1: N_BPs- 1
|
||||
% fprintf( fid, '%d\t %d\t %d\t %d\t %f\t %f\t %d\n', Utt_ED2( bp), ...
|
||||
% Utt_ED1( bp), Utt_D2(bp), Utt_D1(bp), Utt_DC2(bp),...
|
||||
% Utt_DC1( bp), Utt_BPs( bp));
|
||||
% end
|
||||
% fclose( fid);
|
||||
% end
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -0,0 +1,76 @@
|
||||
function time_align(ref_data, ref_Nsamples, ...
    deg_data, deg_Nsamples, Utt_id)
% TIME_ALIGN  Refine the delay estimate of one utterance.
%
%   Starting from Utt_DelayEst(Utt_id), windows of Align_Nfft samples
%   (advancing by Align_Nfft/4) are cross-correlated between ref_data and
%   deg_data over the search range of the utterance. Lags whose magnitude
%   exceeds 99% of the window maximum are accumulated in a histogram, which
%   is then smoothed with a circular triangular kernel of half-width
%   Align_Nfft/64.
%
%   Writes the globals
%     Utt_Delay(Utt_id)      estdelay plus the signed lag of the peak
%     Utt_DelayConf(Utt_id)  height of the normalised peak (0 when no
%                            window contributed)
%
%   ref_Nsamples is accepted for interface compatibility and is not used.
%
%   NOTE(review): the loop bounds here omit the "+1" that split_align uses
%   for the same kind of test, so the last possible window is not scanned.
%   Left unchanged; confirm against the reference implementation.

global Utt_DelayEst Utt_Delay Utt_DelayConf UttSearch_Start UttSearch_End
global Align_Nfft Downsample Window

estdelay = Utt_DelayEst(Utt_id);

H = zeros( 1, Align_Nfft);
hop = Align_Nfft/ 4;

startr = (UttSearch_Start(Utt_id)- 1)* Downsample+ 1;
startd = startr + estdelay;

% Fixed: indices are 1-based, so startd == 0 is already out of range.
% The original tested "startd < 0" and then indexed deg_data(0).
if ( startd < 1 )
    startr = 1 -estdelay;
    startd = 1;
end

while( ((startd + Align_Nfft) <= deg_Nsamples) && ...
        ((startr + Align_Nfft) <= ((UttSearch_End(Utt_id)- 1) * Downsample)) )
    X1= ref_data( startr: startr+ Align_Nfft- 1).* Window;
    X2= deg_data( startd: startd+ Align_Nfft- 1).* Window;

    % circular cross-correlation of the two windows
    X1_fft_conj= conj( fft( X1, Align_Nfft ));
    X2_fft= fft( X2, Align_Nfft );
    xcorr_mag= abs( ifft( X1_fft_conj.* X2_fft, Align_Nfft ));

    v_max = max( xcorr_mag)* 0.99;

    above = find( xcorr_mag > v_max );
    H( above )= H( above )+ v_max^ 0.125;

    startr = startr+ hop;
    startd = startd+ hop;
end

Hsum = sum( H);

% Triangular smoothing kernel, symmetric around lag 0 (circular layout).
kernel = Align_Nfft / 64;
tri = zeros( 1, Align_Nfft);
tri(1) = 1.0;
for count= 2: kernel
    tri( count)= 1- (count- 1)/ kernel;
    tri( Align_Nfft- count+ 2)= 1- (count- 1)/ kernel;
end

% circular convolution of the histogram with the kernel
smoothed= ifft( fft( H, Align_Nfft ).* fft( tri, Align_Nfft ), Align_Nfft );

if (Hsum> 0)
    H= abs( smoothed)/ Hsum;
else
    H= 0;
end

[v_max, I_max] = max( H);
if( I_max- 1 >= (Align_Nfft/2) )
    I_max = I_max- Align_Nfft;      % upper half wraps to negative lags
end

Utt_Delay(Utt_id) = estdelay + I_max- 1;
Utt_DelayConf(Utt_id) = v_max; % confidence
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -0,0 +1,26 @@
|
||||
function utterance_locate (ref_data, ref_Nsamples, ref_VAD, ref_logVAD,...
    deg_data, deg_Nsamples, deg_VAD, deg_logVAD)
% UTTERANCE_LOCATE  Run the utterance location steps in order.
%
%   1. id_searchwindows on the two VAD signals
%   2. for each of the Nutterances search windows: crude_align on the
%      log-VAD signals, then time_align on the sample data
%   3. id_utterances
%   4. utterance_split
%
%   All results are exchanged with those routines through globals.

global Nutterances Utt_Delay Utt_DelayConf Utt_Start Utt_End Utt_DelayEst

id_searchwindows( ref_VAD, ref_Nsamples, deg_VAD, deg_Nsamples);

% Nutterances is re-read here: id_searchwindows is expected to have set it.
Utt_id = 1;
while Utt_id <= Nutterances
    crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, deg_Nsamples, Utt_id);
    time_align( ref_data, ref_Nsamples, deg_data, deg_Nsamples, Utt_id);
    Utt_id = Utt_id + 1;
end

id_utterances( ref_Nsamples, ref_VAD, deg_Nsamples);

utterance_split( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ...
    deg_data, deg_Nsamples, deg_VAD, deg_logVAD);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -0,0 +1,122 @@
|
||||
function utterance_split( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ...
    deg_data, deg_Nsamples, deg_VAD, deg_logVAD)
% UTTERANCE_SPLIT  Split utterances whose two halves align better apart.
%
%   Each utterance whose speech part (the span of ref_VAD > 0 inside the
%   utterance) is at least 200 frames long is handed to split_align. When
%   both halves come back with a confidence above the confidence of the
%   whole utterance, the utterance is replaced by two entries and the first
%   of them is examined again. Otherwise the next utterance is taken.
%
%   Reads and updates the global utterance tables (Utt_*, UttSearch_*,
%   Nutterances) and finally sets Largest_uttsize.

global Nutterances MAXNUTTERANCES Downsample SEARCHBUFFER
global Utt_DelayEst Utt_Delay Utt_DelayConf UttSearch_Start
global Utt_Start Utt_End Largest_uttsize UttSearch_End
global Best_ED1 Best_D1 Best_DC1 Best_ED2 Best_D2 Best_DC2 Best_BP

Utt_id = 1;
% Fixed: strict "<". split_align uses slot MAXNUTTERANCES as scratch, so
% that slot must not hold a real utterance while splitting goes on. The
% original "<=" allowed a split with Nutterances == MAXNUTTERANCES.
while( (Utt_id <= Nutterances) && (Nutterances < MAXNUTTERANCES) )
    Utt_DelayEst_l = Utt_DelayEst(Utt_id);
    Utt_Delay_l = Utt_Delay(Utt_id);
    Utt_DelayConf_l = Utt_DelayConf(Utt_id);
    Utt_Start_l = Utt_Start(Utt_id);
    Utt_End_l = Utt_End(Utt_id);

    % first frame of the utterance with ref_VAD > 0
    Utt_SpeechStart = Utt_Start_l;
    while( (Utt_SpeechStart < Utt_End_l) && ...
            (ref_VAD(Utt_SpeechStart)<= 0.0) )
        Utt_SpeechStart = Utt_SpeechStart + 1;
    end

    % one past the last frame of the utterance with ref_VAD > 0
    Utt_SpeechEnd = Utt_End_l;
    while( (Utt_SpeechEnd > Utt_Start_l) && ...
            (ref_VAD(Utt_SpeechEnd) <= 0))
        Utt_SpeechEnd = Utt_SpeechEnd- 1;
    end
    Utt_SpeechEnd = Utt_SpeechEnd+ 1;

    Utt_Len = Utt_SpeechEnd - Utt_SpeechStart;

    if( Utt_Len >= 200 )
        split_align( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ...
            deg_data, deg_Nsamples, deg_VAD, deg_logVAD, ...
            Utt_Start_l, Utt_SpeechStart, Utt_SpeechEnd, Utt_End_l, ...
            Utt_DelayEst_l, Utt_DelayConf_l);

        if( (Best_DC1 > Utt_DelayConf_l) && (Best_DC2 > Utt_DelayConf_l) )
            % make room: shift every later utterance up by one slot
            for step = Nutterances: -1: Utt_id+ 1
                Utt_DelayEst(step+ 1) = Utt_DelayEst(step);
                Utt_Delay(step+ 1) = Utt_Delay(step);
                Utt_DelayConf(step+ 1) = Utt_DelayConf(step);
                Utt_Start(step+ 1) = Utt_Start(step);
                Utt_End(step+ 1) = Utt_End(step);
                % NOTE(review): the search window is filled from
                % Utt_Start/Utt_End, not from UttSearch_*. Kept as in the
                % original; confirm against the reference implementation.
                UttSearch_Start(step+ 1) = Utt_Start( step);
                UttSearch_End(step+ 1) = Utt_End( step);
            end

            Nutterances = Nutterances+ 1;

            Utt_DelayEst(Utt_id) = Best_ED1;
            Utt_Delay(Utt_id) = Best_D1;
            Utt_DelayConf(Utt_id) = Best_DC1;

            Utt_DelayEst(Utt_id +1) = Best_ED2;
            Utt_Delay(Utt_id +1) = Best_D2;
            Utt_DelayConf(Utt_id +1) = Best_DC2;

            UttSearch_Start(Utt_id +1) = UttSearch_Start(Utt_id);
            UttSearch_End(Utt_id +1) = UttSearch_End( Utt_id);

            if( Best_D2 < Best_D1 )
                Utt_Start(Utt_id) = Utt_Start_l;
                Utt_End(Utt_id) = Best_BP;
                Utt_Start(Utt_id +1) = Best_BP;
                Utt_End(Utt_id +1) = Utt_End_l;
            else
                % overlap the two halves by half the delay difference
                Utt_Start( Utt_id) = Utt_Start_l;
                Utt_End( Utt_id) = Best_BP + ...
                    floor( (Best_D2- Best_D1)/ (2 * Downsample));
                Utt_Start( Utt_id +1) = Best_BP - ...
                    floor( (Best_D2- Best_D1)/ (2 * Downsample));
                Utt_End( Utt_id +1) = Utt_End_l;
            end

            % Fixed: the left-hand side is a 1-based sample index, so
            % anything below 1 is out of range. The original tested "< 0"
            % and let index 0 through unclipped.
            if( (Utt_Start(Utt_id)- SEARCHBUFFER- 1)* Downsample+ 1+ ...
                    Best_D1 < 1 )
                Utt_Start(Utt_id) = SEARCHBUFFER+ 1+ ...
                    floor( (Downsample - 1 - Best_D1) / Downsample);
            end

            if( ((Utt_End( Utt_id +1)- 1)* Downsample+ 1 + Best_D2) >...
                    (deg_Nsamples - SEARCHBUFFER * Downsample) )
                Utt_End( Utt_id +1) = floor( (deg_Nsamples - Best_D2)...
                    / Downsample)- SEARCHBUFFER+ 1;
            end
            % Utt_id is not advanced: the new first half is examined again.
        else
            Utt_id= Utt_id+ 1;
        end
    else
        Utt_id = Utt_id+ 1;
    end
end

Largest_uttsize = max( Utt_End- Utt_Start);
|
||||
|
||||
% fid= fopen( 'uttinfo_mat.txt', 'wt');
|
||||
% fprintf( fid, 'Number of Utterances is:\n');
|
||||
% fprintf( fid, '%d\n', Nutterances);
|
||||
% fprintf( fid, 'Utterance Delay Estimation:\n');
|
||||
% fprintf( fid, '%d\n', Utt_DelayEst( 1: Nutterances) );
|
||||
% fprintf( fid, 'Utterance Delay:\n');
|
||||
% fprintf( fid, '%d\n', Utt_Delay( 1: Nutterances));
|
||||
% fprintf( fid, 'Utterance Delay Confidence:\n');
|
||||
% fprintf( fid, '%f\n', Utt_DelayConf( 1: Nutterances));
|
||||
% fprintf( fid, 'Utterance Start:\n');
|
||||
% fprintf( fid, '%d\n', Utt_Start( 1: Nutterances));
|
||||
% fprintf( fid, 'Utterance End:\n');
|
||||
% fprintf( fid, '%d\n', Utt_End( 1: Nutterances));
|
||||
% fprintf( fid, 'Largest utterance length:\n');
|
||||
% fprintf( fid, '%d\n', Largest_uttsize);
|
||||
% fclose( fid);
|
||||
|
||||
|
||||
|
17103
6th-Semester-Spring-2024/DSP/Labs/FinalProject/parchami_2016.pdf
Normal file
17103
6th-Semester-Spring-2024/DSP/Labs/FinalProject/parchami_2016.pdf
Normal file
File diff suppressed because one or more lines are too long
41
6th-Semester-Spring-2024/DSP/Labs/FinalProject/play_sound.py
Normal file
41
6th-Semester-Spring-2024/DSP/Labs/FinalProject/play_sound.py
Normal file
@ -0,0 +1,41 @@
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import scipy.signal
|
||||
import sounddevice as sd
|
||||
|
||||
|
||||
|
||||
SOUND_PATH = "noisefiles/train.dat"
|
||||
|
||||
|
||||
def normalize_signal(signal):
    """Linearly rescale a signal so that it spans the range [-1, 1].

    The smallest sample is mapped to -1 and the largest to +1.

    Args:
        signal: array-like of real-valued samples. Integer input (for
            example int16 data as returned by a WAV reader) is converted to
            float64 first, so that the shift cannot overflow and the
            in-place scaling below is valid.

    Returns:
        A new floating-point numpy array; the input is never modified.
        An empty input yields an empty array, and a constant signal (zero
        dynamic range) yields all zeros instead of dividing by zero.
    """
    samples = np.asarray(signal)
    if not np.issubdtype(samples.dtype, np.floating):
        # In-place float scaling of an integer array raises in numpy, and
        # int16 subtraction can wrap around, so promote before any maths.
        samples = samples.astype(np.float64)
    if samples.size == 0:
        return samples.copy()

    normalized_signal = samples - np.min(samples)  # new array, range [0, span]
    span = np.max(normalized_signal)
    if span == 0:
        return np.zeros_like(normalized_signal)

    normalized_signal *= 2 / span  # range [0, 2]
    normalized_signal -= 1         # range [-1, 1]
    return normalized_signal
|
||||
|
||||
|
||||
def load_audiofile(path):
    """Load an audio signal from a ``.dat`` text file or a ``.wav`` file.

    Args:
        path: path of the file to read. The extension (case-insensitive)
            selects the reader.

    Returns:
        A ``(sample_rate, sound_data)`` tuple. For ``.dat`` files the data is
        a float64 numpy array holding one sample per non-blank line, and the
        sample rate is the fixed default of 8000. For ``.wav`` files both
        values are whatever ``scipy.io.wavfile.read`` returns.

    Raises:
        ValueError: if the extension is neither ``.dat`` nor ``.wav``, or if
            a line of a ``.dat`` file is not a number.
    """
    default_sample_rate = 8000
    lowered = path.lower()

    if lowered.endswith(".dat"):
        # Read the file that was asked for (not the module-level default) and
        # parse each line as a number rather than evaluating it as code.
        with open(path, "r") as sound_file:
            samples = [float(line) for line in sound_file if line.strip()]
        return default_sample_rate, np.array(samples, dtype=np.float64)

    if lowered.endswith(".wav"):
        # Imported here because the module's import block does not provide it.
        from scipy.io import wavfile

        sample_rate, sound_data = wavfile.read(path)
        return sample_rate, sound_data

    raise ValueError("Unsupported audio file type (expected .dat or .wav): %r" % (path,))
|
||||
|
||||
|
||||
def main():
    """Load the file named by SOUND_PATH, print its sample rate and play it.

    Playback blocks until the sound has finished.
    """
    rate, samples = load_audiofile(SOUND_PATH)
    print(rate)
    playable = normalize_signal(samples)
    sd.play(playable, samplerate=rate, blocking=True)


if __name__ == "__main__":
    main()
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,176 @@
|
||||
function audnoise(ns_file,outfile)
%AUDNOISE  Audible-noise suppression speech enhancement [1].
%
%  Usage:  audnoise(noisyFile, outputFile)
%
%         ns_file  - noisy speech file in .wav format (only the first
%                    channel is processed)
%         outfile  - enhanced output file in .wav format, written with the
%                    sample rate and bits per sample of the input file
%
%  The masking threshold is refined with iter_num iterations (2 by
%  default); change the variable iter_num below to use a different number.
%  The first NF_SABSENT frames of the input are assumed to contain no
%  speech and are used for the initial noise power spectrum estimate.
%
%  Example call:  audnoise('sp04_babble_sn10.wav','out_aud.wav');
%
%  References:
%   [1] Tsoukalas, D. E., Mourjopoulos, J. N., and Kokkinakis, G. (1997). Speech
%       enhancement based on audible noise suppression. IEEE Trans. on Speech and
%       Audio Processing, 5(6), 497-514.
%
% Authors: Yi Hu and Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
%-------------------------------------------------------------------------

if nargin<2
    fprintf('Usage: audnoise(noisyfile.wav,outFile.wav) \n\n');
    return;
end

iter_num   = 2;    % number of masking-threshold iterations
NF_SABSENT = 6;    % number of leading frames assumed to be speech absent

% audioread/audiowrite are used for consistency with addnoisex.m.
[nsdata, Fs] = audioread( ns_file);
file_info    = audioinfo( ns_file);
bits         = file_info.BitsPerSample;
nsdata       = nsdata( :, 1);          % work on a single column vector

aa  = 0.98;        % weight of the previous-frame term in the a priori SNR
mu  = 0.98;        % smoothing factor of the noise power spectrum update
eta = 0.15;        % VAD decision threshold

nwind = floor( 20* Fs/ 1000);          % 20 ms analysis window
if rem( nwind, 2)~= 0
    nwind = nwind+ 1;                  % make the window length even
end
noverlap = nwind/ 2;
w        = hamming( nwind);

% ----- initial noise power spectrum from the first NF_SABSENT frames -----
% Consecutive, non-overlapping frames are laid out one per column.
noisematrixdata = reshape( nsdata( 1: nwind* NF_SABSENT), nwind, NF_SABSENT);
noisematrixdata = noisematrixdata.* w( :, ones( 1, NF_SABSENT));  % windowing
noise_ps = mean( (abs( fft( noisematrixdata))).^ 2, 2);           % column vector

% ----- critical-band layout; noise_b is refreshed for every frame below -----
noise_b = zeros( nwind/ 2+ 1, 1);
CB_FREQ_INDICES = find_CB_FREQ_INDICES( Fs, nwind, 16, nwind/ 2);

% ----- framing -----
nslide   = nwind- noverlap;
x        = nsdata;
nx       = length( x);
ncol     = fix( ( nx- noverlap)/ nslide);
colindex = 1+ ( 0: ( ncol- 1))* nslide;
last_sample = nwind+ colindex( ncol)- 1;
if nx< last_sample
    % pad with very low-level random values if the last frame runs past the data
    x( nx+ 1: last_sample) = rand( last_sample- nx, 1)* ( 2^ ( -15));
end

% es_old holds the second half of the previous enhanced frame (overlap-add).
es_old  = zeros( noverlap, 1);
% Column vector, so that audiowrite sees one channel of samples.
es_data = zeros( last_sample, 1);

for k = 1: ncol

    frame_range = colindex( k): colindex( k)+ nwind- 1;
    y = x( frame_range).* w;                  % windowed noisy speech frame

    y_spec = fft( y);
    y_ps   = abs( y_spec).^ 2;                % power spectrum of noisy speech
    y_ps1  = y_ps( 1: nwind/ 2+ 1);

    % ==== start of vad ====
    gammak = min( y_ps./ noise_ps, 40);       % a posteriori SNR
    if k== 1
        ksi = aa+ ( 1- aa)* max( gammak- 1, 0);
    else
        ksi = aa* Xk_prev./ noise_ps+ ( 1- aa)* max( gammak- 1, 0);  % a priori SNR
    end

    log_sigma_k  = gammak.* ksi./ ( 1+ ksi)- log( 1+ ksi);
    vad_decision = sum( log_sigma_k)/ nwind;
    if ( vad_decision< eta)
        % noise-only frame found: update the noise power spectrum
        noise_ps = mu* noise_ps+ ( 1- mu)* y_ps;
    end

    % average the noise power inside each critical band
    for i = 1: length( CB_FREQ_INDICES)
        band_bins = CB_FREQ_INDICES{ i};
        noise_b( band_bins) = mean( noise_ps( band_bins));
    end
    % ==== end of vad ====

    % conservative estimate of the clean power spectrum (power spectral subtraction)
    x_cons1 = max( y_ps- noise_ps, 0.001);
    x_cons  = x_cons1( 1: nwind/ 2+ 1);

    % --- estimate masking thresholds iteratively (as per page 505 of [1]) ---
    Tk0 = mask( x_cons, nwind, Fs, 16);
    Xp  = y_ps1;
    for j = 1: iter_num
        ab  = noise_b+ ( noise_b.^ 2)./ Tk0;  % Eq. 41
        Xp  = ( Xp.^ 2)./ ( ab+ Xp);          % Eq. 40
        Tk0 = mask( Xp, nwind, Fs, 16);
    end

    % --- estimate alpha: Eq. 26, threshold (T) method with ni(b)=1 ---
    alpha = ( noise_b+ Tk0).* ( noise_b./ Tk0);

    % --- apply suppression rule, mirrored onto the full FFT length ---
    H0 = ( Xp./ ( alpha+ Xp));
    H  = [ H0( 1: nwind/ 2+ 1); flipud( H0( 2: nwind/ 2))];

    x_hat   = H.* y_spec;
    Xk_prev = abs( x_hat).^ 2;                % kept for the next frame's a priori SNR

    es_tmp = real( ifft( x_hat));

    % --- overlap and add ---
    es_data( frame_range) = [ es_tmp( 1: noverlap)+ es_old; ...
        es_tmp( noverlap+ 1: nwind)];
    es_old = es_tmp( nwind- noverlap+ 1: nwind);
end

audiowrite( outfile, es_data, Fs, 'BitsPerSample', bits);
|
||||
|
||||
%------------------------------------------------------
|
||||
|
||||
function [CB_FREQ_INDICES]=find_CB_FREQ_INDICES(Fs,dft_length,nbits,frame_overlap)
% Returns, for every critical band, the indices of the DFT bins (from 0 Hz
% up to Fs/2) whose centre frequency falls inside that band.
%
%   Fs            - sampling frequency in Hz
%   dft_length    - DFT length used to derive the bin spacing Fs/dft_length
%   nbits, frame_overlap - accepted but not used by this function
%
%   CB_FREQ_INDICES - 1-by-(number of bands) cell array; cell b holds the
%                     bin indices belonging to critical band b
%
% This function is from Matlab STSA Toolbox for Audio Signal Noise Reduction
% Copyright (C) 2001 Patrick J. Wolfe

bin_freq   = (0:Fs/dft_length:Fs/2)';
band_edges = [0;100;200;300;400;510;630;770;920;1080;1270;1480;1720;2000; ...
              2320;2700;3150;3700;4400;5300;6400;7700;9500;12000;15500;Inf];

% Last band whose lower edge lies below the highest bin frequency
n_bands = find(band_edges < bin_freq(end), 1, 'last');
n_bins  = length(bin_freq);

% Membership matrix: entry (b,k) is 1 when bin k lies in band b.  Bin
% frequencies increase monotonically, so the band counter only moves forward.
membership = zeros(n_bands, n_bins);
band = 1;
for bin = 1:n_bins
    while bin_freq(bin) < band_edges(band) || bin_freq(bin) >= band_edges(band+1)
        band = band + 1;
    end
    membership(band, bin) = 1;
end

% Collect the bin indices of each critical band
CB_FREQ_INDICES = cell(1, n_bands);
for band = 1:n_bands
    CB_FREQ_INDICES{band} = find(membership(band, :));
end
|
||||
|
@ -0,0 +1,47 @@
|
||||
function z=confhyperg(a,b,x,n)
%CONFHYPERG  Confluent hypergeometric function by truncated series expansion.
%
%   z = confhyperg(a,b,x,n) evaluates
%
%     f(a,b;x) = 1 + [a/(1! b)] x + [a(a+1)/(2! b(b+1))] x^2 +
%                    [a(a+1)(a+2)/(3! b(b+1)(b+2))] x^3 + ...
%
%   using the first n terms of the series.
%
%   For negative arguments the series alternates in sign, so Kummer's
%   transformation  f(a,b;x) = exp(x) f(b-a,b;-x)  is used instead.  The
%   transformation is applied element by element, so x may contain a mix of
%   negative and non-negative values.
%
%   a, b - parameters (scalars, or arrays compatible in size with x)
%   x    - argument (scalar or array)
%   n    - number of series terms, a positive integer
%
%   Philipos C. Loizou

if nargin ~= 4
    error('Usage: confhyperg(a,b,x,n) - Incorrect number of arguments')
end

if (n <= 0 || n ~= floor(n))
    error('Usage: confhyperg(a,b,x,n) - n has to be a positive integer')
end

% Elementwise Kummer transformation: where x<0 use |x| and b-a in place of a.
is_neg = (x < 0);
x = abs(x);
a = is_neg.*(b - a) + (~is_neg).*a;

z = 0;
delta = 1;                       % current series term, starts at the constant 1
for m = 0:n-1
    if m > 0
        delta = delta .* x .* (a + (m - 1)) ./ (m .* (b + (m-1)));
    end
    z = z + delta;
end

% Undo the transformation: factor exp(-|x|) where x was negative, 1 elsewhere.
z = exp(-x.*is_neg).*z;
|
@ -0,0 +1,54 @@
|
||||
function z=hyperg(a,b,c,x,n)
% HYPERGEOMETRIC2F1 Gaussian hypergeometric function by truncated series:
%
%    f(a,b;c;x)=
%
%       1 + [ab/1!c]x + [a(a+1)b(b+1)/2!c(c+1)]x^2 +
%       [a(a+1)(a+2)b(b+1)(b+2)/3!c(c+1)(c+2)]x^3 + ...
%
% Only the first n terms of the series are summed.
%
% The function is a solution of the Gaussian hypergeometric differential
% equation:
%
%     x(1-x)y'' + {c-(a+b+1)x}y' - aby = 0
%
% The series converges only for:
%     |x| < 1
%     c != 0, -1, -2, -3, ...
% The condition on c is enforced below; the range of x is NOT checked.
%
% Comments to:
%     Diego Garcia   - d.garcia@ieee.org
%     Chuck Mongiovi - mongiovi@fast.net
%     June 14, 2002

% ---- argument validation ----
if nargin ~= 5
    error('Usage: hypergeometric2f1(a,b,c,x,n) --> Wrong number of arguments')
end

if (n <= 0 | n ~= floor(n))
    error('Usage: hypergeometric2f1(a,b,c,x,n) --> n has to be a positive integer')
end

if (c <= 0 & c == floor(c))
    error('Usage: hypergeometric2f1(a,b,c,x,n) --> c != 0, -1, -2, -3, ...')
end

% ---- series summation ----
% term is the m-th series term; each one follows from its predecessor by a
% single multiplicative update, so no factorials are formed explicitly.
term = 1;
z = 0 + term;
for m = 1:n-1
    term = term .* x .* (a + (m - 1)) .* (b + (m-1)) ./ m ./ (c + (m-1));
    z = z + term;
end
|
@ -0,0 +1,119 @@
|
||||
function logmmse(filename,outfile)
%LOGMMSE  Log-MMSE spectral amplitude speech enhancement [1].
%
%  Usage:  logmmse(noisyFile, outputFile)
%
%         filename - noisy speech file in .wav format (only the first
%                    channel is processed)
%         outfile  - enhanced output file in .wav format (16 bits per sample,
%                    same sample rate as the input)
%
%  The first 6 frames of the input are assumed to be noise/silence and are
%  used for the initial noise estimate.
%
%  Example call:  logmmse('sp04_babble_sn10.wav','out_log.wav');
%
%  References:
%   [1] Ephraim, Y. and Malah, D. (1985). Speech enhancement using a minimum
%       mean-square error log-spectral amplitude estimator. IEEE Trans. Acoust.,
%       Speech, Signal Process., ASSP-33(2), 443-445.
%
% Authors: Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
%-------------------------------------------------------------------------

if nargin<2
    fprintf('Usage: logmmse(noisyfile.wav,outFile.wav) \n\n');
    return;
end

% audioread/audiowrite are used for consistency with addnoisex.m.
[x, Srate] = audioread( filename);
x = x( :, 1);                        % work on a single column vector

% =============== Initialize variables ===============

len = floor( 20* Srate/ 1000);       % frame size in samples (20 ms)
if rem( len, 2)== 1, len = len+ 1; end
PERC = 50;                           % window overlap in percent of frame size
len1 = floor( len* PERC/ 100);       % overlapping samples
len2 = len- len1;                    % frame advance

win = hamming( len);                 % analysis window

% Noise magnitude calculations - assuming that the first 6 frames are
% noise/silence
nFFT = 2* len;
noise_mean = zeros( nFFT, 1);
j = 1;
for m = 1: 6
    noise_mean = noise_mean+ abs( fft( win.* x( j: j+ len- 1), nFFT));
    j = j+ len;
end
noise_mu  = noise_mean/ 6;
noise_mu2 = noise_mu.^ 2;

% --- allocate memory and initialize various variables
x_old   = zeros( len1, 1);           % overlap carried over from the previous frame
Nframes = floor( length( x)/ len2)- floor( len/ len2);
xfinal  = zeros( Nframes* len2, 1);

%=============================== Start Processing =========================
k   = 1;                             % start sample of the current frame
aa  = 0.98;                          % weight of the previous-frame term in the a priori SNR
mu  = 0.98;                          % smoothing factor of the noise update
eta = 0.15;                          % VAD decision threshold

ksi_min = 10^ ( -25/ 10);            % a priori SNR floor (-25 dB)

for n = 1: Nframes

    insign = win.* x( k: k+ len- 1);

    spec = fft( insign, nFFT);
    sig  = abs( spec);               % magnitude spectrum
    sig2 = sig.^ 2;

    gammak = min( sig2./ noise_mu2, 40);   % limit post SNR to avoid overflows
    if n== 1
        ksi = aa+ ( 1- aa)* max( gammak- 1, 0);
    else
        ksi = aa* Xk_prev./ noise_mu2+ ( 1- aa)* max( gammak- 1, 0);   % a priori SNR
        ksi = max( ksi_min, ksi);          % limit ksi to -25 dB
    end

    % ==== vad ====
    log_sigma_k  = gammak.* ksi./ ( 1+ ksi)- log( 1+ ksi);
    vad_decision = sum( log_sigma_k)/ len;
    if ( vad_decision< eta)
        % noise only frame found
        noise_mu2 = mu* noise_mu2+ ( 1- mu)* sig2;
    end
    % ==== end of vad ====

    A     = ksi./ ( 1+ ksi);         % Log-MMSE estimator
    vk    = A.* gammak;
    ei_vk = 0.5* expint( vk);
    hw    = A.* exp( ei_vk);         % spectral gain

    sig     = sig.* hw;
    Xk_prev = sig.^ 2;               % kept for the next frame's a priori SNR

    xi_w = real( ifft( hw.* spec, nFFT));

    % --- overlap and add ---
    xfinal( k: k+ len2- 1) = x_old+ xi_w( 1: len1);
    x_old = xi_w( len1+ 1: len);

    k = k+ len2;
end

audiowrite( outfile, xfinal, Srate, 'BitsPerSample', 16);
|
||||
|
||||
|
@ -0,0 +1,287 @@
|
||||
function logmmse_SPU(filename,outfile,option)
%LOGMMSE_SPU  Log-MMSE enhancement with signal-presence uncertainty (SPU) [1].
%
%  Four different methods for estimating the a priori probability of speech
%  absence (P(H0)) are implemented.
%
%  Usage:  logmmse_SPU(noisyFile, outputFile, option)
%
%         filename - noisy speech file in .wav format (only the first
%                    channel is processed)
%         outfile  - enhanced output file in .wav format (16 bits per sample,
%                    same sample rate as the input)
%         option   - method used to estimate the a priori probability of
%                    speech absence, P(Ho):
%                      1  - hard decision (Soon et al. [2])
%                      2  - soft decision (Soon et al. [2])
%                      3  - Malah et al.(1999) - ICASSP
%                      4  - Cohen (2002) [1]
%
%  The first 6 frames of the input are assumed to be noise/silence.
%  Option 4 keeps its running state in the global variables zetak,
%  zeta_fr_old and z_peak, which are shared with est_sap and reset here.
%
%  Example call:  logmmse_SPU('sp04_babble_sn10.wav','out_logSPU.wav',1);
%
%  References:
%   [1] Cohen, I. (2002). Optimal speech enhancement under signal presence
%       uncertainty using log-spectra amplitude estimator. IEEE Signal Processing
%       Letters, 9(4), 113-116.
%   [2] Soon, I., Koh, S., and Yeo, C. (1999). Improved noise suppression
%       filter using self-adaptive estimator of probability of speech absence.
%       Signal Processing, 75, 151-159.
%
% Author: Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
%-------------------------------------------------------------------------

global zetak zeta_fr_old z_peak

if nargin<3
    fprintf('Usage: logmmse_SPU(infile.wav,outfile.wav,option) \n');
    fprintf('where option = \n');
    fprintf(' 1 - hard decision ( Soon et al)\n');
    fprintf(' 2 - soft decision (Soon et al.)\n');
    fprintf(' 3 - Malah et al.(1999) \n');
    fprintf(' 4 - Cohen (2002) \n');
    return;
end

if option<1 || option>4 || rem( option, 1)~= 0
    % single-argument error() prints escape sequences literally, so none are used
    error('ERROR! option needs to be an integer between 1 and 4.');
end

% audioread/audiowrite are used for consistency with addnoisex.m.
[x, Srate] = audioread( filename);
x = x( :, 1);                        % work on a single column vector

% =============== Initialize variables ===============

len = floor( 20* Srate/ 1000);       % frame size in samples (20 ms)
if rem( len, 2)== 1, len = len+ 1; end
PERC = 50;                           % window overlap in percent of frame size
len1 = floor( len* PERC/ 100);       % overlapping samples
len2 = len- len1;                    % frame advance

win = hamming( len);                 % analysis window

% Noise magnitude calculations - assuming that the first 6 frames are
% noise/silence
nFFT = len;
noise_mean = zeros( nFFT, 1);
j = 1;
for m = 1: 6
    noise_mean = noise_mean+ abs( fft( win.* x( j: j+ len- 1), nFFT));
    j = j+ len;
end
noise_mu  = noise_mean/ 6;
noise_mu2 = noise_mu.^ 2;

% --- allocate memory and initialize various variables
aa  = 0.98;                          % weight of the previous-frame term in the a priori SNR
mu  = 0.98;                          % smoothing factor of the noise update
eta = 0.15;                          % VAD decision threshold
x_old   = zeros( len1, 1);           % overlap carried over from the previous frame
Nframes = floor( length( x)/ len2)- floor( len/ len2);
xfinal  = zeros( Nframes* len2, 1);

if option== 4                        % reset the state used by Cohen's method
    len2a       = len/ 2+ 1;
    zetak       = zeros( len2a, 1);
    zeta_fr_old = 1000;
    z_peak      = 0;
end

%=============================== Start Processing =========================

qk      = 0.5* ones( len, 1);        % initial a priori speech-absence probability
ksi_old = zeros( len, 1);
ksi_min = 10^ ( -25/ 10);            % a priori SNR floor (-25 dB)
Gmin    = 10^ ( -20/ 10);            % needed for Cohen's implementation
k       = 1;                         % start sample of the current frame

for n = 1: Nframes

    insign = win.* x( k: k+ len- 1);

    % --- Take fourier transform of frame
    spec = fft( insign, nFFT);
    sig  = abs( spec);               % magnitude spectrum
    sig2 = sig.^ 2;

    gammak = min( sig2./ noise_mu2, 40);   % post SNR
    if n== 1
        ksi = aa+ ( 1- aa)* max( gammak- 1, 0);
    else
        ksi = aa* Xk_prev./ noise_mu2+ ( 1- aa)* max( gammak- 1, 0);   % a priori SNR
        ksi = max( ksi_min, ksi);          % limit ksi to -25 dB
    end

    % ==== vad ====
    log_sigma_k  = gammak.* ksi./ ( 1+ ksi)- log( 1+ ksi);
    vad_decision = sum( log_sigma_k)/ len;
    if ( vad_decision< eta)
        % noise only frame found
        noise_mu2 = mu* noise_mu2+ ( 1- mu)* sig2;
    end
    % ==== end of vad ====

    A     = ksi./ ( 1+ ksi);
    vk    = A.* gammak;
    ei_vk = 0.5* expint( vk);
    hw    = A.* exp( ei_vk);         % log-MMSE gain

    % --- estimate conditional speech-presence probability ---
    qk   = est_sap( qk, ksi, ksi_old, gammak, option);    % P(Ho), a priori speech absence prob.
    pSAP = ( 1- qk)./ ( 1- qk+ qk.* ( 1+ ksi).* exp( -vk));  % P(H1 | Yk)

    % ---- Cohen's 2002 gain ------
    Gmin2  = Gmin.^ ( 1- pSAP);      % Cohen's (2002) - Eq 8
    Gcohen = ( hw.^ pSAP).* Gmin2;
    sig    = sig.* Gcohen;
    %----------------------------

    Xk_prev = sig.^ 2;               % kept for the next frame's a priori SNR
    ksi_old = ksi;                   % needed for Cohen's method for estimating q

    % resynthesize with the noisy phase
    xi_w = real( ifft( sig.* exp( 1i* angle( spec))));

    % --------- Overlap and add ---------------
    xfinal( k: k+ len2- 1) = x_old+ xi_w( 1: len1);
    x_old = xi_w( len1+ 1: len);

    k = k+ len2;
end
%==========================================================================

audiowrite( outfile, xfinal, Srate, 'BitsPerSample', 16);
|
||||
|
||||
%--------------------------- E N D -----------------------------------------
|
||||
|
||||
|
||||
function [qk]=est_sap(qk,xsi,xsi_old,gammak,type)
% Updates the a priori probability of speech absence, P(Ho).
%
%   qk      - previous speech-absence probability (returned updated)
%   xsi     - a priori SNR of the current frame
%   xsi_old - a priori SNR of the previous frame (used by method 4 only)
%   gammak  - a posteriori SNR of the current frame
%   type    - estimation method:
%               1 - hard decision (Soon et al.)
%               2 - soft decision (Soon et al.)
%               3 - Malah et al. (1999)
%               4 - Cohen (2002)
%             any other value leaves qk unchanged
%
% Method 4 reads and updates the global state zetak, zeta_fr_old and z_peak.

global zetak zeta_fr_old z_peak

switch type

    case 1    % hard-decision: Soon et al.
        smooth = 0.1;
        absent = ones(length(xsi),1);
        lik = exp(-xsi).*besseli(0,2*(gammak.*xsi).^0.5);
        absent(lik>1) = 0;              % bins judged to contain speech

        qk = smooth*absent + (1-smooth)*qk;

    case 2    % soft-decision: Soon et al.
        smooth = 0.1;
        lik = exp(-xsi).*besseli(0,2*(gammak.*xsi).^0.5);
        P_Ho = min(1,1./(1+lik));

        qk = smooth*P_Ho + (1-smooth)*qk;

    case 3    % Malah et al. (1999)
        % qk is only updated when the frame-level detector fires
        if mean(gammak(1:floor(length(gammak)/2)))> 2.4    % VAD detector
            smooth = 0.95;
            gamma_th = 0.8;
            absent = ones(length(xsi),1);
            absent(gammak>gamma_th) = 0;

            qk = smooth*qk+(1-smooth)*absent;
        end

    case 4    % Cohen (2002)
        smooth = 0.7;
        n_full = length(qk);
        n_half = n_full/2+1;

        % recursive average of the previous frame's a priori SNR
        zetak = smooth*zetak+(1-smooth)*xsi_old(1:n_half);

        z_min = 0.1;  z_max = 0.3162;
        log_span = log10(z_max/z_min);
        zp_min = 1;  zp_max = 10;
        zeta_local  = smoothing(zetak,1);
        zeta_global = smoothing(zetak,15);

        % NOTE(review): both tests below are strict, so a value exactly equal
        % to z_max matches neither of them and keeps probability 0.
        Plocal = zeros(n_half,1);                       % estimate P_local
        Plocal(zeta_local>z_max) = 1;
        mid = zeta_local>z_min & zeta_local<z_max;
        Plocal(mid) = log10(zeta_local(mid)/z_min)/log_span;

        Pglob = zeros(n_half,1);                        % estimate P_global
        Pglob(zeta_global>z_max) = 1;
        mid = zeta_global>z_min & zeta_global<z_max;
        Pglob(mid) = log10(zeta_global(mid)/z_min)/log_span;

        zeta_fr = mean(zetak);                          % estimate Pframe
        if ~(zeta_fr>z_min)
            Pframe = 0;
        elseif zeta_fr>zeta_fr_old
            Pframe = 1;
            z_peak = min(max(zeta_fr,zp_min),zp_max);
        elseif zeta_fr <= z_peak*z_min
            Pframe = 0;
        elseif zeta_fr >= z_peak*z_max
            Pframe = 1;
        else
            Pframe = log10(zeta_fr/z_peak/z_min)/log_span;
        end
        zeta_fr_old = zeta_fr;

        qk2 = 1- Plocal.*Pglob*Pframe;    % estimate prob of speech absence
        qk2 = min(0.95,qk2);
        qk = [qk2; flipud(qk2(2:n_half-1))];   % mirror onto the full FFT length

end
|
||||
|
||||
%----------------------------------------------
|
||||
function y=smoothing (x,N);
% Smooths the column vector x with a Hanning window of length 2*N+1 centred
% on each sample, normalised by the 2-norm of the window.
%
% The window is applied in two parts: its first N+1 taps weight the current
% and N preceding samples, its last N taps weight the N following samples
% (obtained by filtering a copy of x advanced by N samples and zero-filled
% at the end).

n_samp = length(x);
taper  = hanning(2*N+1);
head   = taper(1:N+1);          % rising half including the centre tap
tail   = taper(N+2:2*N+1);      % falling half

% current and past samples
past_part = filter(flipud(head),[1],x);

% future samples: advance x by N, zero-fill the end
advanced = zeros(n_samp,1);
advanced(1:n_samp-N) = x(N+1:n_samp);
future_part = filter(flipud(tail),[1],advanced);

y = (past_part+future_part)/norm(taper,2);
|
||||
|
@ -0,0 +1,96 @@
|
||||
% Author: Patrick J. Wolfe
|
||||
% Signal Processing Group
|
||||
% Cambridge University Engineering Department
|
||||
% p.wolfe@ieee.org
|
||||
% Johnston perceptual model initialisation
|
||||
function M= mask( Sx, dft_length, Fs, nbits)
%MASK  Masking threshold of a power spectrum (Johnston perceptual model).
%
%   M = mask( Sx, dft_length, Fs, nbits)
%
%   Sx         - power spectrum for the bins 0 .. Fs/2, one column per frame
%                (dft_length/2+1 rows)
%   dft_length - DFT length used to compute Sx
%   Fs         - sampling frequency in Hz
%   nbits      - word length used to scale the absolute threshold of hearing
%
%   M          - masking threshold on the linear frequency scale, same size
%                as Sx

freq   = ( 0: Fs/ dft_length: Fs/ 2)';
thresh = ( 1/ ( 2^ nbits- 1))^ 2/ dft_length;      % half-LSB power reference

crit_band_ends = [0;100;200;300;400;510;630;770;920;1080;1270;...
    1480;1720;2000;2320;2700;3150;3700;4400;5300;6400;7700;...
    9500;12000;15500;Inf];

% Maximum Bark frequency
imax = max( find( crit_band_ends< freq( end)));

% Normalised (to 0 dB) threshold of hearing values (Fletcher, 1929)
% as used by Johnston.  First and last thresholds are corresponding
% critical band endpoint values, elsewhere means of interpolated
% critical band endpoint threshold values are used.
abs_thr = 10.^ ( [38;31;22;18.5;15.5;13;11;9.5;8.75;7.25;4.75;2.75;...
    1.5;0.5;0;0;0;0;2;7;12;15.5;18;24;29]./ 10);
ABSOLUTE_THRESH = thresh.* abs_thr( 1: imax);

% Calculation of tone-masking-noise offset ratio in dB
OFFSET_RATIO_DB = 9+ ( 1: imax)';

% Initialisation of matrices for bark/linear frequency conversion
% (loop increments i to the proper critical band)
num_bins    = length( freq);
LIN_TO_BARK = zeros( imax, num_bins);
i = 1;
for j = 1: num_bins
    while ~( ( freq( j)>= crit_band_ends( i)) && ...
            ( freq( j)< crit_band_ends( i+ 1)))
        i = i+ 1;
    end
    LIN_TO_BARK( i, j) = 1;
end

% Calculation of spreading function (Schroeder et al., 82).
% Entry (i,j) depends only on the band distance |i-j|, i.e. the matrix is
% a symmetric Toeplitz matrix built from the vector 'spread'.
summ   = 0.474: imax;
spread = 10.^ ( ( 15.81+ 7.5.* summ- 17.5.* sqrt( 1+ summ.^ 2))./ 10);
spreading_fcn = toeplitz( spread( 1: imax));

% Calculation of excitation pattern function
EX_PAT = spreading_fcn* LIN_TO_BARK;

% Calculation of DC gain due to spreading function
DC_GAIN = spreading_fcn* ones( imax, 1);

C = EX_PAT* Sx;

% Calculation of spectral flatness measure SFM_dB (geometric mean over
% arithmetic mean).  The geometric mean is evaluated in the log domain:
% a direct product of all bins underflows or overflows for realistic
% power spectra and would drive the tonality coefficient to an extreme.
[num_bins, num_frames] = size( Sx);
k = 1/ num_bins;
geo_mean = exp( k.* sum( log( Sx), 1));
SFM_dB   = 10.* log10( geo_mean./ ( k.* sum( Sx, 1))+ eps);

% Calculation of tonality coefficient and masked threshold offset
alpha = min( 1, SFM_dB./ -60);
O_dB  = OFFSET_RATIO_DB( :, ones( 1, num_frames)).*...
    alpha( ones( length( OFFSET_RATIO_DB), 1), :)+ 5.5;

% Threshold calculation and renormalisation, accounting for absolute
% thresholds
T = C./ 10.^ ( O_dB./ 10);
T = T./ DC_GAIN( :, ones( 1, num_frames));
T = max( T, ABSOLUTE_THRESH( :, ones( 1, num_frames)));

% Reconversion to linear frequency scale
M = LIN_TO_BARK'* T;
|
@ -0,0 +1,150 @@
|
||||
function mmse(filename,outfile,SPU)
%MMSE  MMSE spectral amplitude speech enhancement [1].
%
%  Usage:  mmse(noisyFile, outputFile, SPU)
%
%         filename - noisy speech file in .wav format (only the first
%                    channel is processed)
%         outfile  - enhanced output file in .wav format (16 bits per sample,
%                    same sample rate as the input)
%         SPU      - if 1, includes speech-presence uncertainty
%                    if 0, doesnt include speech-presence uncertainty
%
%  The first 6 frames of the input are assumed to be noise/silence.
%
%  Example call:  mmse('sp04_babble_sn10.wav','out_mmse.wav',1);
%
%  References:
%   [1] Ephraim, Y. and Malah, D. (1985). Speech enhancement using a minimum
%       mean-square error log-spectral amplitude estimator. IEEE Trans. Acoust.,
%       Speech, Signal Process., ASSP-33(2), 443-445.
%   NOTE(review): the citation above is for the log-spectral estimator; the
%   gain computed below looks like the earlier MMSE short-time spectral
%   amplitude estimator of the same authors - confirm the intended reference.
%
% Authors: Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
%-------------------------------------------------------------------------

if nargin<3
    fprintf('Usage: mmse(infile.wav,outfile.wav,SPU) \n');
    fprintf('where SPU=1 - includes speech presence uncertainty\n');
    fprintf('      SPU=0 - does not includes speech presence uncertainty\n\n');
    return;
end

if SPU~= 1 && SPU~= 0
    error('ERROR: SPU needs to be either 1 or 0.');
end

% audioread/audiowrite are used for consistency with addnoisex.m.
[x, Srate] = audioread( filename);
x = x( :, 1);                        % work on a single column vector

% =============== Initialize variables ===============

len = floor( 20* Srate/ 1000);       % frame size in samples (20 ms)
if rem( len, 2)== 1, len = len+ 1; end
PERC = 50;                           % window overlap in percent of frame size
len1 = floor( len* PERC/ 100);       % overlapping samples
len2 = len- len1;                    % frame advance

win = hamming( len);                 % analysis window

% Noise magnitude calculations - assuming that the first 6 frames are noise/silence
nFFT = 2* len;
j = 1;
noise_mean = zeros( nFFT, 1);
for m = 1: 6
    noise_mean = noise_mean+ abs( fft( win.* x( j: j+ len- 1), nFFT));
    j = j+ len;
end
noise_mu  = noise_mean/ 6;
noise_mu2 = noise_mu.^ 2;

% --- allocate memory and initialize various variables
x_old   = zeros( len1, 1);           % overlap carried over from the previous frame
Nframes = floor( length( x)/ len2)- 1;
xfinal  = zeros( Nframes* len2, 1);

% --------------- Initialize parameters ------------
k   = 1;                             % start sample of the current frame
aa  = 0.98;                          % weight of the previous-frame term in the a priori SNR
eta = 0.15;                          % VAD decision threshold
mu  = 0.98;                          % smoothing factor of the noise update
c   = sqrt( pi)/ 2;
qk  = 0.3;                           % fixed a priori probability used by the SPU weighting
qkr = ( 1- qk)/ qk;
ksi_min = 10^ ( -25/ 10);            % a priori SNR floor (-25 dB)
hw  = ones( nFFT, 1);                % gain reused if a frame's Bessel evaluation fails

%=============================== Start Processing =========================
for n = 1: Nframes

    insign = win.* x( k: k+ len- 1);

    % --- Take fourier transform of frame
    spec = fft( insign, nFFT);
    sig  = abs( spec);               % magnitude spectrum
    sig2 = sig.^ 2;

    gammak = min( sig2./ noise_mu2, 40);   % posteriori SNR
    if n== 1
        ksi = aa+ ( 1- aa)* max( gammak- 1, 0);
    else
        % decision-direct estimate of a priori SNR
        ksi = aa* Xk_prev./ noise_mu2+ ( 1- aa)* max( gammak- 1, 0);
        ksi = max( ksi_min, ksi);          % limit ksi to -25 dB
    end

    % ==== vad ====
    log_sigma_k  = gammak.* ksi./ ( 1+ ksi)- log( 1+ ksi);
    vad_decision = sum( log_sigma_k)/ len;
    if ( vad_decision< eta)          % noise only frame found
        noise_mu2 = mu* noise_mu2+ ( 1- mu)* sig2;
    end
    % ==== end of vad ====

    vk = ksi.* gammak./ ( 1+ ksi);
    % The optional error-flag output of besseli is not relied on; a failed
    % evaluation is detected from non-finite results instead.
    j0 = besseli( 0, vk/ 2);
    j1 = besseli( 1, vk/ 2);
    if any( ~isfinite( j0)) || any( ~isfinite( j1))
        fprintf('ERROR! Overflow in Bessel calculation in frame: %d \n',n);
    else
        C  = exp( -0.5* vk);
        A  = ( ( c* ( vk.^ 0.5)).* C)./ gammak;
        B  = ( 1+ vk).* j0+ vk.* j1;
        hw = A.* B;                  % MMSE spectral gain
    end

    % --- estimate speech presence probability
    if SPU== 1
        evk    = exp( vk);
        Lambda = qkr* evk./ ( 1+ ksi);
        pSAP   = Lambda./ ( 1+ Lambda);
        sig    = sig.* hw.* pSAP;
    else
        sig    = sig.* hw;
    end

    Xk_prev = sig.^ 2;   % save for estimation of a priori SNR in next frame

    % resynthesize with the noisy phase
    xi_w = real( ifft( sig.* exp( 1i* angle( spec)), nFFT));

    % --- overlap and add ---
    xfinal( k: k+ len2- 1) = x_old+ xi_w( 1: len1);
    x_old = xi_w( len1+ 1: len);

    k = k+ len2;
end
%==========================================================================

audiowrite( outfile, xfinal, Srate, 'BitsPerSample', 16);
|
||||
|
@ -0,0 +1,696 @@
|
||||
function outfile= mt_mask( noisy_file, outfile)
%MT_MASK  Psychoacoustically motivated speech enhancement [1].
%
%  Usage:  mt_mask(noisyFile, outputFile)
%
%         noisy_file - noisy speech file in .wav format
%         outfile    - enhanced output file in .wav format; the same name
%                      is returned as the function output
%
%  The first 120 ms of the recording are taken as noise only and used for
%  the initial noise power spectrum. Frames are 20 ms long with 50 %
%  overlap; each frame is scaled by a gain derived from the ratio of the
%  noise spectrum to the masking threshold and the result is overlap-added.
%  Only the first channel of a multi-channel file is processed.
%
%  Example call:  mt_mask('sp04_babble_sn10.wav','out_mask.wav');
%
%  References:
%   [1] Hu, Y. and Loizou, P. (2004). Incorporating a psychoacoustical model in
%       frequency domain speech enhancement. IEEE Signal Processing Letters, 11(2),
%       270-273.
%
% Authors: Yi Hu and Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
%-------------------------------------------------------------------------

if nargin<2
    fprintf('Usage: mt_mask(noisyfile.wav,outFile.wav) \n\n');
    return;
end

% Initialize wavelet parameters (see also wiener_wt.m)
wavname= 'db4';
thre_type= 'ds'; thre_func_type= 's'; q_0= 5;
taper_num= 16;

%------------------get the noisy speech data
% audioread/audiowrite replace the removed wavread/wavwrite functions.
[noisy_speech, Srate]= audioread( noisy_file);
noisy_speech= noisy_speech( :, 1);  % the frame arithmetic below needs a column vector

%===========initiate the parameters=======================
frame_dur= 20;  % unit is milli-second
len= floor( Srate* frame_dur/ 1000);
if rem( len, 2)~= 0
    len= len+ 1;  % keep the frame length even so that len/2 is an integer
end
NFFT= len;  % number of FFT points
tapers= sine_taper( taper_num, NFFT);
diga= digamma( taper_num)- log( taper_num);  % bias of the log multitaper spectrum

win= hamming( len);
PERC= 50;  % window overlap in percent of frame size
len1= floor( len* PERC/ 100);
len2= len- len1;
L120= floor( 120* Srate/ 1000);  % number of samples in the initial 120 ms
bfl= 0.002;  % spectral floor

if length( noisy_speech)< L120
    error( 'mt_mask: input is shorter than the 120 ms needed for the initial noise estimate');
end

k= 1;  % k is starting point of each frame

%================================================
q= ceil( log2( len));
M= 2^ q;

% Statistics of the log-spectrum estimation noise, handed to thre_wavelet
sigma_eta_square= trigamma( taper_num);
N_autoc= sigma_eta_square* ( 1- ( 0: taper_num+ 1)/ ( taper_num+ 1));
N_autoc( M/ 2+ 1)= 0;  % zero-pad the autocorrelation up to lag M/2
Sigma_N_firstrow= [N_autoc( 1: M/ 2+ 1), fliplr( N_autoc( 2: M/ 2))];
noise_stat= real( fft( Sigma_N_firstrow));

%------get the wavelet/scaling filter for decomposition/reconstruction
[wfilter( 1, :), wfilter( 2, :), wfilter( 3, :), wfilter( 4, :)]= ...
    wfilters( wavname);

%------initial noise power spectrum from the first 120 ms
noise= noisy_speech( 1: L120);
noise_ps= psd_mt_sine( noise, tapers);
log_noise_ps= log( noise_ps)- diga;
den_log_noise_ps= thre_wavelet( log_noise_ps, noise_stat, thre_type, ...
    thre_func_type, wfilter, q_0);
% rebuild the second half from the first so the spectrum is symmetric
den_log_noise_ps= [den_log_noise_ps( 1: len/ 2+ 1); ...
    flipud( den_log_noise_ps( 2: len/ 2))];
noise_ps= exp( den_log_noise_ps);

%=================
mu_vad= 0.98;  % smoothing factor in noise spectrum update
aa= 0.98;      % smoothing factor in priori update
eta= 0.15;     % VAD threshold
%=================

Nframes= floor( length( noisy_speech)/ len2)- 1;
x_old= zeros( len1, 1);
xfinal= zeros( Nframes* len2, 1);
vad_decision= zeros( Nframes, 1);  % per-frame VAD statistic

%===============================  Start Processing ==========
for n= 1: Nframes

    insign= noisy_speech( k: k+ len- 1);
    insign_spec= fft( insign.* win, NFFT);

    %========estimate the noisy speech power spectrum
    ns_ps= psd_mt_sine( insign, tapers);

    log_ns_ps= log( ns_ps)- diga;
    den_log_ns_ps= thre_wavelet( log_ns_ps, noise_stat, thre_type, ...
        thre_func_type, wfilter, q_0);
    den_log_ns_ps= [den_log_ns_ps( 1: NFFT/ 2+ 1); ...
        flipud( den_log_ns_ps( 2: NFFT/ 2))];
    ns_ps= exp( den_log_ns_ps);
    %=================================================

    % a posteriori SNR from the windowed periodogram
    gammak= abs( insign_spec).^ 2/ (norm( win)^2)./ noise_ps;
    if n==1
        ksi= aa+ (1- aa)* max( gammak- 1, 0);
    else
        % decision-direct estimate of a priori SNR
        ksi= aa* Xk_prev./ noise_ps+ (1- aa)* max( gammak- 1, 0);
    end

    % ===vad: update the noise spectrum on frames judged to be noise only===
    log_sigma_k= gammak.* ksi./ (1+ ksi)- log( 1+ ksi);
    vad_decision( n)= sum( log_sigma_k)/ len;
    if vad_decision( n)< eta
        noise_ps= mu_vad* noise_ps+ (1- mu_vad)* ns_ps;
    end
    % ===end of vad===

    %========estimate the clean speech power spectrum
    cl_ps= ns_ps- noise_ps;
    cl_ps= max( cl_ps, bfl* ns_ps);  % providing a spectral floor
    %========

    % compute the masking threshold and expand it to NFFT length
    mask_thre= mask( cl_ps( 1: NFFT/ 2+ 1), NFFT, Srate, 16);
    mask_thre= [mask_thre; flipud( mask_thre( 2: NFFT/ 2))];

    noise_mask_ratio= noise_ps./ mask_thre;

    %=======two methods to compute g_wi
    % get the mu_k by u= max( sqrt( Sn/ alpha- 1), 0) * Sx/ Sn
    %aprioSNR= cl_ps./ noise_ps;
    %mu( :, n)= max( sqrt( noise_mask_ratio)-1, 0).* aprioSNR;
    %g_wi= aprioSNR./ ( aprioSNR+ mu_n);
    tmp= max( sqrt( noise_mask_ratio)- 1, 0);
    g_wi= 1./ (1+ tmp);

    xi_freq= g_wi.* insign_spec;
    Xk_prev= abs( xi_freq).^ 2;  % saved for the a priori SNR of the next frame

    xi_w= real( ifft( xi_freq));

    % overlap and add
    xfinal( k: k+ len2- 1)= x_old+ xi_w( 1: len1);
    x_old= xi_w( len1+ 1: len);

    k= k+ len2;

end
%========================================================================================

audiowrite( outfile, xfinal, Srate, 'BitsPerSample', 16);
|
||||
|
||||
|
||||
%========================================================================================
|
||||
|
||||
function after_thre= thre_wavelet( before_thre, noise_stat, ...
    thre_type, thre_func_type, wfilter, q_0)
%THRE_WAVELET  Wavelet thresholding of a data vector.
%  Refer to the papers by Walden/1998, Donoho/1995, Johnstone/1997.
%
%  before_thre:    data before thresholding (row or column vector)
%  noise_stat:     the power spectrum of the noise (i.e., noise statistics),
%                  DFT of the first row of Sigma_N, refer to Eq. (8) in
%                  Walden's paper
%  thre_type:      threshold type:
%                    'i'  scale-independent Universal
%                    'd'  scale-dependent Universal
%                    'ds' scale-dependent SURE
%                    'dn' scale-dependent, new risk function defined in
%                         Xiao-ping Zhang's paper
%                    'dg' scale-dependent Generalized Cross-Validation
%                  any other value (including 'is', which is not
%                  implemented) raises 'wrong thresholding type'
%  thre_func_type: threshold function type: soft ('s') or hard ('h')
%  wfilter:        wavelet filters, one per row:
%                  row 1 lo_d, row 2 hi_d, row 3 lo_r, row 4 hi_r
%  q_0:            the decomposition level
%
%  after_thre:     data after thresholding, same orientation as before_thre

s= size( before_thre);
before_thre= before_thre( :).';  % make it a row vector
noise_stat= noise_stat( :).';

N= length( before_thre);  % length of before-thresholded data
q= ceil( log2( N));
M= 2^ q;

%==get the low pass and high pass decomposition/reconstruction filters
lo_d= wfilter( 1, :);  % low pass decomposition filter/ scaling filter
hi_d= wfilter( 2, :);  % high pass decomposition filter/ wavelet filter
lo_r= wfilter( 3, :);  % low pass reconstruction filter/ scaling filter
hi_r= wfilter( 4, :);  % high pass reconstruction filter/ wavelet filter

%==refer to pp. 3155 in Walden's paper
H= zeros( q_0, M);
G= zeros( max( q_0- 1, 1), M);
H( 1, :)= fft( hi_d, M);  % frequency response of wavelet filter
G( 1, :)= fft( lo_d, M);  % frequency response of scaling filter
for i= 2: q_0- 1
    G( i, :)= G( 1, rem( (2^ (i- 1))* (0: M- 1), M)+ 1);
end
for j= 2: q_0
    H( j, :)= prod( [G( 1: j- 1, :); H( 1, rem( (2^ (j- 1))* (0: M- 1), M)+ 1)], 1);
end

% --decompose before_thre into q_0 levels using wavelet filter hi_d and
% --scaling filter lo_d; y_coeff holds the approximation segment followed
% --by the detail segments, len_info (length q_0+ 2) holds their lengths
[y_coeff, len_info]= wavedec( before_thre, q_0, lo_d, hi_d);

%===============processing according to 'thre_type'
% 'd*' types compute one threshold per level; 'i' uses one fixed threshold.
% Segment i (i= 2..q_0+1) of y_coeff is paired with index q_0- i+ 2 of the
% per-level noise estimates throughout.
% switch compares whole strings; '==' on char arrays compares element-wise
% and fails for strings of a different length.
switch thre_type

    case 'i'  % scale-independent universal thresholding
        sigma_square= mean( noise_stat);  % sigma_eta_square in Eq. (6)
        thre= sqrt( sigma_square* 2* log( M));
        y_coeff( len_info( 1)+ 1: end)= ...
            wthresh( y_coeff( len_info( 1)+ 1: end), thre_func_type, thre);

    case 'd'  % scale-dependent universal thresholding
        % energy level of each scale, refer to Eq. (9) in Walden's paper
        sigma_j_square= zeros( 1, q_0);
        for i= 1: q_0
            sigma_j_square( i)= mean( noise_stat.* (abs( H( i, :)).^ 2), 2);
        end

        for i= 2: q_0+ 1  % thresholding for each scale
            sp= sum( len_info( 1: i- 1))+ 1;  % starting point
            ep= sp+ len_info( i)- 1;          % ending point
            thre= sqrt( sigma_j_square( q_0- i+ 2)* 2* log( len_info( i)));
            y_coeff( sp: ep)= wthresh( y_coeff( sp: ep), thre_func_type, thre);
        end

    case 'ds'  % scale-dependent SURE thresholding
        % Eq. (9) in Walden's paper could be used to get sigma_j as in the
        % 'd' case, but the MDA estimate seems to be better:
        sigma_j= wnoisest( y_coeff, len_info, 1: q_0);

        for i= 2: q_0+ 1  % thresholding for each scale
            sp= sum( len_info( 1: i- 1))+ 1;  % starting point
            ep= sp+ len_info( i)- 1;          % ending point
            lvl= q_0- i+ 2;
            if sigma_j( lvl)< sqrt( eps)* max( y_coeff( sp: ep))
                thre= 0;  % noise estimate negligible: avoid dividing by it
            else
                thre= sigma_j( lvl)* thselect( y_coeff( sp: ep)/ ...
                    sigma_j( lvl), 'heursure');
            end
            y_coeff( sp: ep)= wthresh( y_coeff( sp: ep), thre_func_type, thre);
        end

    case 'dn'  % new risk function defined in Xiao-ping Zhang's paper
        sigma_j= wnoisest( y_coeff, len_info, 1: q_0);
        sigma_j_square= sigma_j.^ 2;

        for i= 2: q_0+ 1  % thresholding for each scale
            sp= sum( len_info( 1: i- 1))+ 1;  % starting point
            ep= sp+ len_info( i)- 1;          % ending point
            lvl= q_0- i+ 2;
            if sigma_j( lvl)< sqrt( eps)* max( y_coeff( sp: ep))
                thre= 0;
            else
                % The threshold varies with SNR: an ultra low SNR indicates
                % a low probability of signal presence, hence the universal
                % threshold; a very high SNR indicates a high probability
                % of signal presence, hence the risk-minimising threshold.
                % In between the threshold is interpolated linearly in dB.
                thre_max= sigma_j( lvl)* sqrt( 2* log( len_info( i)));  % thre with SNRlog< -5dB
                thre_min= sigma_j( lvl)* fminbnd( @riskfunc, 0, sqrt( 2* log( ep- sp+ 1)), ...
                    optimset( 'MaxFunEvals',1000,'MaxIter',1000), ...
                    y_coeff( sp: ep)/ sigma_j( lvl), 3);                % thre with SNRlog> 20dB
                slope= (thre_max- thre_min)/ 25;
                thre_0= thre_min+ 20* slope;

                SNRlog= 10* log10( mean( max( y_coeff( sp: ep).^ 2/ sigma_j_square( lvl)- 1, 0)));
                if SNRlog>= 20
                    thre= thre_min;  % actually this corresponds to SURE threshold
                elseif SNRlog>= -5
                    thre= thre_0- SNRlog* slope;
                else
                    thre= thre_max;  % this corresponds to oversmooth threshold
                end
            end
            y_coeff( sp: ep)= wthresh( y_coeff( sp: ep), thre_func_type, thre);
        end

    case 'dg'  % scale-dependent Generalized Cross Validation thresholding
        for i= 2: q_0+ 1  % thresholding for each scale
            sp= sum( len_info( 1: i- 1))+ 1;  % starting point
            ep= sp+ len_info( i)- 1;          % ending point
            [y_coeff( sp: ep), thre]= mingcv( y_coeff( sp: ep), thre_func_type);
        end

    otherwise
        error( 'wrong thresholding type');
end

%--reconstruct the thresholded coefficients
after_thre= waverec( y_coeff, len_info, lo_r, hi_r);

if s( 1)> 1  % input was a column vector: return a column vector
    after_thre= after_thre.';
end
|
||||
%fprintf( 1, 'thre is %f\n', thre);
|
||||
|
||||
|
||||
|
||||
function mt_psd= psd_mt_sine( data, sine_tapers)
%PSD_MT_SINE  Multitaper power spectrum estimate using sine tapers.
%
%  data:        the incoming data (any vector orientation); must contain
%               at least as many samples as one taper
%  sine_tapers: matrix with each column being one sine taper, as returned
%               by the function sine_taper
%
%  mt_psd:      column vector with one value per taper sample. Every
%               window of taper length in 'data' (shifted one sample at a
%               time) is multiplied by each taper, the squared FFT
%               magnitudes are averaged over the tapers, and the result is
%               averaged over all windows.

[frame_len, taper_num]= size( sine_tapers);

data= data( :);
data_len= length( data);
if data_len< frame_len
    error( 'psd_mt_sine: data has %d samples but the tapers need %d', ...
        data_len, frame_len);
end

num_frames= data_len- frame_len+ 1;
x_mt_psd= zeros( frame_len, num_frames);

for pp= 1: num_frames
    % slice the window directly instead of building a full Hankel matrix
    frame= data( pp: pp+ frame_len- 1);
    % one column per taper; FFT taken along the columns
    tapered= sine_tapers.* frame( :, ones( 1, taper_num));
    eigen_spectra= abs( fft( tapered, [], 1)).^ 2;
    x_mt_psd( :, pp)= mean( eigen_spectra, 2);
end

mt_psd= mean( x_mt_psd, 2);
|
||||
|
||||
|
||||
|
||||
function tapers= sine_taper( L, N)
%SINE_TAPER  Generate the sine tapers proposed by Riedel et al. in
%  IEEE Transactions on Signal Processing, pp. 188- 195, Jan. 1995.
%
%  L:      number of sine tapers to generate
%  N:      length of each sine taper
%
%  tapers: N-by-L matrix; column k holds
%          sqrt( 2/ (N+ 1))* sin( pi* k* n/ (N+ 1)) for n= 1..N

% outer product: entry (n, k) is n* (pi* k)
phase= (1: N)'* (pi* (1: L))/ (N+ 1);
tapers= sqrt( 2/ (N+ 1))* sin( phase);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
function y = trigamma(z,method,debug)
%
%   y = trigamma(z)   ... Trigamma-Function for real argument z
%
%   trigamma(z) = (d/dz)^2 log(gamma(z)) = d/dz digamma(z)
%
%   if 'z' is a matrix, then the trigamma-function is evaluated for
%   each element and the result has the same size as 'z'. Results are
%   inaccurate for real arguments < 10 which are neither integers nor
%   half-integers.
%
%   y = trigamma(z,method)
%
%   possible values for optional argument 'method':
%     method = 1           : quick asymptotic series expansion (approximate)
%     method = 2           : finite recursion for integer values (exact)
%     method = 3           : finite recursion for half-integer values (exact)
%     method = 4 (default) : automatic selection of 1,2 or 3 for individual
%                            elements in z whichever is appropriate.
%
%   y = trigamma(z,method,debug) with debug = 1 prints the method and the
%   argument range of every (recursive) call.
%
%   Non-positive integers return Inf; NaN arguments return NaN.
%
%   see also:  digamma, gamma, gammaln, gammainc, specfun

%   reference: Abramowitz & Stegun, "Handbook of Mathematical Functions"
%              Chapter "Gamma Function and Related Functions" :
%   implemented by: Christoph Mecklenbraeuker
%              (email: cfm@sth.ruhr-uni-bochum.de), July 4, 1995.

dim = size(z);                  % save original matrix dimension
z   = z(:);                     % make a column vector (any number of dims)
I1  = ones(length(z),1);        % auxiliary vector of ones

if(nargin==1)
    method=4; debug=0;
elseif(nargin==2)
    debug=0;
end;

if(debug == 1)                  % if debug==1: track recursion
    [m,n] = size(z);
    fprintf(1,'trigamma: method = %d, size(z)=[%d %d],\t min(z)=%f, max(z)=%f\n',...
        method,m,n,min(min(z)),max(max(z)));
end;

if(method==1)                   % use 9th order asymptotic expansion
    if(any(z<1))
        fprintf(1,'Warning: some elements in argument of "trigamma(z,1)" are < 1\n');
        fprintf(1,'minimal argument = %g: trigamma-result is inaccurate!\n',min(min(z)));
    end

    % calculate powers of 1/z :
    w1 = 1./z; w2 = w1.*w1; w3 = w1.*w2; w5 = w2.*w3; w7 = w2.*w5; w9 = w2.*w7;
    % generate coefficients of expansion: matrix with constant columns
    a = [ I1   I1/2   I1/6  -I1/30  I1/42  -I1/30];
    % make vector of powers of 1/z:
    w = [ w1   w2     w3     w5     w7      w9];
    % calculate expansion by summing the ROWS of (a .* w) :
    y = sum((a.*w).').';
elseif(method==2)               % psi'(n+1) = psi'(n) - 1/n^2
    zmax = max(max(floor(z)));
    ytab = zeros(zmax,1);
    ytab(1) = pi^2/6;           % = psi'(1)
    for n=1:zmax-1;
        ytab(n+1) = ytab(n) - 1/n^2;            % generate lookup table
    end;
    y = ytab(z);
elseif(method==3)               % psi'(n+1/2) = psi'(n-1/2) - 4/(2n-1)^2
    zmax = max(max(floor(z)));
    ytab = zeros(zmax+1,1);
    ytab(1) = pi^2/2;           % = psi'(1/2)
    for n=1:zmax;
        ytab(n+1) = ytab(n) - 4/(2*n-1)^2;      % generate lookup table
    end;
    % round: method 4 accepts half-integers within a 1e-7 tolerance, and a
    % table index must be an exact integer
    y = ytab(round(z+0.5));
elseif(method==4)               % decide here which method to use
    % preallocate so that every element has a value and the final reshape
    % cannot fail; NaN marks arguments that match no class (NaN inputs)
    y = NaN*ones(length(z),1);

    fraction = rem(z,1);                        % fractional part of arguments
    isInt    = (fraction==0);
    isHalf   = (abs(fraction-0.5)<1e-7 & z>0);

    Less0    = find(z<0);                       % negative arguments evaluated by reflexion formula
    Less1    = find(z>0 & z<1);                 % values between 0 and 1.
    Integers = find(isInt & z>0);               % Index set of positive integer arguments
    NegInts  = find(isInt & z<=0);              % Index set of non-positive integer arguments
    HalfInts = find(isHalf);                    % Index set of positive half-integers
    Reals    = find(z>1 & ~isInt & ~isHalf);    % Index set of all other arguments > 1

    if(~isempty(Reals)) y(Reals) = trigamma(z(Reals),1,debug); end;
    % shift the argument up by 2 before using the expansion:
    if(~isempty(Less1)) y(Less1) = trigamma(z(Less1)+2,1,debug) + ...
            1./z(Less1).^2+1./(z(Less1)+1).^2; end;
    % reflexion formula:
    if(~isempty(Less0)) y(Less0) = -trigamma(1-z(Less0),1,debug)+(pi./sin(pi*z(Less0))).^2; end;
    % integers:
    if(~isempty(Integers)) y(Integers) = trigamma(z(Integers),2,debug); end;
    % half-integers (overrides the Less1 value for z = 0.5):
    if(~isempty(HalfInts)) y(HalfInts) = trigamma(z(HalfInts),3,debug); end;
    % non-positive integers are poles:
    if(~isempty(NegInts)) y(NegInts) = Inf; end;
else
    error('trigamma: method must be 1, 2, 3 or 4');
end

y = reshape(y,dim);
return;
|
||||
|
||||
|
||||
|
||||
|
||||
function psi = digamma(z,method,debug)
%
%   psi = digamma(z)   ... Digamma-Function for real argument z.
%
%   digamma(z) = d/dz log(gamma(z)) = gamma'(z)/gamma(z)
%
%   if 'z' is a matrix, then the digamma-function is evaluated for
%   each element and the result has the same size as 'z'. Results may be
%   inaccurate for real arguments < 10 which are neither integers nor
%   half-integers.
%
%   psi = digamma(z,method)
%
%   possible values for optional argument 'method':
%     method = 1           : quick asymptotic series expansion (approximate)
%     method = 2           : finite recursion for integer values (exact)
%     method = 3           : finite recursion for half-integer values (exact)
%     method = 4 (default) : automatic selection of 1,2 or 3 for individual
%                            elements in z whichever is appropriate.
%
%   psi = digamma(z,method,debug) with debug = 1 prints the method and the
%   argument range of every (recursive) call.
%
%   Non-positive integers return Inf; NaN arguments return NaN.
%
%   see also:  trigamma, gamma, gammaln, gammainc, specfun

%   reference: Abramowitz & Stegun, "Handbook of Mathematical Functions"
%              Chapter "Gamma Function and Related Functions" :
%   implemented by: Christoph Mecklenbraeuker
%              (email: cfm@sth.ruhr-uni-bochum.de), July 1, 1995.

dim = size(z);                  % save original matrix dimension
z   = z(:);                     % make a column vector (any number of dims)
I1  = ones(length(z),1);        % auxiliary vector of ones

if(nargin==1)
    method=4; debug=0;
elseif(nargin==2)
    debug=0;
end;

if(debug == 1)                  % if debug==1: track recursion
    [m,n] = size(z);
    fprintf(1,'digamma: method = %d, size(z)=[%d %d],\t min(z)=%f, max(z)=%f\n',...
        method,m,n,min(min(z)),max(max(z)));
end;

if(method==1)                   % use 8th order asymptotic expansion
    if(any(z<1))
        fprintf(1,'Warning: some elements in argument of "digamma(z,1)" are < 1\n');
        fprintf(1,'minimal argument = %g: digamma-result is inaccurate!\n',min(min(z)));
    end
    % calculate powers of 1/z :
    w1 = 1./z; w2 = w1.*w1; w4 = w2.*w2; w6 = w2.*w4; w8 = w4.*w4;
    % generate coefficients of expansion: matrix with constant columns
    a = [ -I1/2   -I1/12   I1/120   -I1/252   I1/240 ];
    % make vector of powers of 1/z:
    w = [  w1      w2      w4        w6       w8 ];
    % calculate expansion by summing the ROWS of (a .* w) :
    psi = log(z) + sum((a.*w).').';
elseif(method==2)               % psi(n+1) = psi(n) + 1/n
    zmax = max(max(floor(z)));
    psitab = zeros(zmax,1);
    psitab(1) = -0.5772156649015328606;         % = psi(1)
    for n=1:zmax-1;
        psitab(n+1) = psitab(n) + 1/n;          % generate lookup table
    end;
    psi = psitab(z);
elseif(method==3)               % psi(n+1/2) = psi(n-1/2) + 2/(2n-1)
    zmax = max(max(floor(z)));
    psitab = zeros(zmax+1,1);
    psitab(1) = -0.5772156649015328606 - 2*log(2);  % = psi(1/2)
    for n=1:zmax;
        psitab(n+1) = psitab(n) + 2/(2*n-1);    % generate lookup table
    end;
    % round: method 4 accepts half-integers within a 1e-7 tolerance, and a
    % table index must be an exact integer
    psi = psitab(round(z+0.5));
elseif(method==4)               % decide here which method to use
    % preallocate so that every element has a value and the final reshape
    % cannot fail; NaN marks arguments that match no class (NaN inputs)
    psi = NaN*ones(length(z),1);

    fraction = rem(z,1);                        % fractional part of arguments
    isInt    = (fraction==0);
    isHalf   = (abs(fraction-0.5)<1e-7 & z>0);

    Less0    = find(z<0);                       % negative arguments evaluated by reflexion formula
    Less1    = find(z>0 & z<1);                 % values between 0 and 1.
    Integers = find(isInt & z>0);               % Index set of positive integer arguments
    NegInts  = find(isInt & z<=0);              % Index set of non-positive integer arguments
    HalfInts = find(isHalf);                    % Index set of positive half-integers
    Reals    = find(z>1 & ~isInt & ~isHalf);    % Index set of all other arguments > 1

    if(~isempty(Reals)) psi(Reals) = digamma(z(Reals),1,debug); end;
    % shift the argument up by 2 before using the expansion:
    if(~isempty(Less1)) psi(Less1) = digamma(z(Less1)+2,1,debug) - ...
            1./z(Less1)-1./(z(Less1)+1); end;
    % reflexion formula:
    if(~isempty(Less0)) psi(Less0) = digamma(1-z(Less0),1,debug) - pi./tan(pi*z(Less0)); end;
    % integers:
    if(~isempty(Integers)) psi(Integers) = digamma(z(Integers),2,debug); end;
    % half-integers (overrides the Less1 value for z = 0.5):
    if(~isempty(HalfInts)) psi(HalfInts) = digamma(z(HalfInts),3,debug); end;
    % non-positive integers are poles:
    if(~isempty(NegInts)) psi(NegInts) = Inf; end;
else
    error('digamma: method must be 1, 2, 3 or 4');
end

psi = reshape(psi,dim);

return;
|
||||
|
||||
|
||||
% Author: Patrick J. Wolfe
|
||||
% Signal Processing Group
|
||||
% Cambridge University Engineering Department
|
||||
% p.wolfe@ieee.org
|
||||
% Johnston perceptual model initialisation
|
||||
function M= mask( Sx, dft_length, Fs, nbits)
%MASK  Masking threshold from the Johnston perceptual model.
%
%  Sx:         power spectrum for the frequencies 0..Fs/2, one column per
%              frame (dft_length/2+ 1 rows)
%  dft_length: DFT length the spectrum was computed with
%  Fs:         sampling frequency in Hz
%  nbits:      sample word length in bits, used to scale the absolute
%              threshold of hearing
%
%  M:          masking threshold, same size as Sx

freq= (0: Fs/ dft_length: Fs/ 2)';
thresh= (1/ (2^ nbits- 1))^ 2/ dft_length;   % absolute-threshold scale factor
crit_band_ends = [0;100;200;300;400;510;630;770;920;1080;1270;...
    1480;1720;2000;2320;2700;3150;3700;4400;5300;6400;7700;...
    9500;12000;15500;Inf];

% Maximum Bark frequency: index of the critical band that contains the
% highest analysed frequency. Bands are [lower, upper), so a frequency that
% sits exactly on a band edge belongs to the band starting there ('<=').
imax = find( crit_band_ends<= freq( end), 1, 'last');

% Normalised (to 0 dB) threshold of hearing values (Fletcher, 1929)
% as used by Johnston. First and last thresholds are corresponding
% critical band endpoint values, elsewhere means of interpolated
% critical band endpoint threshold values are used.
abs_thr = 10.^([38;31;22;18.5;15.5;13;11;9.5;8.75;7.25;4.75;2.75;...
    1.5;0.5;0;0;0;0;2;7;12;15.5;18;24;29]./10);
ABSOLUTE_THRESH = thresh.*abs_thr(1:imax);

% Calculation of tone-masking-noise offset ratio in dB
OFFSET_RATIO_DB = 9+ (1:imax)';

% Initialisation of matrices for bark/linear frequency conversion:
% LIN_TO_BARK(b, f) is 1 when frequency bin f lies in critical band b.
% Frequencies are ascending, so the band index only ever moves forward.
num_bins = length(freq);
LIN_TO_BARK = zeros(imax,num_bins);
band = 1;
for bin = 1:num_bins
    while freq(bin) >= crit_band_ends(band+1)
        band = band+1;
    end
    LIN_TO_BARK(band,bin) = 1;
end

% Calculation of spreading function (Schroeder et al., 82);
% entry (i, j) depends only on the band distance abs(j- i)
summ = 0.474:imax;
spread = 10.^((15.81+7.5.*summ-17.5.*sqrt(1+summ.^2))./10);
spreading_fcn = toeplitz(spread);

% Calculation of excitation pattern function
EX_PAT = spreading_fcn* LIN_TO_BARK;

% Calculation of DC gain due to spreading function
DC_GAIN = spreading_fcn* ones(imax,1);

%Sx = X.* conj(X);
C = EX_PAT* Sx;

% Calculation of spectral flatness measure SFM_dB (geometric mean over
% arithmetic mean). The geometric mean is taken in the log domain: the
% direct product of all bins underflows or overflows for ordinary spectra.
[num_bins, num_frames] = size(Sx);
k = 1/num_bins;
geo_mean = exp(mean(log(Sx),1));
SFM_dB = 10.*log10(geo_mean./(k.*sum(Sx,1)+eps)+ eps);

% Calculation of tonality coefficient and masked threshold offset
alpha = min(1,SFM_dB./-60);
O_dB = OFFSET_RATIO_DB(:,ones(1,num_frames)).*...
    alpha(ones(length(OFFSET_RATIO_DB),1),:) + 5.5;

% Threshold calculation and renormalisation, accounting for absolute
% thresholds
T = C./10.^(O_dB./10);
T = T./DC_GAIN(:,ones(1,num_frames));
T = max( T, ABSOLUTE_THRESH(:, ones(1, num_frames)));

% Reconversion to linear frequency scale
%M = 1.* sqrt((LIN_TO_BARK')*T);
M= LIN_TO_BARK'* T;
|
@ -0,0 +1,153 @@
|
||||
function stsa_mis(filename,outfile)
%STSA_MIS  Bayesian estimator based on the modified Itakura-Saito
%  distortion measure [1, Eq. 43].
%
%  Usage:  stsa_mis(noisyFile, outputFile)
%
%         filename - noisy speech file in .wav format
%         outfile  - enhanced output file in .wav format
%
%  The noise spectrum is estimated once from the first 5 non-overlapping
%  20 ms frames, which are assumed to contain noise/silence only. Only the
%  first channel of a multi-channel file is processed.
%
%  Example call:  stsa_mis('sp04_babble_sn10.wav','out_mis.wav');
%
%  References:
%   [1] Loizou, P. (2005). Speech enhancement based on perceptually motivated
%       Bayesian estimators of the speech magnitude spectrum. IEEE Trans. on Speech
%       and Audio Processing, 13(5), 857-869.
%
% Author: Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
%-------------------------------------------------------------------------

if nargin<2
    fprintf('Usage: stsa_mis inFile outFile.wav \n\n');
    return;
end

% audioread/audiowrite replace the removed wavread/wavwrite functions.
[x, Srate]= audioread( filename);
x= x( :, 1);    % the frame arithmetic below needs a column vector

% =============== Initialize variables ===============
len=floor(20*Srate/1000); % Frame size in samples
if rem(len,2)==1, len=len+1; end;   % keep the frame length even
PERC=50; % window overlap in percent of frame size
len1=floor(len*PERC/100);
len2=len-len1;

win=hanning(len);  %tukey(len,PERC);  % define window

% Noise magnitude calculations - assuming that the first 5 frames is noise/silence
nFFT=len;
num_noise_frames=5;
noise_mean=zeros(nFFT,1);
j=1;
for k=1:num_noise_frames
    noise_mean=noise_mean+abs(fft(win.*x(j:j+len-1),nFFT));
    j=j+len;
end
noise_mu=noise_mean/num_noise_frames;
noise_mu2=noise_mu.^2;

%--- allocate memory and initialize various variables
img=sqrt(-1);
x_old=zeros(len1,1);
Nframes=floor(length(x)/len2)-1;
xfinal=zeros(Nframes*len2,1);

%===============================  Start Processing =======================================================
k=1;        % starting sample of the current frame
aa=0.98;    % smoothing factor of the a priori SNR estimate
fprintf('\nThis might take some time ...\n');
for n=1:Nframes

    insign=win.*x(k:k+len-1);

    %--- Take fourier transform of  frame ----
    spec=fft(insign,nFFT);
    sig=abs(spec); % compute the magnitude
    sig2=sig.^2;

    gammak=min(sig2./noise_mu2,40);  % post SNR. Limit it to avoid overflows
    if n==1
        ksi=aa+(1-aa)*max(gammak-1,0);
    else
        ksi=aa*Xk_prev./noise_mu2 + (1-aa)*max(gammak-1,0);     % a priori SNR
    end

    vk=ksi.*gammak./(1+ksi);

    sig_hat=log(comp_int(vk,gammak,sig));  % Eq. 41

    Xk_prev=sig_hat.^2;     % saved for the a priori SNR of the next frame

    % resynthesise with the noisy phase
    xi_w= real( ifft( sig_hat.* exp(img*angle(spec))));

    % --- Overlap and add ---------------
    xfinal(k:k+ len2-1)= x_old+ xi_w(1:len1);
    x_old= xi_w(len1+ 1: len);

    if rem(n,20)==0, fprintf('Frame: %d Percent completed:%4.2f\n',n,n*100/Nframes); end;

    k=k+len2;
end
%========================================================================================

audiowrite(outfile,xfinal,Srate,'BitsPerSample',16);
|
||||
|
||||
%------------------------------E N D -----------------------------------
|
||||
function xhat=comp_int(vk,gammak,Yk)
%COMP_INT  Evaluates Eq. 43 in [1] with the infinite sum truncated.
%
%  vk, gammak, Yk - column vectors with one entry per FFT bin, as computed
%                   in stsa_mis; an even length is assumed
%
%  xhat           - column vector of the same length. Only the first
%                   length(vk)/2+ 1 bins are evaluated; the remaining bins
%                   are filled by mirroring. Values are floored at 0.00001.

Yk2=Yk.*Yk;
G2=gammak.^2;
EV=exp(-vk);

N=40;  % number of terms to keep in infinite sum (Eq. 43)
L=length(vk)/2+1;
J1=zeros(L,1);
J2=zeros(L,1);

% Term coefficients depend only on m: compute them once, not per bin.
m_all=0:N;
fact_m=factorial(m_all);     % m!
gam_m1=gamma(m_all+1);       % gamma(m+1)
gam_m15=gamma(m_all+1.5);    % gamma(m+1.5)

for j=1:L
    hyp_arg=Yk2(j)/(4*G2(j));   % argument shared by both hypergeometric terms
    acc_a=0; acc_b=0;
    for m=0:N
        d1=(vk(j))^m;
        d2=hyperg(-m,-m,0.5,hyp_arg,10);
        d2_b=hyperg(-m,-m,1.5,hyp_arg,10);
        acc_a=acc_a+d1*d2/fact_m(m+1);
        acc_b=acc_b+gam_m15(m+1)*d1*d2_b/(fact_m(m+1)*gam_m1(m+1));
    end
    J1(j)=acc_a;
    J2(j)=acc_b;
end

J1=J1.*EV(1:L);
J2=J2.*EV(1:L).*sqrt(vk(1:L)).*Yk(1:L)./gammak(1:L);

xhat2=max(real(J1+J2),0.00001);
xhat = [xhat2; flipud(xhat2(2:L-1))];   % mirror to the full FFT length
|
@ -0,0 +1,131 @@
|
||||
function stsa_wcosh(filename,outfile,p)

%
%  Implements the Bayesian estimator based on the weighted cosh
%  distortion measure [1, Eq. 34].
%
%  Usage:  stsa_wcosh(noisyFile, outputFile, p)
%
%         noisyFile  - noisy speech file in .wav format
%         outputFile - enhanced output file in .wav format
%         p          - power exponent used in the weighted cosh measure.
%                      Valid values for p:  p>-1
%
%  The noise spectrum is estimated from the first 5 non-overlapping frames
%  of the input, which are assumed to contain noise/silence only.
%
%  Example call:  stsa_wcosh('sp04_babble_sn10.wav','out_wcosh.wav',-0.5);
%
%  References:
%   [1] Loizou, P. (2005). Speech enhancement based on perceptually motivated
%       Bayesian estimators of the speech magnitude spectrum. IEEE Trans. on Speech
%       and Audio Processing, 13(5), 857-869.
%
% Author: Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
%-------------------------------------------------------------------------

if nargin<3
    fprintf('Usage: stsa_wcosh(infile.wav,outfile.wav,p) \n');
    fprintf(' where p>-1 \n\n');
    return;
end

% p=-1 has to be rejected as well: gamma((p+1)/2) is gamma(0)=Inf there,
% which makes CC2 (and therefore the gain hw) zero for every bin.
if p<=-1
    error('ERROR! p needs to be larger than -1.');
end

% audioread/audiowrite replace the removed wavread/wavwrite
[x, Srate]= audioread( filename);


% =============== Initialize variables ===============
%

len=floor(20*Srate/1000);            % Frame size in samples (20 ms)
if rem(len,2)==1, len=len+1; end     % force an even frame length
PERC=50;                             % window overlap in percent of frame size
len1=floor(len*PERC/100);            % overlapping part of a frame
len2=len-len1;                       % hop size

win=hanning(len);  %tukey(len,PERC);  % define window


% Noise magnitude calculations - assuming that the first 5 frames is noise/silence
%
nFFT=2*len;
nNoiseFrames=5;
noise_mean=zeros(nFFT,1);
j=1;
for k=1:nNoiseFrames
    noise_mean=noise_mean+abs(fft(win.*x(j:j+len-1),nFFT));
    j=j+len;
end
noise_mu=noise_mean/nNoiseFrames;
noise_mu2=noise_mu.^2;

%--- allocate memory and initialize various variables

x_old=zeros(len1,1);
Nframes=floor(length(x)/len2)-1;
xfinal=zeros(Nframes*len2,1);


%===============================  Start Processing =======================================================
%
k=1;
aa=0.98;                                     % smoothing factor of the a priori SNR update
CC2=sqrt(gamma((p+3)/2)/gamma((p+1)/2));     % p-dependent constant of the gain

for n=1:Nframes

    insign=win.*x(k:k+len-1);

    %--- Take fourier transform of  frame

    spec=fft(insign,nFFT);
    sig=abs(spec);          % compute the magnitude
    sig2=sig.^2;

    gammak=min(sig2./noise_mu2,40);  % post SNR
    if n==1
        ksi=aa+(1-aa)*max(gammak-1,0);
    else
        ksi=aa*Xk_prev./noise_mu2 + (1-aa)*max(gammak-1,0);     % a priori SNR
    end

    vk=ksi.*gammak./(1+ksi);

    % --- for the weighted cosh measure

    numer=CC2*sqrt(vk.*confhyperg(-(p+1)/2,1,-vk,100));
    denom=gammak.*sqrt(confhyperg(-(p-1)/2,1,-vk,100));
    hw=numer./denom;

    sig=sig.*hw;
    Xk_prev=sig.^2;         % enhanced power spectrum, used for the next frame's ksi

    xi_w= ifft( hw .* spec, nFFT);
    xi_w= real( xi_w);

    % --- Overlap and add ---------------
    %
    xfinal(k:k+ len2-1)= x_old+ xi_w(1:len1);
    x_old= xi_w(len1+ 1: len);

    k=k+len2;
end
%========================================================================================

audiowrite(outfile,xfinal,Srate,'BitsPerSample',16);
|
||||
|
@ -0,0 +1,145 @@
|
||||
function stsa_weuclid(filename,outfile,p)

%
%  Implements the Bayesian estimator based on the weighted-Euclidean
%  distortion measure [1, Eq. 18].
%
%  Usage:  stsa_weuclid(noisyFile, outputFile, p)
%
%         noisyFile  - noisy speech file in .wav format
%         outputFile - enhanced output file in .wav format
%         p          - power exponent used in the weighted-Euclidean measure.
%                      Valid values for p:  p>-2
%
%  The noise spectrum is initialised from the first 6 non-overlapping
%  frames of the input (assumed noise/silence) and is then updated in
%  every frame that the VAD statistic classifies as noise only.
%
%  Example call:  stsa_weuclid('sp04_babble_sn10.wav','out_weuclid.wav',-1);
%
%  References:
%   [1] Loizou, P. (2005). Speech enhancement based on perceptually motivated
%       Bayesian estimators of the speech magnitude spectrum. IEEE Trans. on Speech
%       and Audio Processing, 13(5), 857-869.
%
% Author: Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
%-------------------------------------------------------------------------

if nargin<3
    fprintf('Usage: stsa_weuclid(infile.wav,outfile.wav,p) \n');
    fprintf(' where p>-2 \n\n');
    return;
end

% p=-2 has to be rejected as well: gamma(p/2+1) is gamma(0)=Inf there,
% which makes CC (and therefore the gain hw) zero for every bin.
if p<=-2
    error('ERROR! p needs to be larger than -2.');
end

% audioread/audiowrite replace the removed wavread/wavwrite
[x, Srate]= audioread( filename);


% =============== Initialize variables ===============

len=floor(20*Srate/1000);            % Frame size in samples (20 ms)
if rem(len,2)==1, len=len+1; end     % force an even frame length
PERC=50;                             % window overlap in percent of frame size
len1=floor(len*PERC/100);            % overlapping part of a frame
len2=len-len1;                       % hop size

win=hamming(len);  %tukey(len,PERC);  % define window


% Noise magnitude calculations - assuming that the first 6 frames is noise/silence
%
nFFT=2*len;
nNoiseFrames=6;
noise_mean=zeros(nFFT,1);
j=1;
for k=1:nNoiseFrames
    noise_mean=noise_mean+abs(fft(win.*x(j:j+len-1),nFFT));
    j=j+len;
end
noise_mu=noise_mean/nNoiseFrames;
noise_mu2=noise_mu.^2;

%--- allocate memory and initialize various variables

x_old=zeros(len1,1);
Nframes=floor(length(x)/len2)-1;
xfinal=zeros(Nframes*len2,1);

%===============================  Start Processing =======================================================
%
k=1;
aa=0.98;                 % smoothing factor of the a priori SNR update
mu=0.98;                 % smoothing factor of the noise spectrum update
eta=0.15;                % VAD threshold

%p=-1;
CC=gamma((p+3)/2)/gamma(p/2+1);   % p-dependent constant of the gain
ksi_min=10^(-25/10);              % lower limit on ksi (-25 dB)

for n=1:Nframes

    insign=win.*x(k:k+len-1);

    %--- Take fourier transform of  frame

    spec=fft(insign,nFFT);
    sig=abs(spec);          % compute the magnitude
    sig2=sig.^2;

    gammak=min(sig2./noise_mu2,40);  % post SNR
    if n==1
        ksi=aa+(1-aa)*max(gammak-1,0);
    else
        ksi=aa*Xk_prev./noise_mu2 + (1-aa)*max(gammak-1,0);     % a priori SNR
        ksi=max(ksi_min,ksi);  % limit ksi to -25 dB
    end

    % ===voice activity detection===
    log_sigma_k= gammak.* ksi./ (1+ ksi)- log(1+ ksi);
    vad_decision= sum( log_sigma_k)/ len;
    if (vad_decision< eta)
        % noise only frame found
        noise_mu2= mu* noise_mu2+ (1- mu)* sig2;
    end
    % ===end of vad===

    vk=ksi.*gammak./(1+ksi);

    %----- weighted Euclidean distance ------------------------
    if p==-1
        hw=CC*sqrt(vk)./(gammak.*exp(-vk/2).*besseli(0,vk/2));  % if p=-1 use this equation as it's faster
    else
        numer=CC*sqrt(vk).*confhyperg(-(p+1)/2,1,-vk,100);
        denom=gammak.*confhyperg(-p/2,1,-vk,100);
        hw=numer./denom;
    end
    %

    sig=sig.*hw;
    Xk_prev=sig.^2;         % enhanced power spectrum, used for the next frame's ksi

    xi_w= ifft( hw .* spec, nFFT);
    xi_w= real( xi_w);

    % --- Overlap and add ---------------
    %
    xfinal(k:k+ len2-1)= x_old+ xi_w(1:len1);
    x_old= xi_w(len1+ 1: len);

    k=k+len2;
end
%========================================================================================

audiowrite(outfile,xfinal,Srate,'BitsPerSample',16);
|
||||
|
@ -0,0 +1,169 @@
|
||||
function stsa_wlr(filename,outfile)

%
%  Implements the Bayesian estimator based on the weighted likelihood ratio
%  distortion measure [1, Eq. 37].
%
%  Usage:  stsa_wlr(noisyFile, outputFile)
%
%         noisyFile  - noisy speech file in .wav format
%         outputFile - enhanced output file in .wav format
%
%  The noise spectrum is estimated from the first 5 non-overlapping frames
%  of the input, which are assumed to contain noise/silence only.
%
%  Example call:  stsa_wlr('sp04_babble_sn10.wav','out_wlr.wav');
%
%  References:
%   [1] Loizou, P. (2005). Speech enhancement based on perceptually motivated
%       Bayesian estimators of the speech magnitude spectrum. IEEE Trans. on Speech
%       and Audio Processing, 13(5), 857-869.
%
% Author: Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
%-------------------------------------------------------------------------

if nargin<2
    fprintf('Usage: stsa_wlr inFile outFile.wav \n\n');
    return;
end

% audioread/audiowrite replace the removed wavread/wavwrite
[x, Srate]= audioread( filename);


% =============== Initialize variables ===============
%
len=floor(20*Srate/1000);            % Frame size in samples (20 ms)
if rem(len,2)==1, len=len+1; end     % force an even frame length
PERC=50;                             % window overlap in percent of frame size
len1=floor(len*PERC/100);            % overlapping part of a frame
len2=len-len1;                       % hop size

win=hanning(len);  %tukey(len,PERC);  % define window


% Noise magnitude calculations - assuming that the first 5 frames is noise/silence
%
nFFT=len;
nNoiseFrames=5;
noise_mean=zeros(nFFT,1);
j=1;
for k=1:nNoiseFrames
    noise_mean=noise_mean+abs(fft(win.*x(j:j+len-1),nFFT));
    j=j+len;
end
noise_mu=noise_mean/nNoiseFrames;
noise_mu2=noise_mu.^2;

%--- allocate memory and initialize various variables

img=sqrt(-1);
x_old=zeros(len1,1);
Nframes=floor(length(x)/len2)-1;
xfinal=zeros(Nframes*len2,1);
xinterv=0.001:0.01:10;       % search grid handed to solve_wlr
k=1;
aa=0.98;                     % smoothing factor of the a priori SNR update

%===============================  Start Processing =======================================================
%
fprintf('This might take some time ...\n')
for n=1:Nframes

    insign=win.*x(k:k+len-1);

    %--- Take fourier transform of  frame

    spec=fft(insign,nFFT);
    sig=abs(spec);          % compute the magnitude
    sig2=sig.^2;

    gammak=min(sig2./noise_mu2,40);  % post SNR. Limit it to avoid overflows

    if n==1
        ksi=aa+(1-aa)*max(gammak-1,0);
    else
        ksi=aa*Xk_prev./noise_mu2 + (1-aa)*max(gammak-1,0);     % a priori SNR
    end

    vk=ksi.*gammak./(1+ksi);

    sig_hat=solve_wlr(vk,gammak,sig,xinterv);  % solves Eq. 37 in [1]
    Xk_prev=sig_hat.^2;     % enhanced power spectrum, used for the next frame's ksi

    % combine the estimated magnitude with the noisy phase
    xi_w= ifft( sig_hat.* exp(img*angle(spec)));
    xi_w= real( xi_w);

    % --- Overlap and add ---------------
    %
    xfinal(k:k+ len2-1)= x_old+ xi_w(1:len1);
    x_old= xi_w(len1+ 1: len);

    if rem(n,20)==0, fprintf('Frame: %d Percent completed:%4.2f \n',n,n*100/Nframes); end

    k=k+len2;
end
%========================================================================================

audiowrite(outfile,xfinal,Srate,'BitsPerSample',16);
|
||||
|
||||
|
||||
%==========================================================================
|
||||
function x=solve_wlr(vk,gammak,Yk,xx)

% solves non-linear Eq. 37 in [1]
%
% For each of the first Len/2+1 bins the root of
%       log(x) + a - b/x = 0
% is located with fzero, where a=Elogx(n) and b=Ex(n) are computed below
% from vk, gammak and Yk.  The remaining bins are filled in by mirroring.
%
%   vk, gammak, Yk - column vectors of equal (even) length
%   xx             - grid of candidate x values used to bracket the root
%   x              - column vector with one solution per bin
%
% Bins for which no root can be bracketed reuse the previous bin's value
% (or the spectral floor for the first bin).

SPEC_FLOOR=0.001;        % spectral floor
TOX_DEFAULT=200;         % upper bracket used when the grid shows no sign change

Len=length(vk);
L2=Len/2+1;

lk05=sqrt(vk).*Yk./gammak;
Ex=gamma(1.5)*lk05.*confhyperg(-0.5,1,-vk,100);
Elogx=1-0.5*(2*log(lk05)+log(vk)+expint(vk));

x=zeros(Len,1);

for n=1:L2

    a=Elogx(n);
    b=Ex(n);

    % An anonymous function keeps a and b at full precision, so fzero
    % solves the same function that is sampled on the grid just below
    % (inline(sprintf('%f',...)) rounded both to 6 decimals).
    f=@(z) log(z)+a-b./z;
    y=f(xx);

    if y(1)<0
        % root lies to the right of xx(1): bracket it with the first
        % grid point at which the function turns positive
        bet=xx(1)/2;
        ind=find(y>0,1);
        if isempty(ind)
            tox=TOX_DEFAULT;      % no sign change on the grid
        else
            tox=xx(ind);
        end

        flag=-1;
        if f(bet)<0 && f(tox)>0   % fzero needs a sign change on the bracket
            [x(n),~,flag]=fzero(f,[bet tox]);
        end
        if flag<0
            % no usable root: reuse the previous bin (bin 1 has none)
            if n>1
                x(n)=x(n-1);
            else
                x(n)=SPEC_FLOOR;
            end
        end
    else
        ind=find(y<0,1);
        if ~isempty(ind)
            x(n)=fzero(f,[xx(1) xx(ind)]);
        else
            x(n)=SPEC_FLOOR;      % spectral floor
        end
    end

end

% mirror bins 2..L2-1 to rebuild the full-length vector
x(L2+1:Len)=flipud(x(2:L2-1));
|
||||
|
@ -0,0 +1,126 @@
|
||||
function wiener_as(filename,outfile)

%
%  Implements the Wiener filtering algorithm based on a priori SNR estimation [1].
%
%  Usage:  wiener_as(noisyFile, outputFile)
%
%         noisyFile  - noisy speech file in .wav format
%         outputFile - enhanced output file in .wav format
%
%  Only the first channel of the input is processed.  The first 120 ms of
%  the input are assumed to be noise only and are used for the initial
%  noise power spectrum estimate.
%
%  Example call:  wiener_as('sp04_babble_sn10.wav','out_wien_as.wav');
%
%  References:
%   [1] Scalart, P. and Filho, J. (1996). Speech enhancement based on a priori
%       signal to noise estimation. Proc. IEEE Int. Conf. Acoust. , Speech, Signal
%       Processing, 629-632.
%
% Authors: Yi Hu and Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
%-------------------------------------------------------------------------

if nargin<2
    fprintf('Usage: wiener_as(noisyfile.wav,outFile.wav) \n\n');
    return;
end

[noisy_speech, fs]= audioread( filename);
% column vector noisy_speech (the linear indexing below only ever reached
% the first channel, so select it explicitly)
noisy_speech= noisy_speech( :, 1);

% set parameter values
mu= 0.98; % smoothing factor in noise spectrum update
a_dd= 0.98; % smoothing factor in priori update
eta= 0.15; % VAD threshold
frame_dur= 20; % frame duration (ms)

% L is frame length (160 for 8k sampling rate).  Floored and forced even,
% like the other enhancement routines, so that L and the hop L/2 are
% integers for any sampling rate.
L= floor( frame_dur* fs/ 1000);
if rem( L, 2)== 1, L= L+ 1; end
hop= L/ 2; % 50% overlap
hamming_win= hamming( L); % hamming window
U= ( hamming_win'* hamming_win)/ L; % normalization factor

% first 120 ms is noise only
len_120ms= round( 120* fs/ 1000);
if length( noisy_speech)< max( len_120ms, L)
    error('wiener_as: input must contain at least 120 ms of audio.');
end
first_120ms= noisy_speech( 1: len_120ms);

% =============now use Welch's method to estimate power spectrum with
% Hamming window and 50% overlap
nsubframes= floor( len_120ms/ hop)- 1;  % 50% overlap
noise_ps= zeros( L, 1);
n_start= 1;
for j= 1: nsubframes
    noise= first_120ms( n_start: n_start+ L- 1).* hamming_win;
    noise_fft= fft( noise, L);
    noise_ps= noise_ps+ ( abs( noise_fft).^ 2)/ (L* U);
    n_start= n_start+ hop;
end
noise_ps= noise_ps/ nsubframes;
%==============

% number of noisy speech frames
nframes= floor( length( noisy_speech)/ hop)- 1;

% preallocate the output as a column (samples-by-channels for audiowrite)
enhanced_speech= zeros( (nframes+ 1)* hop, 1);
overlap= zeros( hop, 1); % nothing to add to the first half frame
n_start= 1;

for j= 1: nframes
    noisy= noisy_speech( n_start: n_start+ L- 1).* hamming_win;
    noisy_fft= fft( noisy, L);
    noisy_ps= ( abs( noisy_fft).^ 2)/ (L* U);

    % ============ voice activity detection
    posteri= noisy_ps./ noise_ps;
    posteri_prime= posteri- 1;
    posteri_prime( posteri_prime< 0)= 0;
    if (j== 1) % initialize priori
        priori= a_dd+ (1-a_dd)* posteri_prime;
    else
        priori= a_dd* (G_prev.^ 2).* posteri_prev+ ...
            (1-a_dd)* posteri_prime;
    end

    log_sigma_k= posteri.* priori./ (1+ priori)- log(1+ priori);
    vad_decision= sum( log_sigma_k)/ L;
    if (vad_decision< eta)
        % noise only frame found
        noise_ps= mu* noise_ps+ (1- mu)* noisy_ps;
    end
    % ===end of vad===

    G= sqrt( priori./ (1+ priori)); % gain function

    % real() discards any rounding-level imaginary residue, as the other
    % enhancement routines do after their ifft
    enhanced= real( ifft( noisy_fft.* G, L));

    % --- overlap and add ---
    enhanced_speech( n_start: n_start+ hop- 1)= ...
        overlap+ enhanced( 1: hop);
    overlap= enhanced( hop+ 1: L);
    n_start= n_start+ hop;

    G_prev= G;
    posteri_prev= posteri;

end

% flush the second half of the last frame
enhanced_speech( n_start: n_start+ hop- 1)= overlap;

audiowrite(outfile,enhanced_speech,fs,'BitsPerSample',16);
|
||||
|
||||
|
@ -6,9 +6,9 @@
|
||||
\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces The $N$-point DFT of $x[n]$, where $N=8$\relax }}{1}{}\protected@file@percent }
|
||||
\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
|
||||
\newlabel{fig:N_point_DFT}{{1}{1}}
|
||||
\@writefile{toc}{\contentsline {subsection}{\numberline {1.2}The Z-Transform}{1}{}\protected@file@percent }
|
||||
\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces The 9-point DFT of $x[n]$, where $N=8$\relax }}{2}{}\protected@file@percent }
|
||||
\newlabel{fig:9_point_DFT}{{2}{2}}
|
||||
\@writefile{toc}{\contentsline {subsection}{\numberline {1.2}The Z-Transform}{2}{}\protected@file@percent }
|
||||
\@writefile{toc}{\contentsline {subsection}{\numberline {1.3}The Inverse Z-Transform}{2}{}\protected@file@percent }
|
||||
\@writefile{toc}{\contentsline {section}{\numberline {2}Conclusions}{3}{}\protected@file@percent }
|
||||
\gdef \@abspage@last{4}
|
||||
|
@ -1,4 +1,4 @@
|
||||
This is pdfTeX, Version 3.141592653-2.6-1.40.24 (TeX Live 2022/CVE-2023-32700 patched) (preloaded format=pdflatex 2024.3.9) 17 APR 2024 21:12
|
||||
This is pdfTeX, Version 3.141592653-2.6-1.40.24 (TeX Live 2022/CVE-2023-32700 patched) (preloaded format=pdflatex 2024.3.9) 25 APR 2024 09:37
|
||||
entering extended mode
|
||||
restricted \write18 enabled.
|
||||
%&-line parsing enabled.
|
||||
@ -318,37 +318,34 @@ File: Q9_point_DFT.png Graphic file (type png)
|
||||
<use Q9_point_DFT.png>
|
||||
Package pdftex.def Info: Q9_point_DFT.png used on input line 61.
|
||||
(pdftex.def) Requested size: 234.8775pt x 176.15768pt.
|
||||
|
||||
LaTeX Warning: `h' float specifier changed to `ht'.
|
||||
|
||||
|
||||
[1 <./N8_point_dft.png>]
|
||||
Underfull \hbox (badness 10000) in paragraph at lines 71--76
|
||||
|
||||
[]
|
||||
|
||||
[1 <./N8_point_dft.png>] [2 <./Q9_point_dft.png>] [3] (./lab-4.aux) )
|
||||
[2 <./Q9_point_dft.png>] [3] (./lab-4.aux) )
|
||||
Here is how much of TeX's memory you used:
|
||||
5569 strings out of 476182
|
||||
90423 string characters out of 5796582
|
||||
1859793 words of memory out of 6000000
|
||||
25843 multiletter control sequences out of 15000+600000
|
||||
5570 strings out of 476182
|
||||
90433 string characters out of 5796582
|
||||
1858793 words of memory out of 6000000
|
||||
25844 multiletter control sequences out of 15000+600000
|
||||
520010 words of font info for 63 fonts, out of 8000000 for 9000
|
||||
1137 hyphenation exceptions out of 8191
|
||||
55i,8n,63p,490b,340s stack positions out of 10000i,1000n,20000p,200000b,200000s
|
||||
</usr/shar
|
||||
e/texlive/texmf-dist/fonts/type1/public/amsfonts/cm/cmbx12.pfb></usr/share/texl
|
||||
ive/texmf-dist/fonts/type1/public/amsfonts/cm/cmex10.pfb></usr/share/texlive/te
|
||||
xmf-dist/fonts/type1/public/amsfonts/cm/cmmi10.pfb></usr/share/texlive/texmf-di
|
||||
st/fonts/type1/public/amsfonts/cm/cmmi5.pfb></usr/share/texlive/texmf-dist/font
|
||||
s/type1/public/amsfonts/cm/cmmi7.pfb></usr/share/texlive/texmf-dist/fonts/type1
|
||||
/public/amsfonts/cm/cmr10.pfb></usr/share/texlive/texmf-dist/fonts/type1/public
|
||||
/amsfonts/cm/cmr12.pfb></usr/share/texlive/texmf-dist/fonts/type1/public/amsfon
|
||||
ts/cm/cmr17.pfb></usr/share/texlive/texmf-dist/fonts/type1/public/amsfonts/cm/c
|
||||
mr5.pfb></usr/share/texlive/texmf-dist/fonts/type1/public/amsfonts/cm/cmr7.pfb>
|
||||
</usr/share/texlive/texmf-dist/fonts/type1/public/amsfonts/cm/cmsy10.pfb></usr/
|
||||
share/texlive/texmf-dist/fonts/type1/public/amsfonts/cm/cmsy7.pfb></usr/share/t
|
||||
exlive/texmf-dist/fonts/type1/public/amsfonts/cm/cmtt10.pfb>
|
||||
Output written on lab-4.pdf (4 pages, 162298 bytes).
|
||||
55i,8n,63p,490b,332s stack positions out of 10000i,1000n,20000p,200000b,200000s
|
||||
</usr/share/texlive/texmf-dist/font
|
||||
s/type1/public/amsfonts/cm/cmbx12.pfb></usr/share/texlive/texmf-dist/fonts/type
|
||||
1/public/amsfonts/cm/cmex10.pfb></usr/share/texlive/texmf-dist/fonts/type1/publ
|
||||
ic/amsfonts/cm/cmmi10.pfb></usr/share/texlive/texmf-dist/fonts/type1/public/ams
|
||||
fonts/cm/cmmi5.pfb></usr/share/texlive/texmf-dist/fonts/type1/public/amsfonts/c
|
||||
m/cmmi7.pfb></usr/share/texlive/texmf-dist/fonts/type1/public/amsfonts/cm/cmr10
|
||||
.pfb></usr/share/texlive/texmf-dist/fonts/type1/public/amsfonts/cm/cmr12.pfb></
|
||||
usr/share/texlive/texmf-dist/fonts/type1/public/amsfonts/cm/cmr17.pfb></usr/sha
|
||||
re/texlive/texmf-dist/fonts/type1/public/amsfonts/cm/cmr5.pfb></usr/share/texli
|
||||
ve/texmf-dist/fonts/type1/public/amsfonts/cm/cmr7.pfb></usr/share/texlive/texmf
|
||||
-dist/fonts/type1/public/amsfonts/cm/cmsy10.pfb></usr/share/texlive/texmf-dist/
|
||||
fonts/type1/public/amsfonts/cm/cmsy7.pfb></usr/share/texlive/texmf-dist/fonts/t
|
||||
ype1/public/amsfonts/cm/cmtt10.pfb>
|
||||
Output written on lab-4.pdf (4 pages, 162398 bytes).
|
||||
PDF statistics:
|
||||
87 PDF objects out of 1000 (max. 8388607)
|
||||
50 compressed objects within 1 object stream
|
||||
|
Binary file not shown.
@ -49,14 +49,14 @@ where $r$ is the common ratio between adjacent terms. For the $N$-point DFT of $
|
||||
\label{eqn:DFT_N_point}
|
||||
\end{equation}
|
||||
The $N$-point DFT of $x[n]$, where $N=8$ is seen in figure \ref{fig:N_point_DFT}. It only has a non-zero value for $k={N\over2}=4$. This is the case for all even-number-point DFTs. Therefore, only odd-number-point DFTs should be used.
|
||||
\begin{figure}[h]
|
||||
\begin{figure}[H]
|
||||
\center
|
||||
\includegraphics[width=0.5\textwidth]{N8_point_DFT.png}
|
||||
\caption{The $N$-point DFT of $x[n]$, where $N=8$}
|
||||
\label{fig:N_point_DFT}
|
||||
\end{figure}
|
||||
For example, the 9-point DFT of $x[n]$, where $N=8$ is seen in figure \ref{fig:9_point_DFT}. While equation \ref{eqn:DFT_N_point} cannot be used because there are a different number of samples for the DFT and the input signal, the overall DFT is more useful than the 8-point DFT.
|
||||
\begin{figure}[h]
|
||||
\begin{figure}[H]
|
||||
\center
|
||||
\includegraphics[width=0.5\textwidth]{Q9_point_DFT.png}
|
||||
\caption{The 9-point DFT of $x[n]$, where $N=8$}
|
||||
|
Binary file not shown.
@ -0,0 +1,45 @@
|
||||
*--- SIMULATE FILE
|
||||
|
||||
*---SIMULATION PARAMETERS
|
||||
.PARAM:
|
||||
+ FS=100K ;SYSTEM SWITCHING FREQUENCY
|
||||
+ TS={1/FS}
|
||||
+ W={2*PI*FS}
|
||||
+ CYCLE=3 ;SIMULATED CYCLES
|
||||
+ START={500*TS}
|
||||
+ END={START+CYCLE*TS}
|
||||
+ STEP={TS/1000}
|
||||
|
||||
.TRAN {STEP} {END} {START} {STEP} UIC
|
||||
|
||||
*---CIRCUIT PARAMETERS
|
||||
.PARAM:
|
||||
+ VIN = 100
|
||||
+ L1 = 50U
|
||||
+ C1 = 20U
|
||||
+ RL = 10
|
||||
+ D = 0.5
|
||||
+ TON = D*TS
|
||||
|
||||
*--- DC POWER SUPPLY
|
||||
VIN IN 0 {VIN}
|
||||
|
||||
*--- CIRCUIT DESCRIPTION
|
||||
S1 IN S1OUT GP 0 MYSWITCH
|
||||
D1 0 S1OUT MYDIODE
|
||||
|
||||
L1 S1OUT L1OUT {L1} IC=0
|
||||
C1 L1OUT 0 {C1} IC=0
|
||||
|
||||
*--- LOAD RESISTANCE
|
||||
RL L1OUT 0 {RL}
|
||||
|
||||
*--- CONTROL SIGNAL FOR THE SWITCH
|
||||
VGP GP 0 PULSE(0 10 0 0.1U 0.1U {TON-0.1U} {TS})
|
||||
RGP GP 0 100K
|
||||
|
||||
*--- MEASURE AVERAGE OUTPUT VOLTAGE
|
||||
.MEAS TRAN VOUT AVG V(L1OUT)
|
||||
|
||||
.MODEL MYDIODE D(RON=0.1M ROFF=100MEG VFWD=0.1M)
|
||||
.MODEL MYSWITCH SW(RON=0.1M ROFF=100MEG VT=3)
|
@ -0,0 +1,25 @@
|
||||
Circuit: *--- SIMULATE FILE
|
||||
|
||||
Per .tran options, skipping operating point for transient analysis.
|
||||
|
||||
vout: AVG(v(l1out))=50.3944 FROM 0 TO 3e-05
|
||||
|
||||
|
||||
Date: Tue Apr 23 17:05:31 2024
|
||||
Total elapsed time: 2.271 seconds.
|
||||
|
||||
tnom = 27
|
||||
temp = 27
|
||||
method = modified trap
|
||||
totiter = 1032241
|
||||
traniter = 1032241
|
||||
tranpoints = 514891
|
||||
accept = 512502
|
||||
rejected = 2389
|
||||
matrix size = 6
|
||||
fillins = 0
|
||||
solver = Normal
|
||||
Avg thread counts: 1.0/1.0/1.0/1.0
|
||||
Matrix Compiler1: 146 bytes object code size 0.2/0.2/[0.2]
|
||||
Matrix Compiler2: 346 bytes object code size 0.2/0.3/[0.2]
|
||||
|
BIN
6th-Semester-Spring-2024/Power-Electronics/Simulations/BUCK.raw
Normal file
BIN
6th-Semester-Spring-2024/Power-Electronics/Simulations/BUCK.raw
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
After Width: | Height: | Size: 60 KiB |
@ -0,0 +1,13 @@
|
||||
# Ziegler-Nichols controller gains computed from Tu and Ku.
# NOTE(review): Tu and Ku are presumably the measured ultimate (oscillation)
# period and ultimate gain of the loop - confirm against the lab data.
Tu = 0.2
Ku = 0.022


def _zn_gains(kp_coeff, ki_coeff, kd_coeff):
    """Scale one row of rule coefficients by Ku and Tu; returns (K_p, K_i, K_d)."""
    return kp_coeff*Ku, ki_coeff*Ku/Tu, kd_coeff*Ku*Tu


# Rule used for the "PID Tune" line printed below
Kp_pid, Ki_pid, Kd_pid = _zn_gains(0.6, 1.2, 0.075)

# Rule used for the "No Overshoot Tune" line printed below
Kp_no_os, Ki_no_os, Kd_no_os = _zn_gains(0.2, 0.4, 0.066)

print(f"Ziegler Nichols PID Tune: K_p={Kp_pid}, K_i={Ki_pid}, K_d={Kd_pid}")
print(f"Ziegler Nichols No Overshoot Tune: K_p={Kp_no_os}, K_i={Ki_no_os}, K_d={Kd_no_os}")
|
Binary file not shown.
After Width: | Height: | Size: 22 KiB |
Binary file not shown.
After Width: | Height: | Size: 23 KiB |
Binary file not shown.
After Width: | Height: | Size: 24 KiB |
Binary file not shown.
After Width: | Height: | Size: 21 KiB |
Binary file not shown.
After Width: | Height: | Size: 19 KiB |
Loading…
Reference in New Issue
Block a user