function [pesq_mos]= pesq(ref_wav, deg_wav)
% ----------------------------------------------------------------------
%          PESQ objective speech quality measure
%
%   This function implements the PESQ measure based on the ITU standard
%   P.862 [1].
%
%   Usage:  pval=pesq(cleanFile.wav, enhancedFile.wav)
%
%         cleanFile.wav - clean input file in .wav format
%         enhancedFile  - enhanced output file in .wav format
%         pval          - PESQ value
%
%   Both files must be single-channel and must share the same sampling
%   rate. Note that the PESQ routine only supports sampling rates of
%   8 kHz and 16 kHz [1].
%
%   When called with fewer than two arguments the function prints a usage
%   line and returns without assigning pesq_mos.
%
%   Errors are raised when the reference sampling rate is neither 8000
%   nor 16000 Hz, when the two files have different sampling rates, or
%   when either file has more than one channel.
%
%  Example call:  pval = pesq ('sp04.wav','enhanced.wav')
%
%
%  References:
%   [1] ITU (2000). Perceptual evaluation of speech quality (PESQ), an
%       objective method for end-to-end speech quality assessment of
%       narrowband telephone networks and speech codecs. ITU-T
%       Recommendation P. 862
%
%   Authors: Yi Hu and Philipos C. Loizou
%
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
% ----------------------------------------------------------------------

if nargin<2
    fprintf('Usage: [pesq_mos]=pesq(cleanfile.wav,enhanced.wav) \n');
    return;
end

% Shared state used by this function and the helper routines it calls.
% NOTE(review): these are expected to be initialised by setup_global -
% confirm against its definition.
global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs WHOLE_SIGNAL
global Align_Nfft Window

% ---- read and validate both inputs before touching any global state ----
[ref_data, sampling_rate]= audioread( ref_wav);
if sampling_rate~=8000 && sampling_rate~=16000
    error('Sampling frequency needs to be either 8000 or 16000 Hz');
end

[deg_data, deg_sampling_rate]= audioread( deg_wav);
if deg_sampling_rate~= sampling_rate
    % Previously the degraded file's rate was ignored, so a mismatched
    % pair was processed silently and produced a meaningless score.
    error('Reference (%d Hz) and degraded (%d Hz) files must have the same sampling frequency', ...
        sampling_rate, deg_sampling_rate);
end

% audioread returns one column per channel; the padding below builds row
% vectors and therefore only works for single-channel data.
if size( ref_data, 2)~= 1 || size( deg_data, 2)~= 1
    error('Reference and degraded files must be single-channel (mono)');
end

setup_global( sampling_rate);

% ---- alignment window: periodic Hann window of length Align_Nfft ----
% The literal constant is kept (rather than 2*pi) so results stay
% bit-identical with the earlier implementation.
TWOPI= 6.28318530717959;
count= 0: Align_Nfft- 1;
Window= 0.5 * (1.0 - cos((TWOPI * count) / Align_Nfft));

% ---- scale to 16-bit range and zero-pad both signals ----
% audioread yields normalised samples; multiply by 32768 and store as a
% row vector. The *_Nsamples counts include the leading and trailing
% search buffers but not the extra DATAPADDING_MSECS tail.
search_pad= SEARCHBUFFER* Downsample;
tail_pad= DATAPADDING_MSECS* (Fs/ 1000)+ search_pad;

ref_data= ref_data'* 32768;
ref_Nsamples= length( ref_data)+ 2* search_pad;
ref_data= [zeros( 1, search_pad), ref_data, zeros( 1, tail_pad)];

deg_data= deg_data'* 32768;
deg_Nsamples= length( deg_data)+ 2* search_pad;
deg_data= [zeros( 1, search_pad), deg_data, zeros( 1, tail_pad)];

maxNsamples= max( ref_Nsamples, deg_Nsamples);

% ---- level adjustment ----
ref_data= fix_power_level( ref_data, ref_Nsamples, maxNsamples);
deg_data= fix_power_level( deg_data, deg_Nsamples, maxNsamples);

% ---- filter table passed to apply_filter: [frequency, gain in dB] ----
% NOTE(review): column meaning inferred from the variable name - confirm.
standard_IRS_filter_dB= [0, -200; 50, -40; 100, -20; 125, -12; 160, -6; 200, 0;...
    250, 4; 300, 6; 350, 8; 400, 10; 500, 11; 600, 12; 700, 12; 800, 12;...
    1000, 12; 1300, 12; 1600, 12; 2000, 12; 2500, 12; 3000, 12; 3250, 12;...
    3500, 4; 4000, -200; 5000, -200; 6300, -200; 8000, -200];

ref_data= apply_filter( ref_data, ref_Nsamples, standard_IRS_filter_dB);
deg_data= apply_filter( deg_data, deg_Nsamples, standard_IRS_filter_dB);

% Keep the filtered signals for later use in the psychoacoustical model;
% the copies processed below are only used for alignment.
model_ref= ref_data;
model_deg= deg_data;

% ---- alignment stage ----
[ref_data, deg_data]= input_filter( ref_data, ref_Nsamples, deg_data, ...
    deg_Nsamples);

[ref_VAD, ref_logVAD]= apply_VAD( ref_data, ref_Nsamples);
[deg_VAD, deg_logVAD]= apply_VAD( deg_data, deg_Nsamples);

% Both routines are called without output arguments.
% NOTE(review): presumably they publish their results through global
% variables - confirm against their definitions.
crude_align (ref_logVAD, ref_Nsamples, deg_logVAD, deg_Nsamples,...
    WHOLE_SIGNAL);

utterance_locate (ref_data, ref_Nsamples, ref_VAD, ref_logVAD,...
    deg_data, deg_Nsamples, deg_VAD, deg_logVAD);

% ---- restore the signals saved for the psychoacoustical model ----
ref_data= model_ref;
deg_data= model_deg;

% Make ref_data and deg_data equal length: assigning to index newlen
% zero-extends the shorter vector up to that length.
if (ref_Nsamples< deg_Nsamples)
    newlen= deg_Nsamples+ DATAPADDING_MSECS* (Fs/ 1000);
    ref_data( newlen)= 0;
elseif (ref_Nsamples> deg_Nsamples)
    newlen= ref_Nsamples+ DATAPADDING_MSECS* (Fs/ 1000);
    deg_data( newlen)= 0;
end

pesq_mos= pesq_psychoacoustic_model (ref_data, ref_Nsamples, deg_data, ...
    deg_Nsamples );