HTK計算mfcc/filter_bank源碼解析 HTK可以用簡單的 求取mfcc或者filter_bank 關於mfcc的原理在 http://my.oschina.net/jamesju/blog/193343 中有講解,這裡主要說HTK具體是如何用C實現的,因為HTK自身的龐大,文件嵌套不少 ...
HTK計算mfcc/filter_bank源碼解析
HTK可以用簡單的
HCopy -C config -s scp
求取mfcc或者filter_bank
關於mfcc的原理在
中有講解,這裡主要說HTK具體是如何用C實現的,因為HTK自身的龐大,文件嵌套不少,所以我提取出了求取filter_bank的源碼並重寫了,可以直接運行。
讀入數據、分幀
首先定義三個結構體:
typedef struct Wave
{
int nSample;//wav中樣本個數
int frSize;//一幀中的樣本數
int frIdx;//當前幀位置
int frRate;//幀移
float *wavdata;
int nRow;
float *Rdata;
};
typedef struct FBankInfo
{
int frameSize; /* speech frameSize */
int numChans; /* number of channels */
long sampPeriod; /* sample period */
int fftN; /* fft size */
int klo, khi; /* lopass to hipass cut-off fft indices */
int usePower; /* use power rather than magnitude *///boolen
int takeLogs; /* log filterbank channels *///boolen
float fres; /* scaled fft resolution */
float *cf; /* array[1..pOrder+1] of centre freqs */
float *loChan; /* array[1..fftN/2] of loChan index */
float *loWt; /* array[1..fftN/2] of loChan weighting */
float *x; /* array[1..fftN] of fftchans */
};
typedef struct IOConfig
{
float curVol;/* current volume dB (0.0-100.0) */
float preEmph;
int frSize;//一幀中的樣本數
int frIdx;//當前幀位置
int frRate;//幀移
float *fbank;
struct FBankInfo fbInfo;
float *s;//幀數據
};
之後讀入數據,這裡需要WAV頭文件的知識,每個採樣點讀入的
就是兩個位元組的音頻信息,如果matlab打開wav文件得到的採樣點數值會
很小,因為是除以採樣數之後的值,存入w->wavdata中,對應函數是
void LoadFile(char *s, struct Wave *w)
下一部是分幀,預設值為
w->frIdx = 0; w->frRate = 160; w->frSize = 400;
/*採樣率為25ms,則幀長為400,預設幀移為160*/
幀信息存入cf中,幀的具體數值存入cf->s中
for (int k = 0; k < w->nRow + 1; k++)
{
/*分幀,計算幀能量*/
cf->s[0] = w->frSize;
GetWave(cf->s + 1, w);
int j, m, e, x;
for (j = 1, m = e = 0.0; j <= w->frSize; j++) {
x = (int)cf->s[j];
m += x; e += x*x;
}
m = m / w->frSize; e = e / w->frSize - m*m;
if (e>0.0) e = 10.0*log10(e / 0.32768);
else e = 0.0;
cf->curVol = e;
/*處理*/
ConvertFrame(cf, w);
linkdata(cf, w, k);
}
處理幀數據
在
void ConvertFrame(struct IOConfig *cf, struct Wave *w)
中處理幀數據
void ConvertFrame(struct IOConfig *cf, struct Wave *w)
{
float re, rawte = 0.0, te, *p, cepScale = 1.0;
int i, bsize = 0;
char buf[50];
int rawE;//boolen
cf->frIdx = w->frIdx; cf->frSize = w->frSize; cf->frRate = w->frRate;
cf->preEmph = 0.97;
ZeroMeanFrame(cf->s);//零均值處理
PreEmphasise(cf->s, cf->preEmph);//預加重處理
Ham(cf->s);//加窗 此處可以優化,創建ham窗多次十分耗費時間,可以用全局申請ham的記憶體
cf->fbInfo = InitFBank(cf);
cf->fbank = (float*)malloc(sizeof(float)* NUMCHANS);
cf->fbank[0] = NUMCHANS;
Wave2FBank(cf->s, cf->fbank, cf->fbInfo);//提取NUMCHANS為40的fbank
}
可以看到,先後是零均值處理、預加重處理、加窗處理、最後是fbank的參數處理,其中包含了FFT等,最後將fbank數據存入cf->fbank中
很重要的是在計算結束之後,有一個
void zeromean(struct Wave *w)
還要進行一次零均值!
所有代碼如下:
頭文件:filterbank.h
#ifndef __FILTERFBANK_H__
#define __FILTERFBANK_H__
#define INPUT_DIMEN 1640
#define LAYER_DIMEN 128
#define OUTPUT_DIMEN 3
#define LAYER 4
#define PI 3.14159265358979
#define TPI 6.28318530717959 /* PI*2 */
#define NUMCHANS 40
typedef struct Wave
{
int nSample;//wav中樣本個數
int frSize;//一幀中的樣本數
int frIdx;//當前幀位置
int frRate;//幀移
float *wavdata;
int nRow;
float *Rdata;
};
typedef struct FBankInfo
{
int frameSize; /* speech frameSize */
int numChans; /* number of channels */
long sampPeriod; /* sample period */
int fftN; /* fft size */
int klo, khi; /* lopass to hipass cut-off fft indices */
int usePower; /* use power rather than magnitude *///boolen
int takeLogs; /* log filterbank channels *///boolen
float fres; /* scaled fft resolution */
float *cf; /* array[1..pOrder+1] of centre freqs */
float *loChan; /* array[1..fftN/2] of loChan index */
float *loWt; /* array[1..fftN/2] of loChan weighting */
float *x; /* array[1..fftN] of fftchans */
};
typedef struct IOConfig
{
float curVol;/* current volume dB (0.0-100.0) */
float preEmph;
int frSize;//一幀中的樣本數
int frIdx;//當前幀位置
int frRate;//幀移
float *fbank;
struct FBankInfo fbInfo;
float *s;//幀數據
};
/*讀取PCM文件到wavdata中*/
void LoadFile(char *s, struct Wave *w);
void GetWave(float *buf, struct Wave *w);
void ZeroMeanFrame(float *frame);
void PreEmphasise(float *frame, float k);
void Ham(float *frame);
float Mel(int k, float fres);
float WarpFreq(float fcl, float fcu, float freq, float minFreq, float maxFreq, float alpha);
struct FBankInfo InitFBank(struct IOConfig *cf);
void FFT(float *s, int invert);
void Realft(float *s);
void Wave2FBank(float *s, float *fbank, struct FBankInfo info);
void ConvertFrame(struct IOConfig *cf, struct Wave *w);
void linkdata(struct IOConfig *cf, struct Wave *w, int k);
void zeromean(struct Wave *w);
struct Wave filter_bank(char *s);
#endif
函數文件:filterbank.cpp
#include<stdio.h>
#include<math.h>
#include <stdlib.h>
#include <time.h>
#include "nNet.h"
#include "filterbank.h"
/*讀取PCM文件到wavdata中*/
void LoadFile(char *s, struct Wave *w)
{
FILE *fp;
fp = fopen(s, "rb");
if (!fp)
{
printf("can not open this file\n");
exit(0);
}
unsigned char ch1, ch2, ch3, ch4;
int i;
float *buf;
buf = w->wavdata;
for (i = 0; i < 40; i++)
{
ch1 = fgetc(fp);
}
ch1 = fgetc(fp); ch2 = fgetc(fp); ch3 = fgetc(fp); ch4 = fgetc(fp);
w->nSample = (ch2 * 16 * 16 + ch1) + (ch4 * 16 * 16 + ch3) * 16 * 16 * 16 * 16;
w->nSample /= 2;
printf("%ld", w->nSample);
w->nRow = (w->nSample - w->frSize) / w->frRate + 1;
for (i = 0; i<w->nSample; i++)
{
ch1 = fgetc(fp); //每次讀取兩個字元,存在數組ch中
ch2 = fgetc(fp);
//if (i % 8 == 0) //每行輸出16個字元對應的十六進位數
//printf("\n");
float temp = ch2 * 16 * 16 + ch1;
if (temp < 32768){
//printf("%d ", temp);
*buf++ = temp;
//printf("%lf ", (float)(ch2 * 16 * 16 + ch1) / 32767);
}
else{
//printf("%d ",temp - 65535 - 1);
*buf++ = temp - 65535 - 1;
//printf("%lf ", (float)((ch2 * 16 * 16 + ch1)-65535-1) / 32768);
}
}
fclose(fp);
}
/*從wavdata中提取當前幀*/
void GetWave(float *buf, struct Wave *w)
{
int k;
if (w->frIdx + w->frSize > w->nSample)
{
printf("GetWave: attempt to read past end of buffer\n");
for (k = 0; k < w->frSize; k++)
{
buf[k] = 0;
}
for (k = 0; w->frIdx + k < w->nSample; k++)
{
buf[k] = w->wavdata[w->frIdx + k];
}
}
for (k = 0; k < w->frSize; k++)
{
buf[k] = w->wavdata[w->frIdx + k];
}
w->frIdx += w->frRate;
}
void ZeroMeanFrame(float *frame)
{
int size, i;
float sum = 0.0, off;
size = frame[0];
for (i = 1; i <= size; i++) sum += frame[i];
off = sum / size;
for (i = 1; i <= size; i++) frame[i] -= off;
}
void PreEmphasise(float *frame, float k)
{
int i;
float preE = k;
int size = frame[0];
for (i = size; i >= 2; i--)
frame[i] -= frame[i - 1] * preE;
frame[1] *= 1.0 - preE;
}
void Ham(float *frame)
{
int frameSize = frame[0];
int i;
float a;
float *hamWin;
hamWin = (float*)malloc(sizeof(float)*frameSize);
a = TPI / (frameSize - 1);
int b = cos(a * 0);
for (i = 1; i <= frameSize; i++)
hamWin[i] = 0.54 - 0.46 * cos(a*(i - 1));
for (i = 1; i <= frameSize; i++)
{
frame[i] *= hamWin[i];
}
//free(hamWin);
}
float Mel(int k, float fres)
{
return 1127 * log(1 + (k - 1)*fres);
}
float WarpFreq(float fcl, float fcu, float freq, float minFreq, float maxFreq, float alpha)
{
if (alpha == 1.0)
return freq;
else {
float scale = 1.0 / alpha;
float cu = fcu * 2 / (1 + scale);
float cl = fcl * 2 / (1 + scale);
float au = (maxFreq - cu * scale) / (maxFreq - cu);
float al = (cl * scale - minFreq) / (cl - minFreq);
if (freq > cu)
return au * (freq - cu) + scale * cu;
else if (freq < cl)
return al * (freq - minFreq) + minFreq;
else
return scale * freq;
}
}
struct FBankInfo InitFBank(struct IOConfig *cf)
{
int numChans = 40; int usePower = 0; int takeLogs = 1; int sampPeriod = 625;//sampPeriod要考慮一下
float alpha = 1; int warpLowCut = 0; int warpUpCut = 0;
struct FBankInfo fb;
float mlo, mhi, ms, melk;
int k, chan, maxChan, Nby2;
int doubleFFT = 0;
/* Save sizes to cross-check subsequent usage */
fb.frameSize = cf->frSize;
fb.numChans = numChans;
fb.sampPeriod = sampPeriod;
fb.usePower = usePower;
fb.takeLogs = takeLogs;
/* Calculate required FFT size */
fb.fftN = 2;
while (fb.frameSize>fb.fftN)
fb.fftN *= 2;
if (doubleFFT)//不執行
fb.fftN *= 2;
Nby2 = fb.fftN / 2;
fb.fres = 1.0E7 / (sampPeriod * fb.fftN * 700.0);
maxChan = numChans + 1;
/* set lo and hi pass cut offs if any */
fb.klo = 2; fb.khi = Nby2; /* apply lo/hi pass filtering */
mlo = 0; mhi = Mel(Nby2 + 1, fb.fres);
/* Create vector of fbank centre frequencies */
fb.cf = (float*)malloc(sizeof(float)*maxChan + 1);
fb.cf[0] = maxChan;
ms = mhi - mlo;
for (chan = 1; chan <= maxChan; chan++) {
if (alpha == 1.0) {
fb.cf[chan] = ((float)chan / (float)maxChan)*ms + mlo;
}
else {
/* scale assuming scaling starts at lopass */
float minFreq = 700.0 * (exp(mlo / 1127.0) - 1.0);
float maxFreq = 700.0 * (exp(mhi / 1127.0) - 1.0);
float cf = ((float)chan / (float)maxChan) * ms + mlo;
cf = 700 * (exp(cf / 1127.0) - 1.0);
fb.cf[chan] = 1127.0 * log(1.0 + WarpFreq(warpLowCut, warpUpCut, cf, minFreq, maxFreq, alpha) / 700.0);
}
}
/* Create loChan map, loChan[fftindex] . lower channel index */
fb.loChan = (float*)malloc(sizeof(float)*Nby2 + 1);
fb.loChan[0] = Nby2;
for (k = 1, chan = 1; k <= Nby2; k++){
melk = Mel(k, fb.fres);
if (k<fb.klo || k>fb.khi) fb.loChan[k] = -1;
else {
while (fb.cf[chan] < melk && chan <= maxChan) ++chan;
fb.loChan[k] = chan - 1;
}
}
/* Create vector of lower channel weights */
fb.loWt = (float*)malloc(sizeof(float)*Nby2 + 1);
fb.loWt[0] = Nby2;
for (k = 1; k <= Nby2; k++) {
chan = fb.loChan[k];
if (k<fb.klo || k>fb.khi) fb.loWt[k] = 0.0;
else {
if (chan>0)
fb.loWt[k] = ((fb.cf[chan + 1] - Mel(k, fb.fres)) /
(fb.cf[chan + 1] - fb.cf[chan]));
else
fb.loWt[k] = (fb.cf[1] - Mel(k, fb.fres)) / (fb.cf[1] - mlo);
}
}
/* Create workspace for fft */
fb.x = (float*)malloc(sizeof(float)*fb.fftN + 1);
fb.x[0] = fb.fftN;
return fb;
}
void FFT(float *s, int invert)
{
int ii, jj, n, nn, limit, m, j, inc, i;
double wx, wr, wpr, wpi, wi, theta;
double xre, xri, x;
n = s[0];
nn = n / 2; j = 1;
for (ii = 1; ii <= nn; ii++) {
i = 2 * ii - 1;
if (j>i) {
xre = s[j]; xri = s[j + 1];
s[j] = s[i]; s[j + 1] = s[i + 1];
s[i] = xre; s[i + 1] = xri;
}
m = n / 2;
while (m >= 2 && j > m) {
j -= m; m /= 2;
}
j += m;
};
limit = 2;
while (limit < n) {
inc = 2 * limit; theta = TPI / limit;
if (invert) theta = -theta;
x = sin(0.5 * theta);
wpr = -2.0 * x * x; wpi = sin(theta);
wr = 1.0; wi = 0.0;
for (ii = 1; ii <= limit / 2; ii++) {
m = 2 * ii - 1;
for (jj = 0; jj <= (n - m) / inc; jj++) {
i = m + jj * inc;
j = i + limit;
xre = wr * s[j] - wi * s[j + 1];
xri = wr * s[j + 1] + wi * s[j];
s[j] = s[i] - xre; s[j + 1] = s[i + 1] - xri;
s[i] = s[i] + xre; s[i + 1] = s[i + 1] + xri;
}
wx = wr;
wr = wr * wpr - wi * wpi + wr;
wi = wi * wpr + wx * wpi + wi;
}
limit = inc;
}
if (invert)
for (i = 1; i <= n; i++)
s[i] = s[i] / nn;
}
void Realft(float *s)
{
int n, n2, i, i1, i2, i3, i4;
double xr1, xi1, xr2, xi2, wrs, wis;
double yr, yi, yr2, yi2, yr0, theta, x;
n = s[0] / 2; n2 = n / 2;
theta = PI / n;
FFT(s, 0);
x = sin(0.5 * theta);
yr2 = -2.0 * x * x;
yi2 = sin(theta); yr = 1.0 + yr2; yi = yi2;
for (i = 2; i <= n2; i++) {
i1 = i + i - 1; i2 = i1 + 1;
i3 = n + n + 3 - i2; i4 = i3 + 1;
wrs = yr; wis = yi;
xr1 = (s[i1] + s[i3]) / 2.0; xi1 = (s[i2] - s[i4]) / 2.0;
xr2 = (s[i2] + s[i4]) / 2.0; xi2 = (s[i3] - s[i1]) / 2.0;
s[i1] = xr1 + wrs * xr2 - wis * xi2;
s[i2] = xi1 + wrs * xi2 + wis * xr2;
s[i3] = xr1 - wrs * xr2 + wis * xi2;
s[i4] = -xi1 + wrs * xi2 + wis * xr2;
yr0 = yr;
yr = yr * yr2 - yi * yi2 + yr;
yi = yi * yr2 + yr0 * yi2 + yi;
}
xr1 = s[1];
s[1] = xr1 + s[2];
s[2] = 0.0;
}
void Wave2FBank(float *s, float *fbank, struct FBankInfo info)
{
const float melfloor = 1.0;
int k, bin;
float t1, t2; /* real and imag parts */
float ek; /* energy of k'th fft channel */
float te = 0.0;
for (k = 1; k <= info.frameSize; k++)
te += (s[k] * s[k]);
/* Apply FFT */
for (k = 1; k <= info.frameSize; k++)
info.x[k] = s[k]; /* copy to workspace */
for (k = info.frameSize + 1; k <= info.fftN; k++)
info.x[k] = 0.0; /* pad with zeroes */
Realft(info.x); /* take fft */
/* Fill filterbank channels */
int i = 0;
for (i = 1; i <= fbank[0]; i++)
fbank[i] = 0.0;
for (k = info.klo; k <= info.khi; k++) { /* fill bins */
t1 = info.x[2 * k - 1]; t2 = info.x[2 * k];
if (info.usePower)
ek = t1*t1 + t2*t2;
else
ek = sqrt(t1*t1 + t2*t2);
bin = info.loChan[k];
t1 = info.loWt[k] * ek;
if (bin > 0) fbank[bin] += t1;
if (bin < info.numChans) fbank[bin + 1] += ek - t1;
//printf("k:%d bin:%d info.loWt:%f fbank[bin]:%f\n", k, bin, info.loWt[k], fbank[bin]);
}
/* Take logs */
if (info.takeLogs)
for (bin = 1; bin <= info.numChans; bin++) {
t1 = fbank[bin];
if (t1 < melfloor) t1 = melfloor;
fbank[bin] = log(t1);
}
}
void ConvertFrame(struct IOConfig *cf, struct Wave *w)
{
float re, rawte = 0.0, te, *p, cepScale = 1.0;
int i, bsize = 0;
char buf[50];
int rawE;//boolen
cf->frIdx = w->frIdx; cf->frSize = w->frSize; cf->frRate = w->frRate;
cf->preEmph = 0.97;
ZeroMeanFrame(cf->s);//零均值處理
PreEmphasise(cf->s, cf->preEmph);//預加重處理
Ham(cf->s);//加窗 此處可以優化,創建ham窗多次十分耗費時間,可以用全局申請ham的記憶體
cf->fbInfo = InitFBank(cf);
cf->fbank = (float*)malloc(sizeof(float)* NUMCHANS);
cf->fbank[0] = NUMCHANS;
Wave2FBank(cf->s, cf->fbank, cf->fbInfo);//提取NUMCHANS為40的fbank
}
void linkdata(struct IOConfig *cf, struct Wave *w, int k)
{
for (int i = 0; i < NUMCHANS; i++)
{
*(w->Rdata + i + (k*NUMCHANS)) = *(cf->fbank + 1 + i);
}
}
void zeromean(struct Wave *w)
{
int i, j;
float sum[NUMCHANS];
int n = w->nRow;
for (i = 0; i < NUMCHANS; i++)
{
sum[i] = 0.0;
for (j = 0; j < n; j++)
{
sum[i] += *(w->Rdata + j*NUMCHANS + i);
}
sum[i] = sum[i] / n;
}
for (i = 0; i < NUMCHANS*n; i++)
{
*(w->Rdata + i) -= sum[i%NUMCHANS];
}
}
struct Wave filter_bank(char *s)
{
/*初始化*/
/*採樣個數為16028*/
/*由HTK,採樣率為25ms,則幀長為400,預設幀移為160*/
struct Wave *w;
w = (struct Wave*)malloc(sizeof(struct Wave));
w->frIdx = 0; w->frRate = 160; w->frSize = 400;
w->nSample = 160000;
w->nRow = (w->nSample - w->frSize) / w->frRate + 1;
w->wavdata = (float*)malloc(sizeof(float) * w->nSample);
LoadFile(s, w);
w->Rdata = (float*)malloc(sizeof(float)*NUMCHANS*w->nRow);
struct IOConfig *cf;
cf = (struct IOConfig*)malloc(sizeof(struct IOConfig));
cf->s = (float*)malloc(sizeof(float) * w->frSize + 1);
/*讀PCM文件到wavdata中,可以直接讀取緩存數據,此處是為了方便PC測試*/
for (int k = 0; k < w->nRow + 1; k++)
{
/*分幀,計算幀能量*/
cf->s[0] = w->frSize;
GetWave(cf->s + 1, w);
int j, m, e, x;
for (j = 1, m = e = 0.0; j <= w->frSize; j++) {
x = (int)cf->s[j];
m += x; e += x*x;
}
m = m / w->frSize; e = e / w->frSize - m*m;
if (e>0.0) e = 10.0*log10(e / 0.32768);
else e = 0.0;
cf->curVol = e;
/*處理*/
ConvertFrame(cf, w);
linkdata(cf, w, k);
}
zeromean(w);
for (int i = 0; i < NUMCHANS*w->nRow; i++)
{
if (i%NUMCHANS == 0)
printf("\n%d:\n", i / NUMCHANS);
printf("%f ", *(w->Rdata + i));
}
/*free(cf->fbInfo.cf);
free(cf->fbInfo.loChan);
free(cf->fbInfo.loWt);
free(cf->fbInfo.x);
free(cf->fbank);
free(cf->s);
free(cf);*/
return *w;
}
測試文件:test.cpp
#include "nNet.h"
#include "filterbank.h"
#include<stdio.h>
#include<math.h>
#include <stdlib.h>
#include <time.h>
int main()
{
Wave w = filter_bank("F:\\newrecord.wav");
return 0;
}