/**
 * @author HarveyD
 * Dan Harvey - Professor of Computer Science
 * Southern Oregon University, 1250 Siskiyou Blvd., Ashland, OR 97520-5028
 * harveyd@sou.edu
 * @version 1.00
 *
 * Copyright 2010, all rights reserved
 *
 * This software is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * To receive a copy of the GNU Lesser General Public License, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

/** Note: Re-sample the data to 10 kHz to normalize between different recording rates.
 *
 * The first try used the algorithm in ResampleAudio. It worked fine for
 * many conversions (like 16000 Hz to 10000 Hz). However, it failed when
 * converting from 22050 Hz to 10000 Hz. We then utilized the built-in converters
 * provided by the Java Sound API. This seems to work better. The original code
 * follows:
 *
 *     double[] samples = timeDomain.getTimeDomainFromAudio(-1, -1);
 *     samples = ResampleAudio.apply(samples, (int)frameRate);
 *     audio.setFrameRate(frameRate);
 *
 * The above also works after some debugging; the corrected version is in the
 * Android mobile app.
 */
package cs415;

import java.awt.Point;
import java.io.Serializable;

import org.acorns.audio.NormalizeFrames;
import org.acorns.audio.Pitch;
import org.acorns.audio.SoundDefaults;
import org.acorns.audio.TimeDomain;
import org.acorns.audio.frequencydomain.Cepstrum;
import org.acorns.audio.frequencydomain.FastFourierTransform;
import org.acorns.audio.frequencydomain.HarmonicProductSpectrum;
import org.acorns.audio.frequencydomain.MFCC;
import org.acorns.audio.frequencydomain.MelFilterBank;
import org.acorns.audio.frequencydomain.RastaPLP;
import org.acorns.audio.timedomain.Butterworth;
import org.acorns.audio.timedomain.Filter;
import org.acorns.audio.timedomain.LinearPrediction;
import org.acorns.audio.timedomain.ResampleAudio;
import org.acorns.audio.timedomain.Yin;
import org.acorns.data.SoundData;

public class FeatureData implements Serializable
{
    /** Java serial file version */
    private static final long serialVersionUID = 1;

    private static String[] optionText =
    {
        "LPErr", "LPCSum", "Energy", "0Cross", "AutoCor", "Entropy",
        "Higuchi", "Katz", "Box", "YIN", "Harmonic", "Cepstral",
        "SpecFlux", "MelFlux",
    };

    private static String[] diffText =
    {
        "MFCC", "LPC", "LErr", "LSum", "Ener", "0C", "Acor", "Ent",
        "Hig", "Katz", "Box", "Yin", "HAR", "CEP",
        "L0", "L1", "M0", "M1", "M2", "M3",
    };

    // The number of LPC coefficients
    private static final int P = SoundDefaults.getLPCCoefficients();

    // The number of CEPSTRAL coefficients to model the vocal tract (normally: f + sample rate/1000)
    private static final int C = SoundDefaults.getCepstrumLength();

    // The number of Diff coefficients
    private static final int DIFF = diffText.length;

    // The temporal filtering algorithm
    private static final int TEMPORAL_FILTER = RastaPLP.NONE;

    // Number of iterations for convergence of variance and skew statistics of features
    private static final int NORMALIZE_LOOP = 3;
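    /* Illustrative sketch only (not part of the original class): the built-in
     * Java Sound conversion route described in the note at the top of this file,
     * resampling a stream to a new target rate such as 10 kHz. Fully qualified
     * names are used so no additional imports are required.
     */
    private static javax.sound.sampled.AudioInputStream resampleSketch(
            javax.sound.sampled.AudioInputStream source, float targetRate)
    {
        javax.sound.sampled.AudioFormat sourceFormat = source.getFormat();

        // Keep the encoding, sample size, and channel layout; change only the rates
        javax.sound.sampled.AudioFormat targetFormat = new javax.sound.sampled.AudioFormat(
                sourceFormat.getEncoding(), targetRate, sourceFormat.getSampleSizeInBits(),
                sourceFormat.getChannels(), sourceFormat.getFrameSize(), targetRate,
                sourceFormat.isBigEndian());

        // AudioSystem supplies the sample-rate converter
        return javax.sound.sampled.AudioSystem.getAudioInputStream(targetFormat, source);
    }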
    /*
     * The following parameters are based on Rabiner's end point algorithm.
     *
     * QUARTER_SEC_FRAMES, ZCROSS_COUNT, and STD_MULTIPLE are constants that Rabiner
     * seemingly picks arbitrarily. The literature varies in how to pick their values.
     */

    /** Number of high zero crossing frames to decide non-voiced sound */
    private static final int ZCROSS_COUNT = 4;

    /** Standard deviation multiples */
    private static final int STD_MULTIPLE = 2;

    /** The minimum energy above noise for a sound to be perceived as voiced */
    private static final int DELTA_ENERGY = 10;

    // Define the parameter to control the degree of dynamic feature linear regression curve fitting
    private final static int D = 1;   // Curve fitting loop goes from -D to +D

    /** Option to indicate if mean normalization should be done */
    public static int CMN = 1;
    /** Option to indicate if variance and skew normalization should be done */
    public static int CVN = 2;
    /** Option to convert LPC parameters to cepstrals */
    public static int CEP = 4;

    /** Bit to indicate if speech is present in a frame */
    public static int SPEECH = 1;
    /** Bit to indicate if a frame is voiced */
    public static int VOICED = 2;
    /** Bit to indicate if a frame is silence */
    public static int SILENCE = 4;
    /** Bit to indicate if a frame is a phoneme boundary */
    public static int PHONEME = 8;
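    /* Usage note (illustrative): the option constants above are bit flags that the
     * constructors test individually, so they combine with bitwise OR:
     *
     *     int options = CMN | CVN | CEP;   // all three normalization steps
     *
     * The frame type bits (SPEECH, VOICED, SILENCE, PHONEME) combine the same way.
     */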
    /** Starting offset to array of MFCC CEPSTRAL coefficients */
    public static final int CEPSTRAL_COEFFICIENTS = 0;

    /* Easier symbol access to specific CEPSTRAL_COEFFICIENTS */
    public static final int MFCC0  = CEPSTRAL_COEFFICIENTS;
    public static final int MFCC1  = CEPSTRAL_COEFFICIENTS + 1;
    public static final int MFCC2  = CEPSTRAL_COEFFICIENTS + 2;
    public static final int MFCC3  = CEPSTRAL_COEFFICIENTS + 3;
    public static final int MFCC4  = CEPSTRAL_COEFFICIENTS + 4;
    public static final int MFCC5  = CEPSTRAL_COEFFICIENTS + 5;
    public static final int MFCC6  = CEPSTRAL_COEFFICIENTS + 6;
    public static final int MFCC7  = CEPSTRAL_COEFFICIENTS + 7;
    public static final int MFCC8  = CEPSTRAL_COEFFICIENTS + 8;
    public static final int MFCC9  = CEPSTRAL_COEFFICIENTS + 9;
    public static final int MFCC10 = CEPSTRAL_COEFFICIENTS + 10;
    public static final int MFCC11 = CEPSTRAL_COEFFICIENTS + 11;
    public static final int MFCC12 = CEPSTRAL_COEFFICIENTS + 12;

    /** Starting offset to array of Linear Prediction coefficients */
    public static final int LPC_COEFFICIENTS = SoundDefaults.getCepstrumLength();

    /* Easier symbol access to specific LPC_COEFFICIENTS */
    public static final int LPC0 = LPC_COEFFICIENTS;
    public static final int LPC1 = LPC_COEFFICIENTS + 1;
    public static final int LPC2 = LPC_COEFFICIENTS + 2;
    public static final int LPC3 = LPC_COEFFICIENTS + 3;
    public static final int LPC4 = LPC_COEFFICIENTS + 4;
    public static final int LPC5 = LPC_COEFFICIENTS + 5;
    public static final int LPC6 = LPC_COEFFICIENTS + 6;
    public static final int LPC7 = LPC_COEFFICIENTS + 7;

    public static final int DIFF_COEFFICIENTS = LPC_COEFFICIENTS + P;

    /* Easier symbol access to specific DIFF_COEFFICIENTS */
    public static final int FLUX_CEPSTRAL_COEFFICIENTS        = DIFF_COEFFICIENTS;
    public static final int FLUX_LPC_COEFFICIENTS             = DIFF_COEFFICIENTS + 1;
    public static final int DIFF_LPC_ERROR                    = DIFF_COEFFICIENTS + 2;
    public static final int DIFF_LPC_SUM                      = DIFF_COEFFICIENTS + 3;
    public static final int DIFF_ENERGY                       = DIFF_COEFFICIENTS + 4;
    public static final int DIFF_ZERO_CROSS                   = DIFF_COEFFICIENTS + 5;
    public static final int DIFF_AUTOCORRELATION_COEFFICIENT  = DIFF_COEFFICIENTS + 6;
    public static final int DIFF_ENTROPY                      = DIFF_COEFFICIENTS + 7;
    public static final int DIFF_HIGUCHI_FRACTAL_DIMENSION    = DIFF_COEFFICIENTS + 8;
    public static final int DIFF_KATZ_FRACTAL_DIMENSION       = DIFF_COEFFICIENTS + 9;
    public static final int DIFF_BOX_FRACTAL_DIMENSION        = DIFF_COEFFICIENTS + 10;
    public static final int DIFF_YIN_PITCH                    = DIFF_COEFFICIENTS + 11;
    public static final int DIFF_HARMONIC_PITCH               = DIFF_COEFFICIENTS + 12;
    public static final int DIFF_CEPSTRAL_PITCH               = DIFF_COEFFICIENTS + 13;
    public static final int DIFF_LPC0                         = DIFF_COEFFICIENTS + 14;
    public static final int DIFF_LPC1                         = DIFF_COEFFICIENTS + 15;
    public static final int DIFF_MFCC0                        = DIFF_COEFFICIENTS + 16;
    public static final int DIFF_MFCC1                        = DIFF_COEFFICIENTS + 17;
    public static final int DIFF_MFCC2                        = DIFF_COEFFICIENTS + 18;
    public static final int DIFF_MFCC3                        = DIFF_COEFFICIENTS + 19;

    /** Offset to the linear prediction error */
    public static final int LPC_ERROR = DIFF_COEFFICIENTS + DIFF;
    /** Offset to the sum of the LPC coefficients */
    public static final int LPC_SUM = LPC_ERROR + 1;
    /** Offset to the frame energy feature */
    public static final int ENERGY = LPC_SUM + 1;
    /** Starting offset to zero crossing feature */
    public static final int ZERO_CROSS = ENERGY + 1;
    /** Starting offset to autocorrelation feature with delta = 1 */
    public static final int AUTOCORRELATION_COEFFICIENT = ZERO_CROSS + 1;
    /** Starting offset to entropy feature */
    public static final int ENTROPY = AUTOCORRELATION_COEFFICIENT + 1;
    /** Starting offset to fractal dimension feature using the HIGUCHI algorithm */
    public static final int HIGUCHI_FRACTAL_DIMENSION = ENTROPY + 1;
    /** Starting offset to fractal dimension feature using the KATZ algorithm */
    public static final int KATZ_FRACTAL_DIMENSION = HIGUCHI_FRACTAL_DIMENSION + 1;
    /** Starting offset to fractal dimension feature using the BOX counting algorithm */
    public static final int BOX_FRACTAL_DIMENSION = KATZ_FRACTAL_DIMENSION + 1;
    /** Starting offset to pitch estimate using the YIN algorithm */
    public static final int YIN_PITCH = BOX_FRACTAL_DIMENSION + 1;
    /** Starting offset to pitch estimate using the HARMONIC product spectrum */
    public static final int HARMONIC_PITCH = YIN_PITCH + 1;
    /** Starting offset to pitch estimate using CEPSTRALS */
    public static final int CEPSTRAL_PITCH = HARMONIC_PITCH + 1;
    /** Starting offset to spectral flux */
    public static final int SPECTRAL_FLUX = CEPSTRAL_PITCH + 1;
    /** Starting offset to Mel filter flux */
    public static final int MEL_FLUX = SPECTRAL_FLUX + 1;

    /** Number of audio features */
    public static final int FEATURES_LENGTH = MEL_FLUX + 1;
    /** Number of audio features including delta and delta-delta values */
    public static final int FEATURE_ARRAY_LENGTH = FEATURES_LENGTH * 3;

    /** Features where statistics are needed */
    public static final int[] SPEECH_AVERAGES_FEATURES =
    {
        ENERGY, ENERGY + FEATURES_LENGTH, ENERGY + 2*FEATURES_LENGTH,
        ZERO_CROSS, ZERO_CROSS + FEATURES_LENGTH, ZERO_CROSS + 2*FEATURES_LENGTH,
        LPC_COEFFICIENTS, LPC_COEFFICIENTS + FEATURES_LENGTH, LPC_COEFFICIENTS + 2*FEATURES_LENGTH,
        LPC_SUM, LPC_SUM + FEATURES_LENGTH, LPC_SUM + 2*FEATURES_LENGTH,
        LPC_ERROR, LPC_ERROR + FEATURES_LENGTH, LPC_ERROR + 2*FEATURES_LENGTH,
        AUTOCORRELATION_COEFFICIENT, AUTOCORRELATION_COEFFICIENT + FEATURES_LENGTH,
        AUTOCORRELATION_COEFFICIENT + 2*FEATURES_LENGTH,
        ENTROPY, ENTROPY + FEATURES_LENGTH, ENTROPY + 2*FEATURES_LENGTH,
        HIGUCHI_FRACTAL_DIMENSION, HIGUCHI_FRACTAL_DIMENSION + FEATURES_LENGTH,
        HIGUCHI_FRACTAL_DIMENSION + 2*FEATURES_LENGTH,
        KATZ_FRACTAL_DIMENSION, KATZ_FRACTAL_DIMENSION + FEATURES_LENGTH,
        KATZ_FRACTAL_DIMENSION + 2*FEATURES_LENGTH,
        BOX_FRACTAL_DIMENSION, BOX_FRACTAL_DIMENSION + FEATURES_LENGTH,
        BOX_FRACTAL_DIMENSION + 2*FEATURES_LENGTH,
        YIN_PITCH, YIN_PITCH + FEATURES_LENGTH, YIN_PITCH + 2*FEATURES_LENGTH,
        HARMONIC_PITCH, HARMONIC_PITCH + FEATURES_LENGTH, HARMONIC_PITCH + 2*FEATURES_LENGTH,
        CEPSTRAL_PITCH, CEPSTRAL_PITCH + FEATURES_LENGTH, CEPSTRAL_PITCH + 2*FEATURES_LENGTH,
        SPECTRAL_FLUX, SPECTRAL_FLUX + FEATURES_LENGTH, SPECTRAL_FLUX + 2*FEATURES_LENGTH,
        MEL_FLUX, MEL_FLUX + FEATURES_LENGTH, MEL_FLUX + 2*FEATURES_LENGTH,
    };
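    /* Illustrative helper (an assumption; this method is not in the original
     * class): each of the FEATURES_LENGTH static features stores its delta at
     * offset +FEATURES_LENGTH and its delta-delta at offset +2*FEATURES_LENGTH,
     * which is why FEATURE_ARRAY_LENGTH = FEATURES_LENGTH * 3 and why the table
     * above lists each feature three times.
     */
    private static double[] featureTriple(double[] frameFeatures, int feature)
    {
        double value      = frameFeatures[feature];                        // static value
        double delta      = frameFeatures[feature + FEATURES_LENGTH];      // first difference
        double deltaDelta = frameFeatures[feature + 2*FEATURES_LENGTH];    // second difference
        return new double[] { value, delta, deltaDelta };
    }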
    /**
     * This section defines combinations of features that could be
     * useful for detecting phoneme boundaries. To add additional features
     * or combinations of features, adjust the following table and update
     * the diffText table at the top of this listing with an appropriate header.
     */
    int[][] distanceFeatures =
    {
        {   // Non-energy components of the cepstrals
            CEPSTRAL_COEFFICIENTS+1, CEPSTRAL_COEFFICIENTS+2, CEPSTRAL_COEFFICIENTS+3,
            CEPSTRAL_COEFFICIENTS+4, CEPSTRAL_COEFFICIENTS+5, CEPSTRAL_COEFFICIENTS+6,
            CEPSTRAL_COEFFICIENTS+7, CEPSTRAL_COEFFICIENTS+8, CEPSTRAL_COEFFICIENTS+9,
            CEPSTRAL_COEFFICIENTS+10, CEPSTRAL_COEFFICIENTS+11, CEPSTRAL_COEFFICIENTS+12,
        },
        {   // Non-energy components of the linear prediction
            LPC_COEFFICIENTS + 1, LPC_COEFFICIENTS + 2, LPC_COEFFICIENTS + 3,
            LPC_COEFFICIENTS + 4, LPC_COEFFICIENTS + 5, LPC_COEFFICIENTS + 6,
            LPC_COEFFICIENTS + 7,
        },
        { LPC_ERROR, },
        { LPC_SUM, },
        { ENERGY, },
        { ZERO_CROSS, },
        { AUTOCORRELATION_COEFFICIENT, },
        { ENTROPY, },
        { HIGUCHI_FRACTAL_DIMENSION, },
        { KATZ_FRACTAL_DIMENSION, },
        { BOX_FRACTAL_DIMENSION, },
        { YIN_PITCH, },
        { HARMONIC_PITCH, },
        { CEPSTRAL_PITCH, },
        { LPC_COEFFICIENTS, },
        { LPC_COEFFICIENTS + 1, },
        { CEPSTRAL_COEFFICIENTS, },
        { CEPSTRAL_COEFFICIENTS + 1, },
        { CEPSTRAL_COEFFICIENTS + 2, },
        { CEPSTRAL_COEFFICIENTS + 3, },
    };

    /** Offsets into the statistics array */
    private static int ENERGY_STATS = 0;
    private static int ZERO_CROSS_STATS = 3;

    /** Statistics array row for holding the mean */
    private int MEAN = 0;
    /** Statistics array row for holding the standard deviation */
    private int STD = 1;
    /** Statistics array row for holding the variance */
    private int VARIANCE = 2;
    /** Statistics array row for holding the kurtosis skew */
    private int SKEW = 3;
    /** Statistics array row for holding the kurtosis */
    private int KIRTOSIS = 4;

    /** toString() descriptions */
    private static String[] toStringText;

    private double[][] features;   // Array to hold computed features for each frame
    private int[] frameType;       // Array to categorize frame types

    /** Spectrum and Mel coefficients from previous frame */
    private double[] prevSpectrum;
    private double[] prevMelSpectrum;

    private float frameRate;         // Frame rate
    private int wStep;               // Window step size
    private int wSize;               // Window size
    private int FFT_Size;            // FFT bins
    private int harmonic_FFT_Size;   // FFT size for computing pitch

    /** Constructor to create a feature list of the audio signal with mean normalization
     *
     * @param audio Object containing the audio signal
     * @param options
     *    CMN for mean normalization,
     *    CVN for variance and skew normalization,
     *    CEP for converting LPC parameters to CEPSTRALS
     * @param frameRate the desired frame rate
     */
    public FeatureData(SoundData audio, float frameRate, int options)
    {
        this(audio, frameRate, (options & CMN) != 0, (options & CVN) != 0, (options & CEP) != 0);
    }

    /** Constructor to create a feature list of the audio signal with mean normalization
     *
     * @param audio Object containing the audio signal
     * @param options
     *    CMN for mean normalization,
     *    CVN for variance and skew normalization,
     *    CEP for converting LPC parameters to CEPSTRALS
     */
    public FeatureData(SoundData audio, int options)
    {
        this(audio, SoundDefaults.getFrameRate(),
             (options & CMN) != 0, (options & CVN) != 0, (options & CEP) != 0);
    }
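    /* Illustrative usage (an assumption: "recording" is a hypothetical SoundData
     * instance obtained elsewhere in the application):
     *
     *     SoundData recording = ...;
     *     FeatureData data = new FeatureData(recording, CMN | CEP);
     *
     * This computes features at SoundDefaults.getFrameRate() with mean
     * normalization and LPC-to-cepstral conversion enabled; pass a float frame
     * rate as the second argument to override the default.
     */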
    /** Constructor to create a feature list of the audio signal
     *
     * @param audio Object containing the audio signal
     * @param frameRate The desired frame rate to use
     * @param norm true to perform mean normalization
     * @param cvn true to perform variance and skew normalization, in addition to CMN
     * @param cep true to convert LPC parameters to CEPSTRALS
     */
    public FeatureData(SoundData audio, float frameRate, boolean norm, boolean cvn, boolean cep)
    {
        // Initialize text for the toString output
        toStringText = new String[FEATURE_ARRAY_LENGTH];
        int index;
        for (int i=0; i /* … */

        /* … */

        if (/* … */ > samples.length) frames--;
        features = new double[frames][FEATURE_ARRAY_LENGTH];
        double prior = 0, nextPrior, energy;
        if (frames==0) return features;

        for (int frame = 0; frame < frames; frame++)
        {
            startFrame = 2*frame*wStep;
            endFrame = startFrame + wSize*2;
            if (endFrame > complex.length) { endFrame = complex.length; }

            /* Clear data left over from the previous frame */
            if (endFrame - startFrame != wSize*2)
            {
                for (int s=0; s<2*wSize; s+=2)
                {
                    filtered[s/2] = pitchWindow[s/2] = fftWindow[s] = fftWindow[s+1] = 0;
                }
            }
            for (int s=wSize; s<2*FFT_Size; s++) fftWindow[s] = 0;

            /* Load windows for processing the next frame */
            System.arraycopy(complex, startFrame, fftWindow, 0, endFrame - startFrame);
            System.arraycopy(samples, startFrame/2, filtered, 0, (endFrame - startFrame)/2);
            System.arraycopy(pitchSamples, startFrame/2, pitchWindow, 0, (endFrame - startFrame)/2);
            for (int i=0; i /* … */

        /* … */

        boolean foundStartSpeech = false;
        for (int frame=0; frame<features.length && !foundStartSpeech; frame++)
        {
            energy = features[frame][ENERGY];
            if (energy < itr) frameType[frame] &= ~SPEECH;
            else
            {
                int forwardFrame = startSpeech = frame;
                while (++forwardFrame < features.length)
                {
                    energy = features[forwardFrame][ENERGY];
                    if (energy < itr) break;
                    else
                    {
                        if (energy < itu) continue;

                        int count = 0;
                        for (int tempFrame = forwardFrame; tempFrame > Math.max(forwardFrame - sampleFrames, 0); tempFrame--)
                        {
                            if (features[tempFrame][ZERO_CROSS] > izct)
                            {
                                if (++count >= ZCROSS_COUNT) startSpeech = tempFrame;
                            }
                        }
                        foundStartSpeech = true;
                        break;
                    }   // end back search for 0 crossings > izct
                }   // end while search forward for frame > itu
            }   // end else
        }   // end for search for start of speech

        boolean foundEndSpeech = false;
        for (int frame=features.length-1; frame > startSpeech && !foundEndSpeech; frame--)
        {
            energy = features[frame][ENERGY];
            if (energy < itr) frameType[frame] &= ~SPEECH;
            else
            {
                int backwardsFrame = endSpeech = frame;
                while (--backwardsFrame > startSpeech)
                {
                    energy = features[backwardsFrame][ENERGY];
                    if (energy < itr) break;
                    else
                    {
                        if (energy < itu) continue;

                        int count = 0;
                        int endFrame = Math.min(backwardsFrame + sampleFrames, features.length - 1);
                        for (int tempFrame = backwardsFrame; tempFrame < endFrame; tempFrame++)
                        {
                            if (features[tempFrame][ZERO_CROSS] > izct)
                            {
                                if (++count >= ZCROSS_COUNT) endSpeech = tempFrame;
                            }
                        }
                        foundEndSpeech = true;
                        break;
                    }   // end forward search for 0 crossings > izct
                }   // end while search backwards for frame > itu
            }   // end else
        }   // end for search for end of speech

        // Mark all the remaining frames as speech and further classify their types
        if (startSpeech < 0) startSpeech = 0;
        for (int frame=startSpeech; frame<=endSpeech; frame++)
        {
            frameType[frame] |= SPEECH;

            // Designate voiced frames
            energy = features[frame][ENERGY];
            if (energy > itu) frameType[frame] |= VOICED;
        }   // end for

        return new Point(startSpeech, endSpeech);
    }   // end of updateFrameTypes
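    /* Sketch of the classic Rabiner/Sambur threshold setup that the end point
     * search above depends on (itr, itu, izct). The class's own computation of
     * these values did not survive in this listing; the formulas below follow the
     * published algorithm, treating the first few frames as known silence. The
     * method and all names in it are illustrative assumptions.
     */
    private static double[] endpointThresholds(double[] energy, double[] zeroCross, int silentFrames)
    {
        double peak = 0, silence = 0, zcMean = 0, zcVar = 0;
        for (double e : energy) peak = Math.max(peak, e);
        for (int f = 0; f < silentFrames; f++) { silence += energy[f]; zcMean += zeroCross[f]; }
        silence /= silentFrames;
        zcMean  /= silentFrames;
        for (int f = 0; f < silentFrames; f++) zcVar += Math.pow(zeroCross[f] - zcMean, 2);
        double zcStd = Math.sqrt(zcVar / silentFrames);

        double i1   = 0.03 * (peak - silence) + silence;   // 3% of the range above silence
        double i2   = 4 * silence;                         // four times the silence level
        double itr  = Math.min(i1, i2);                    // lower energy threshold (ITL)
        double itu  = 5 * itr;                             // upper energy threshold (ITU)
        double izct = Math.min(25, zcMean + STD_MULTIPLE * zcStd);   // zero crossing threshold (IZCT)
        return new double[] { itr, itu, izct };
    }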
    /** Get the statistical averages for all frames of the audio
     *
     * @param averages The statistical averages in deviation units of the speech features
     */
    public double[][] getStatisticsSTD(double[][] averages)
    {
        Point bounds = new Point(0, features.length - 1);
        return getStatisticsSTD(bounds, features, averages);
    }

    public double[][] getStatisticsSTD(Point bounds, double[][] features, double[][] averages)
    {
        if (bounds==null) bounds = new Point(0, features.length - 1);

        int startSpeech = bounds.x;
        int endSpeech = bounds.y;

        double[][] results = new double[endSpeech - startSpeech + 1][averages[0].length + 2];
        double value, mean, std, units;
        int feature;

        for (int frame=startSpeech; frame<=endSpeech; frame++)
        {
            results[frame-startSpeech][0] = frame*wStep;
            for (int stat=0; stat /* … */

        /* … */ sampleFrames) ? sampleFrames : frame;
        while (index > 0)
        {
            featureValue = features[selectFrames[index-1]][feature];
            if (featureValue > frameValue && maximum) break;
            if (featureValue < frameValue && !maximum) break;
            if (index /* … */

        /* … */

        if (/* … */ >= features.length) return totals;

        for (int feature=0; feature /* … */

                if (N > 2) totals[SKEW][feature] += Math.pow(delta, 3);
                if (N > 3) totals[KIRTOSIS][feature] += Math.pow(delta, 4);
            }
        }

        // Complete the calculations (using the Excel SKEW and KURT formulas)
        double stdev, factor;
        for (int feature=0; feature /* … */

            if (N > 2 && stdev != 0)
            {
                factor = 1.0 * N / ((N-1)*(N-2));
                totals[SKEW][feature] = factor * totals[SKEW][feature] / Math.pow(stdev, 3);
            }
            if (N > 3 && stdev != 0)
            {
                factor = 1.0 * N * (N+1) / ((N-1)*(N-2)*(N-3));
                totals[KIRTOSIS][feature] = factor * totals[KIRTOSIS][feature] / Math.pow(stdev, 4);
                factor = 3.0 * (N-1)*(N-1) / ((N-2)*(N-3));
                totals[KIRTOSIS][feature] -= factor;
            }
        }
        return totals;
    }

    /** Create the title for toString output
     *
     * @param offsets array of offsets of the desired features
     * @param detail true if this is for detailed data rather than summary totals
     */
    public static String title(int[] offsets, boolean detail)
    {
        StringBuilder build = new StringBuilder();
        String spaces = " ", text;
        if (detail) build.append("#### ");
        else        build.append("     ");
        for (int feature = 0; feature /* … */

        /* … */ if (/* … */ > save)
        {
            median = features[frame+1][feature];
            save = features[frame+2][feature];
        }

        // Eliminate the high and low of the bottom two and top two
        if (features[frame-2][feature] /* … */ median) median = features[frame-2][feature];
        if (features[frame-1][feature] /* … */ median) median = features[frame-1][feature];
        if (features[frame-2][feature] /* … */

        /* … */

            int end = (frame >= features.length - D) ? features.length - frame - 1 : +D;
            for (int d = start; d <= end; d++)
            {
                numerator += d * features[frame + d][feature + featureOffset];
                denominator += d * d;
            }
            if (denominator != 0)
                features[frame][feature + featureOffset + FEATURES_LENGTH] = numerator / denominator;
            }
        }
    }

    /** Compute distance features
     *
     * @param distanceFeature The feature in question (LPC_DISTANCE, CEPSTRAL_DISTANCE, MISC_DISTANCE)
     * @param distanceFeatures Array of feature indices for which to apply distances
     */
    private void computeDistances(int distanceFeature, int[] distanceFeatures)
    {
        double total, previous, diff, current;
        int feature;
        for (int frame=0; frame /* … */

    /* … */

    /** Determine if a particular frame is voiced
     *
     * @param frame Frame number
     * @return true if yes
     */
    public boolean isVoiced(int frame)
    {
        if (frameType==null || frame<0 || frame>=frameType.length) return false;
        return (frameType[frame] & VOICED) != 0;
    }

    /** Determine if a particular frame contains speech
     *
     * @param frame Frame number
     * @param state true if the previous frame contains speech
     * @return true if yes
     */
    public boolean isSpeech(int frame, boolean state)
    {
        if (frameType==null || frame<0 || frame>=frameType.length) return false;
        return (frameType[frame] & SPEECH) != 0;
    }

    /** Get the number of frames in this audio signal */
    public int getSize()
    {
        if (features==null) return 0;
        return features.length;
    }
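    /* Illustrative usage (an assumption: "data" is a FeatureData instance): print
     * the energy and YIN pitch tracks for the entire signal, with a matching header:
     *
     *     int[] offsets = { FeatureData.ENERGY, FeatureData.YIN_PITCH };
     *     System.out.println(FeatureData.title(offsets, true));
     *     System.out.println(data.toString(new Point(-1, -1), offsets));
     */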
    /** Create string representation of audio signal between starting and ending samples
     *
     * @param bounds The starting and ending offsets; (-1,-1) means the entire signal
     * @param offsets The features of interest
     * @return String representation of the frames in question
     */
    public String toString(Point bounds, int[] offsets)
    {
        if (bounds == null) bounds = new Point(-1,-1);

        int start = (bounds.x<0) ? 0 : bounds.x / wStep;
        int end = (bounds.y + wStep - 1) / wStep;
        if (end > features.length) end = features.length;
        if (end /* … */

            if (/* … */ >= features.length) continue;
            build.append(String.format("%4d", frame));
            for (int feature=0; feature /* … */