# Load the data
import numpy as np
import pylab as pl
%pylab inline
# Change default figure sizes
pylab.rcParams['figure.figsize'] = 13, 9
# Import the examples of clean and noisy data
cleandf = np.loadtxt('clean.txt', dtype='str', delimiter=' ')
cleanTime = np.array(cleandf[:,1], dtype='float')
cleany = np.array(cleandf[:,2], dtype='float')
noisedf = np.loadtxt('noise.txt', dtype='str', delimiter=' ')
noiseTime = np.array(noisedf[:,1], dtype='float')
noisy = np.array(noisedf[:,2], dtype='float')
# Trim both series to a common length (the first 1750 samples,
# the length of the first clean curve)
index = np.arange(1750)
cleany = cleany[index]
noisy = noisy[index]
# NOTE: 88_1705 really clean day
pl.figure()
pl.title('Clean Chromatogram', fontsize = 30)
pl.plot(index, cleany,'r', label='clean curve', linewidth=1)
pl.ylabel('Hz [1/s]', fontsize = 20)
pl.xlabel('seconds', fontsize = 20)
pl.legend(fontsize = 20)
pl.savefig('cleanChromatogram.pdf')
pl.show()
pl.figure()
pl.title('Noisy Chromatogram', fontsize = 30)
pl.plot(index, noisy,'b', label='noisy curve', linewidth=1)
pl.ylabel('Hz [1/s]', fontsize = 20)
pl.xlabel('seconds', fontsize = 20)
pl.legend(fontsize = 20)
pl.savefig('noisyChromatogram.pdf')
pl.show()
The units of the y axis are not important for this analysis; what I want to find is the frequency of the noise that appears in the blue curve but not in the red one. The seconds, measured from the start of one measurement cycle, are also not important. What is important is that the major peaks appear at the same time in the noisy and clean curves. Hopefully this will allow the signal of the noise to stand out in spectral space.
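Before trusting this on the real data, here is a quick synthetic sanity check (my own addition, not the PAN data): inject a known 0.01 Hz sine into white noise and confirm that it stands out as a spectral peak, the same way I hope the instrument noise will.
# Synthetic sanity check: a 0.01 Hz sine buried in white noise
from scipy import signal
t = np.arange(1750)  # 1 Hz sampling, same length as the chromatograms
synthetic = np.sin(2 * np.pi * 0.01 * t) + np.random.randn(len(t))
fSyn, pSyn = signal.periodogram(synthetic, fs=1.)
pl.figure()
pl.plot(fSyn, pSyn)
pl.xlim(0, 0.05)
pl.xlabel('Hz [1/s]', fontsize=20)
pl.ylabel('power', fontsize=20)
pl.title('Synthetic check: 0.01 Hz sine in white noise', fontsize=20)
pl.show()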
import scipy
from scipy import signal
N = len(index)
# Note: periodogram already returns power; squaring sharpens the peaks
# before normalizing but does not move them
fNoisy, powerNoisy = scipy.signal.periodogram(noisy, fs=1., window='hann', nfft=N // 2)
powerNoisy = powerNoisy**2
powerNoisy = powerNoisy / sum(powerNoisy)
fClean, powerClean = scipy.signal.periodogram(cleany, fs=1., window='hann', nfft=N // 2)
powerClean = powerClean**2
powerClean = powerClean / sum(powerClean)
pl.figure()
pl.plot(fNoisy,powerNoisy,'b', label='Noisy')
pl.plot(fClean,powerClean,'r', label='Clean')
pl.xlabel("Hz [1/s]",fontsize=20)
pl.ylabel('$C^{2}$ (normalized)', fontsize=20)
pl.legend(fontsize=20)
pl.title('Full window, power spectrum of clean and noisy chromatograms', fontsize=20)
pl.savefig('powerSpectrum.pdf')
pl.show()
Right off the bat I can see that the higher frequencies are not all that interesting and explain very little of the variance observed in the chromatograms. As I suspected, the clean curve has much more of its variance explained at lower frequencies.
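To put a rough number on that claim, here is a quick check using the normalized spectra from the cell above; the 0.02 Hz cutoff is just my choice by eye from the plot.
cutoff = 0.02  # Hz, chosen by eye from the plot
print('fraction of power below %.3f Hz: noisy %.2f, clean %.2f'
      % (cutoff, powerNoisy[fNoisy < cutoff].sum(), powerClean[fClean < cutoff].sum()))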
pl.figure()
pl.plot(fNoisy,powerNoisy,'b', label='Noisy', linewidth=2)
pl.plot(fClean,powerClean, 'r', label='Clean', linewidth=2)
pl.xlabel("Hz $s^{-1}$",fontsize=20)
pl.ylabel('$C^{2}$ (variance explained normalized)', fontsize=20)
pl.legend(fontsize=20)
pl.xlim(0,0.020)
pl.xticks(fontsize=20)
pl.yticks(fontsize=20)
pl.title('Full window, power spectrum of clean and noisy chromatograms', fontsize=20)
pl.savefig('powerSpectrumLimited.pdf')
pl.show()
I think the peak at 0.010 Hz is due to the noise, since it is the strongest peak and shows the biggest difference between the clean and noisy curves. I do not know what to make of this frequency, as I do not know how much higher-frequency waves, like radio, would manifest in the chromatograms.
A brief search has not given me any idea what kind of physical process has this type of frequency.
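Rather than reading the peak off the plot, it can be located programmatically; the band edges below are my own rough choice around the feature, and the reciprocal gives the period.
band = (fNoisy > 0.005) & (fNoisy < 0.02)  # search window around the suspect peak
peakFreq = fNoisy[band][powerNoisy[band].argmax()]
print('peak at %.4f Hz, i.e. a period of roughly %.0f seconds' % (peakFreq, 1. / peakFreq))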
# Now use the spectrum argument to see how it is different
# TODO: Make sure you are windowing this and chunking in a reasonable way but you will only know
# TODO: after you talk with chris
f_clean_welch, power_clean_welch_spec = scipy.signal.welch(cleany, fs=1., window='hann',
                                                           nperseg=200, noverlap=100,
                                                           nfft=N // 2, scaling='spectrum')
f_noisy_welch, power_noisy_welch_spec = scipy.signal.welch(noisy, fs=1., window='hann',
                                                           nperseg=200, noverlap=100,
                                                           nfft=N // 2, scaling='spectrum')
power_clean_welch_spec = power_clean_welch_spec**2
power_clean_welch_spec = power_clean_welch_spec / sum(power_clean_welch_spec)
power_noisy_welch_spec = power_noisy_welch_spec**2
power_noisy_welch_spec = power_noisy_welch_spec / sum(power_noisy_welch_spec)
pl.figure()
pl.plot(f_noisy_welch,power_noisy_welch_spec,'b', label='Noisy', linewidth=2)
pl.plot(f_clean_welch,power_clean_welch_spec,'r', label='Clean', linewidth=2)
pl.xlabel("Hz [1/s]",fontsize=20)
pl.ylabel('$C^{2}$ (normalized)', fontsize=20)
pl.legend(fontsize=20)
pl.xticks(fontsize=20)
pl.yticks(fontsize=20)
pl.xlim(0,1)
pl.title("Welch's method power spectrum of clean and noisy chromatograms", fontsize=20)
pl.savefig('powerSpectrumWelch.pdf')
pl.show()
The chunking strategy does not pay off for my question because it smooths away some of the noise, and the noise is exactly what I am interested in. Still, it serves as a nice demonstration of what chunking does to a power spectrum. It is good to see that the spectra of both chromatograms look the same once the higher-frequency noise is smoothed out.
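To make that trade-off explicit, here is a small sketch comparing a few segment lengths on the noisy series; the nperseg values are illustrative, not tuned.
pl.figure()
for nper in [100, 200, 400]:
    fW, pW = scipy.signal.welch(noisy, fs=1., window='hann', nperseg=nper, noverlap=nper // 2)
    pl.plot(fW, pW / pW.sum(), label='nperseg = %d' % nper)
pl.xlabel('Hz [1/s]', fontsize=20)
pl.ylabel('$C^{2}$ (normalized)', fontsize=20)
pl.legend(fontsize=20)
pl.title('Effect of segment length on the noisy spectrum', fontsize=20)
pl.show()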
noisySubset = noisy[400:700]
cleanSubset = cleany[475:600]
fNoisy, powerNoisy = scipy.signal.periodogram(noisySubset, fs=1., nfft=len(noisySubset))
fClean, powerClean = scipy.signal.periodogram(cleanSubset, fs=1., nfft=len(cleanSubset))
# normalize so the y axis matches the earlier spectra
powerNoisy = powerNoisy / powerNoisy.sum()
powerClean = powerClean / powerClean.sum()
pl.figure()
pl.plot(noisySubset, label='noisy series subset')
pl.title('Noisy time series subset', fontsize=30)
pl.ylabel('Hz [1/s]', fontsize = 20)
pl.xlabel('seconds', fontsize = 20)
pl.savefig('chromNoisyTimeSubset.pdf')
pl.show()
pl.figure()
pl.plot(cleanSubset,'r', label='clean series subset')
pl.title('Clean time series subset', fontsize=30)
pl.ylabel('Hz [1/s]', fontsize = 20)
pl.xlabel('seconds', fontsize = 20)
pl.savefig('chromCleanTimeSubset.pdf')
pl.show()
pl.figure()
pl.plot(fNoisy,powerNoisy,'b', label='Noisy', linewidth=3)
pl.plot(fClean,powerClean,'r', label='Clean', linewidth=3)
pl.xlim(0,0.1)
pl.xlabel("Hz $s^{-1}$",fontsize=20)
pl.ylabel('$C^{2}$ (variance explained normalized)', fontsize=20)
pl.title('Subset power spectrum', fontsize=30)
pl.legend(fontsize=30)
pl.savefig('chromPowerSubset.pdf')
pl.show()
The peaks in the blue curve are the frequencies of the main sources of noise in the PAN data.
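Those peak frequencies can also be pulled out automatically rather than by eye, using scipy's argrelmax on the subset spectrum from above; the 10% threshold is just my guess at separating real peaks from wiggle.
from scipy.signal import argrelmax
peaks = argrelmax(powerNoisy)[0]                            # indices of local maxima
strong = peaks[powerNoisy[peaks] > 0.1 * powerNoisy.max()]  # keep only prominent ones
print('candidate noise frequencies [Hz]:', fNoisy[strong])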
If we accept that we are going to have to live with the noise, it is worth figuring out whether the signal in the noisy data can be preserved by filtering the noise out. Further, it is worth exploring whether filtering out the noise will allow the auto-integration sequences to successfully integrate the previously noisy peaks.
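One standard approach worth trying before anything fancier is an IIR notch filter centered on the suspected noise frequency, run through filtfilt so it adds no phase shift. This is only a sketch: the center frequency and Q factor below are my guesses, not fitted values.
from scipy import signal
fs = 1.0   # sampling frequency of the chromatograms, Hz
f0 = 0.02  # suspected noise frequency from the subset spectrum, Hz
Q = 5.0    # quality factor: lower Q -> wider notch
b, a = signal.iirnotch(f0 / (fs / 2.), Q)  # frequency normalized to Nyquist
notched = signal.filtfilt(b, a, noisy)     # zero-phase filtering
pl.figure()
pl.plot(noisy, 'b', label='noisy')
pl.plot(notched, 'g', label='notch filtered')
pl.legend(fontsize=20)
pl.show()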
# Recall, the noisy data looks like this
pl.figure()
pl.plot(noisy)
pl.show()
# Take the fft using numpy
dataSubsetIndex = range(50, 500)  # cut off the sharp beginnings and ends of the chromatograms
data = noisy[dataSubsetIndex]
Y = np.fft.fft(data)
T = len(data)
k = np.arange(T)
freq = k / float(T)  # bin frequencies in Hz (fs = 1 Hz)
# Find the indices where the noise frequency lives
noiseFreq = [0.02]  # Hz
noiseYIndex = []
width = 3  # number of FFT bins to zero on either side of each noise bin
for nF in noiseFreq:
    # index of the FFT bin closest to this noise frequency
    minIndex = np.abs(freq - nF).argmin()
    noiseYIndex.append(minIndex)
    # give a band of frequencies centered on the noise bin zero weight...
    maskIndex = np.arange(minIndex - width, minIndex + width + 1)
    Y[maskIndex] = 0
    # ...and zero the mirrored negative-frequency bins so the inverse FFT stays real
    Y[(T - maskIndex) % T] = 0
Ck2 = (abs(Y))**2
Ck2_norm = Ck2 / Ck2.sum()
y = np.fft.ifft(Y)
pl.figure()
pl.plot(data, label='data')
pl.plot(y.real, label='filtered data')  # imaginary part is ~0 after symmetric masking
pl.legend()
pl.show()
# TODO: Try this on subsets, since there are many sharp corners between the 600 s cycles
As it turns out, filtering out the noise cleanly will take a significant level of sophistication. I will post updates on my efforts soon.
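One direction I plan to try is the per-cycle idea from the TODO above: filter each 600 s measurement cycle separately so the sharp cycle boundaries do not leak into the spectrum. A minimal sketch, assuming the cycles are exactly 600 samples long and that zeroing a narrow band around 0.02 Hz is the right mask:
cycleLen = 600
filtered = np.empty_like(noisy)
for start in range(0, len(noisy), cycleLen):
    chunk = noisy[start:start + cycleLen]
    Yc = np.fft.rfft(chunk)
    fc = np.fft.rfftfreq(len(chunk), d=1.0)
    Yc[np.abs(fc - 0.02) < 0.003] = 0  # zero a narrow band around 0.02 Hz
    filtered[start:start + cycleLen] = np.fft.irfft(Yc, n=len(chunk))
pl.figure()
pl.plot(noisy, 'b', label='noisy')
pl.plot(filtered, 'g', label='per-cycle filtered')
pl.legend(fontsize=20)
pl.show()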