In [1]:
# Markdown is not rendered in PyCharm.

DeepBird

Xeno Canto

This is a website containing bird song recordings and classification. They have shared part of their collection to Global Biodiversity Information Facility

Terms of use

It is ok to download some files but we still need to contact for downloading the complete set. See the terms

About this project

We want to learn the individual bird sound and try to learn their songs.

For this to happen we need to

  • download the zip containing a csv from GBIF.
  • download samples with their annotations.
  • process the samples into a spectrogram and extract regions of interest.
  • learn the extracted regions.

Fetching

It seems the first item is https://www.xeno-canto.org/1 and currently https://www.xeno-canto.org/460846 the last.

As we have not contacted them yet we collect 5 samples each time we run this page.

In [2]:
import project;

# In case you want to have proces data somewhere else change it
data_dir = project.get_project_dir() + 'data/'
#data_dir = '/media/clemens/Maxtor/xeno-canto/'
project.set_data_dir(data_dir)

num_files = 5

project.print_stats()
Project   : /Users/clemens/Sites/groningenml/deeptweet/
Data      : /Users/clemens/Sites/groningenml/deeptweet/data/
GBIF      : /Users/clemens/Sites/groningenml/deeptweet/data/gbif/
sample    : /Users/clemens/Sites/groningenml/deeptweet/data/xc/
fragments : /Users/clemens/Sites/groningenml/deeptweet/data/fragments/
In [3]:
import gbif

gbif.set_gbif_dir(project.get_gbif_dir())

gbif_id = '0025627-181108115102211'
In [4]:
gbif_csv = gbif.get_data()
Already downloaded zip 0025627-181108115102211
In [5]:
import pandas as pd

df = pd.read_csv(gbif_csv, sep='\t')

project.fix_gbif_df(df)

# Do we have data?
df.head()
Out[5]:
gbifID datasetKey occurrenceID kingdom phylum class order family genus species ... dateIdentified license rightsHolder recordedBy typeStatus establishmentMeans lastInterpreted mediaType issue XC_ID
0 1934871083 b1047888-ae52-4179-9dd5-5448ea342a24 http://data.biodiversitydata.nl/xeno-canto/obs... Animalia Chordata Aves Passeriformes Parulidae Seiurus Seiurus aurocapilla ... NaN CC_BY_NC_4_0 Mike Nelson Mike Nelson NaN NaN 2019-01-03T10:18:26.934Z SOUND;STILLIMAGE NaN 100119
1 1934871084 b1047888-ae52-4179-9dd5-5448ea342a24 http://data.biodiversitydata.nl/xeno-canto/obs... Animalia Chordata Aves Passeriformes Turdidae Catharus Catharus guttatus ... NaN CC_BY_NC_4_0 Mike Nelson Mike Nelson NaN NaN 2019-01-03T10:18:26.933Z SOUND;STILLIMAGE NaN 100113
2 1934871085 b1047888-ae52-4179-9dd5-5448ea342a24 http://data.biodiversitydata.nl/xeno-canto/obs... Animalia Chordata Aves Passeriformes Parulidae Setophaga Setophaga ruticilla ... NaN CC_BY_NC_4_0 Mike Nelson Mike Nelson NaN NaN 2019-01-03T10:18:26.936Z SOUND;STILLIMAGE NaN 100082
3 1934871086 b1047888-ae52-4179-9dd5-5448ea342a24 http://data.biodiversitydata.nl/xeno-canto/obs... Animalia Chordata Aves Passeriformes Vireonidae Vireo Vireo gilvus ... NaN CC_BY_NC_4_0 Andrew Spencer Andrew Spencer NaN NaN 2019-01-03T10:18:27.014Z SOUND;STILLIMAGE NaN 100053
4 1934871087 b1047888-ae52-4179-9dd5-5448ea342a24 http://data.biodiversitydata.nl/xeno-canto/obs... Animalia Chordata Aves Passeriformes Icteridae Dolichonyx Dolichonyx oryzivorus ... NaN CC_BY_NC_4_0 Mike Nelson Mike Nelson NaN NaN 2019-01-03T10:18:26.932Z SOUND;STILLIMAGE NaN 100089

5 rows × 46 columns

In [6]:
# We have some classification of the birg
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170041 entries, 0 to 170040
Data columns (total 46 columns):
gbifID                           170041 non-null int64
datasetKey                       170041 non-null object
occurrenceID                     170041 non-null object
kingdom                          170041 non-null object
phylum                           168749 non-null object
class                            168749 non-null object
order                            168749 non-null object
family                           168749 non-null object
genus                            168749 non-null object
species                          168535 non-null object
infraspecificEpithet             39389 non-null object
taxonRank                        170041 non-null object
scientificName                   170041 non-null object
countryCode                      169447 non-null object
locality                         170040 non-null object
publishingOrgKey                 170041 non-null object
decimalLatitude                  166100 non-null float64
decimalLongitude                 166100 non-null float64
coordinateUncertaintyInMeters    0 non-null float64
coordinatePrecision              0 non-null float64
elevation                        0 non-null float64
elevationAccuracy                0 non-null float64
depth                            0 non-null float64
depthAccuracy                    0 non-null float64
eventDate                        168070 non-null object
day                              168070 non-null float64
month                            168070 non-null float64
year                             168070 non-null float64
taxonKey                         170041 non-null int64
speciesKey                       168535 non-null float64
basisOfRecord                    170041 non-null object
institutionCode                  0 non-null float64
collectionCode                   170041 non-null object
catalogNumber                    170041 non-null object
recordNumber                     0 non-null float64
identifiedBy                     0 non-null float64
dateIdentified                   0 non-null float64
license                          170041 non-null object
rightsHolder                     170041 non-null object
recordedBy                       170041 non-null object
typeStatus                       0 non-null float64
establishmentMeans               0 non-null float64
lastInterpreted                  170041 non-null object
mediaType                        170041 non-null object
issue                            12938 non-null object
XC_ID                            170041 non-null object
dtypes: float64(18), int64(2), object(26)
memory usage: 59.7+ MB
In [7]:
# occurenceID references to http://data.biodiversitydata.nl/xeno-canto/observation/XC######
# fetching this resource redirects to the page and not the download link

df['occurrenceID'].head().map(lambda x: x.rsplit('/', 1)[1])
Out[7]:
0    XC100119
1    XC100113
2    XC100082
3    XC100053
4    XC100089
Name: occurrenceID, dtype: object
In [8]:
df['XC_ID'].head()
Out[8]:
0    100119
1    100113
2    100082
3    100053
4    100089
Name: XC_ID, dtype: object
In [9]:
xc_id='100082'

print('FP:', project.get_fragments_path(df, xc_id))
import split


args = dict(split.defaults)

args['silence_threshold'] = 0.01
args['min_silence_length'] = 1.0
args['dry_run'] = False

print('Data:', project.get_data_dir())
print('Fragments:',project.get_fragments_dir())
project.build_fragments(df.head(num_files), args)
  3%|â–Ž         | 38/1510 [00:00<00:03, 371.13it/s]
FP: /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Parulidae/Setophaga/Setophaga ruticilla/
Data: /Users/clemens/Sites/groningenml/deeptweet/data/
Fragments: /Users/clemens/Sites/groningenml/deeptweet/data/fragments/
Already downloaded 100113 /Users/clemens/Sites/groningenml/deeptweet/data/xc/100113.mp3
Already converted 100113 /Users/clemens/Sites/groningenml/deeptweet/data/xc/100113.wav
/Users/clemens/Sites/groningenml/deeptweet/data/xc/100113.wav file:///Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/
Splitting /Users/clemens/Sites/groningenml/deeptweet/data/xc/100113.wav where energy is below 1.0% for longer than 1.0s.
Finding silences...
 99%|█████████▉| 1501/1510 [00:04<00:00, 375.00it/s]
100%|██████████| 41/41 [00:00<00:00, 864.03it/s]
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_000.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_001.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_002.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_003.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_004.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_005.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_006.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_007.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_008.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_009.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_010.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_011.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_012.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_013.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_014.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_015.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_016.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_017.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_018.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_019.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_020.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_021.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_022.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_023.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_024.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_025.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_026.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_027.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_028.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_029.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_030.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_031.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_032.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_033.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_034.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_035.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_036.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_037.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_038.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_039.wav
Writing file /Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Turdidae/Catharus/Catharus guttatus/100113_040.wav

In [10]:
import XenoCanto as xc
#import importlib; importlib.reload(XenoCanto)

xc.set_dir(project.get_sample_dir())
In [11]:
for id in df['XC_ID'].head(num_files):
    xc.convert_mp3_to_wav(id)
Already downloaded 100119 /Users/clemens/Sites/groningenml/deeptweet/data/xc/100119.mp3
Already converted 100119 /Users/clemens/Sites/groningenml/deeptweet/data/xc/100119.wav
Already downloaded 100113 /Users/clemens/Sites/groningenml/deeptweet/data/xc/100113.mp3
Already converted 100113 /Users/clemens/Sites/groningenml/deeptweet/data/xc/100113.wav
Already downloaded 100082 /Users/clemens/Sites/groningenml/deeptweet/data/xc/100082.mp3
Already converted 100082 /Users/clemens/Sites/groningenml/deeptweet/data/xc/100082.wav
Already downloaded 100053 /Users/clemens/Sites/groningenml/deeptweet/data/xc/100053.mp3
Already converted 100053 /Users/clemens/Sites/groningenml/deeptweet/data/xc/100053.wav
Already downloaded 100089 /Users/clemens/Sites/groningenml/deeptweet/data/xc/100089.mp3
Already converted 100089 /Users/clemens/Sites/groningenml/deeptweet/data/xc/100089.wav
In [12]:
from scipy.io import wavfile
In [13]:
id = '100113'
fs, data = wavfile.read(xc.get_wav_file(id))
print( 'data', data.shape)
print( 'Duration:', data.shape[0] / fs)
print( 'Channels:', data.shape[1])
data (6659712, 2)
Duration: 151.0138775510204
Channels: 2
In [14]:
import matplotlib.pyplot as plt
In [15]:
# Is this stereo recording useful?

plt.plot(data)
plt.show()
In [16]:
data[:,0][200000:200100]
Out[16]:
array([-1276, -1813, -2581, -2862, -2677, -2004, -1890, -2836, -3270,
       -2883, -2558, -1736,  -712,  -547,  -367,  -103,  -718,  -824,
         212,   633,   506,   950,  1695,  2366,  2503,  1950,  1297,
         412,  -334,   -66,   218,    71,   334,   370,    61,   303,
         618,   584,   -87, -1485, -1716,  -665,  -389,   -91,   655,
          11,  -974,  -559,   369,  1062,   974,   204,   286,   600,
        -199,  -624,    52,   734,  1217,  1208,   703,   924,  1660,
        1728,  1410,   905,  -181, -1042,  -999, -1045, -1621, -1694,
       -1192, -1022,  -873,  -371,  -505, -1251, -1155,  -437,  -405,
        -695,  -688,  -720,  -514,  -140,  -463,  -896,  -798,  -838,
        -691,  -204,  -381,  -606,  -209,    28,   381,   807,   418,
         -13], dtype=int16)
In [17]:
data[:,1][200000:200100]
Out[17]:
array([-1276, -1813, -2581, -2862, -2677, -2004, -1890, -2836, -3270,
       -2883, -2558, -1736,  -712,  -547,  -367,  -103,  -718,  -824,
         212,   633,   506,   950,  1695,  2366,  2503,  1950,  1297,
         412,  -334,   -66,   218,    71,   334,   370,    61,   303,
         618,   584,   -87, -1485, -1716,  -665,  -389,   -91,   655,
          11,  -974,  -559,   369,  1062,   974,   204,   286,   600,
        -199,  -624,    52,   734,  1217,  1208,   703,   924,  1660,
        1728,  1410,   905,  -181, -1042,  -999, -1045, -1621, -1694,
       -1192, -1022,  -873,  -371,  -505, -1251, -1155,  -437,  -405,
        -695,  -688,  -720,  -514,  -140,  -463,  -896,  -798,  -838,
        -691,  -204,  -381,  -606,  -209,    28,   381,   807,   418,
         -13], dtype=int16)
In [18]:
# FIX ME: what are min and max

diff = data[:,0] - data[:,1]
diff[200000:200100]
Out[18]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int16)
In [19]:
import numpy as np

plt.plot(diff)
plt.show()
In [20]:
### https://shallowsky.com/blog/programming/sonograms-in-python.html
# https://matplotlib.org/examples/pylab_examples/specgram_demo.html

Pxx, freqs, bins, im = plt.specgram(data[0:fs*2,0], Fs=5000)#, NFFT=1024, noverlap=900)
plt.show()
In [21]:
Pxx, freqs, bins, im = plt.specgram(data[:,1], Fs=500)#, NFFT=1024, noverlap=900)
plt.show()
/Users/clemens/.local/share/virtualenvs/deeptweet-eGBalDS9/lib/python3.6/site-packages/matplotlib/axes/_axes.py:7609: RuntimeWarning: divide by zero encountered in log10
  Z = 10. * np.log10(spec)
In [22]:
from scipy import signal
from scipy.io import wavfile
import scipy.io.wavfile

M = 1024

freqs, times, spect = signal.spectrogram(data[:,0], fs=fs, window='hanning',
                                  nperseg=1024, noverlap=M - 100,
                                  detrend=False, scaling='spectrum')
In [23]:
freqs.size
Out[23]:
513
In [24]:
freqs[0:20]
Out[24]:
array([  0.        ,  43.06640625,  86.1328125 , 129.19921875,
       172.265625  , 215.33203125, 258.3984375 , 301.46484375,
       344.53125   , 387.59765625, 430.6640625 , 473.73046875,
       516.796875  , 559.86328125, 602.9296875 , 645.99609375,
       689.0625    , 732.12890625, 775.1953125 , 818.26171875])
In [25]:
times.size
Out[25]:
66587
In [26]:
plt.plot(spect)
plt.show()
In [ ]:
 
In [ ]: