In [1]:
import project

import gbif
import pandas as pd
In [2]:
# Point the project helpers at the data directory, then load the GBIF
# occurrence export (tab-separated) into a DataFrame.
# To keep the data elsewhere (e.g. on an external drive), change data_dir here.
data_dir = project.get_project_dir() + 'data/'

project.set_data_dir(data_dir)
gbif.set_gbif_dir(project.get_gbif_dir())

# Resolve the export once and reuse it: a second gbif.get_data() call only
# repeats the "Already downloaded zip" check without adding anything.
gbif_csv = gbif.get_data()
df = pd.read_csv(gbif_csv, sep='\t')

# Called for its effect on df (the return value is not used).
# NOTE(review): the XC_ID column shown below is presumably added here - confirm.
project.fix_gbif_df(df)

df['XC_ID'].head()
Already downloaded zip 0025627-181108115102211
Already downloaded zip 0025627-181108115102211
Out[2]:
0    100119
1    100113
2    100082
3    100053
4    100089
Name: XC_ID, dtype: object
In [3]:
# Column overview: non-null count and dtype for every column, plus memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170041 entries, 0 to 170040
Data columns (total 46 columns):
gbifID                           170041 non-null int64
datasetKey                       170041 non-null object
occurrenceID                     170041 non-null object
kingdom                          170041 non-null object
phylum                           168749 non-null object
class                            168749 non-null object
order                            168749 non-null object
family                           168749 non-null object
genus                            168749 non-null object
species                          168535 non-null object
infraspecificEpithet             39389 non-null object
taxonRank                        170041 non-null object
scientificName                   170041 non-null object
countryCode                      169447 non-null object
locality                         170040 non-null object
publishingOrgKey                 170041 non-null object
decimalLatitude                  166100 non-null float64
decimalLongitude                 166100 non-null float64
coordinateUncertaintyInMeters    0 non-null float64
coordinatePrecision              0 non-null float64
elevation                        0 non-null float64
elevationAccuracy                0 non-null float64
depth                            0 non-null float64
depthAccuracy                    0 non-null float64
eventDate                        168070 non-null object
day                              168070 non-null float64
month                            168070 non-null float64
year                             168070 non-null float64
taxonKey                         170041 non-null int64
speciesKey                       168535 non-null float64
basisOfRecord                    170041 non-null object
institutionCode                  0 non-null float64
collectionCode                   170041 non-null object
catalogNumber                    170041 non-null object
recordNumber                     0 non-null float64
identifiedBy                     0 non-null float64
dateIdentified                   0 non-null float64
license                          170041 non-null object
rightsHolder                     170041 non-null object
recordedBy                       170041 non-null object
typeStatus                       0 non-null float64
establishmentMeans               0 non-null float64
lastInterpreted                  170041 non-null object
mediaType                        170041 non-null object
issue                            12938 non-null object
XC_ID                            170041 non-null object
dtypes: float64(18), int64(2), object(26)
memory usage: 59.7+ MB
In [4]:
# Peek at the first five rows of the loaded table.
df.head()
Out[4]:
gbifID datasetKey occurrenceID kingdom phylum class order family genus species ... dateIdentified license rightsHolder recordedBy typeStatus establishmentMeans lastInterpreted mediaType issue XC_ID
0 1934871083 b1047888-ae52-4179-9dd5-5448ea342a24 http://data.biodiversitydata.nl/xeno-canto/obs... Animalia Chordata Aves Passeriformes Parulidae Seiurus Seiurus aurocapilla ... NaN CC_BY_NC_4_0 Mike Nelson Mike Nelson NaN NaN 2019-01-03T10:18:26.934Z SOUND;STILLIMAGE NaN 100119
1 1934871084 b1047888-ae52-4179-9dd5-5448ea342a24 http://data.biodiversitydata.nl/xeno-canto/obs... Animalia Chordata Aves Passeriformes Turdidae Catharus Catharus guttatus ... NaN CC_BY_NC_4_0 Mike Nelson Mike Nelson NaN NaN 2019-01-03T10:18:26.933Z SOUND;STILLIMAGE NaN 100113
2 1934871085 b1047888-ae52-4179-9dd5-5448ea342a24 http://data.biodiversitydata.nl/xeno-canto/obs... Animalia Chordata Aves Passeriformes Parulidae Setophaga Setophaga ruticilla ... NaN CC_BY_NC_4_0 Mike Nelson Mike Nelson NaN NaN 2019-01-03T10:18:26.936Z SOUND;STILLIMAGE NaN 100082
3 1934871086 b1047888-ae52-4179-9dd5-5448ea342a24 http://data.biodiversitydata.nl/xeno-canto/obs... Animalia Chordata Aves Passeriformes Vireonidae Vireo Vireo gilvus ... NaN CC_BY_NC_4_0 Andrew Spencer Andrew Spencer NaN NaN 2019-01-03T10:18:27.014Z SOUND;STILLIMAGE NaN 100053
4 1934871087 b1047888-ae52-4179-9dd5-5448ea342a24 http://data.biodiversitydata.nl/xeno-canto/obs... Animalia Chordata Aves Passeriformes Icteridae Dolichonyx Dolichonyx oryzivorus ... NaN CC_BY_NC_4_0 Mike Nelson Mike Nelson NaN NaN 2019-01-03T10:18:26.932Z SOUND;STILLIMAGE NaN 100089

5 rows × 46 columns

In [5]:
# Distinct kingdom values present in the download.
df['kingdom'].unique()
Out[5]:
array(['Animalia'], dtype=object)
In [6]:
# Distinct phylum values; unique() keeps NaN, so missing classifications show up here.
df['phylum'].unique()
Out[6]:
array(['Chordata', nan], dtype=object)
In [7]:
# Number of records that have no phylum.
df['phylum'].isnull().sum()
Out[7]:
1292
In [8]:
# Some records in the GBIF download are unclassified (their phylum is missing).
# Keep just their gbifID and XC_ID and write them to a tab-separated file.
is_unclassified = df['phylum'].isnull()
df_null_classification = df.loc[is_unclassified, ['gbifID', 'XC_ID']]

null_csv_path = project.get_gbif_dir() + 'gbif_null_classifications.csv'
df_null_classification.to_csv(null_csv_path, sep='\t')

df_null_classification.head()
Out[8]:
gbifID XC_ID
2028 1934873111 107195
2194 1934873277 107628
2196 1934873279 107637
2203 1934873286 107788
2209 1934873292 107789
In [9]:
# Distinct class values (NaN is included for records without a class).
df['class'].unique()
Out[9]:
array(['Aves', nan], dtype=object)
In [10]:
# Summary of the class column; `count` covers non-null entries only.
df['class'].describe()
Out[10]:
count     168749
unique         1
top         Aves
freq      168749
Name: class, dtype: object
In [11]:
# Distinct taxonomic orders present (NaN is included for records without an order).
df['order'].unique()
Out[11]:
array(['Passeriformes', 'Cuculiformes', 'Gruiformes', 'Columbiformes',
       'Piciformes', 'Galliformes', 'Charadriiformes',
       'Procellariiformes', 'Pelecaniformes', 'Trogoniformes',
       'Psittaciformes', 'Accipitriformes', 'Anseriformes',
       'Caprimulgiformes', 'Strigiformes', 'Apodiformes', 'Coraciiformes',
       'Bucerotiformes', 'Ciconiiformes', 'Podicipediformes',
       'Falconiformes', 'Musophagiformes', 'Tinamiformes', nan,
       'Gaviiformes', 'Eurypygiformes', 'Suliformes', 'Cariamiformes',
       'Sphenisciformes', 'Phaethontiformes', 'Opisthocomiformes',
       'Phoenicopteriformes', 'Leptosomiformes', 'Pteroclidiformes',
       'Casuariiformes', 'Otidiformes', 'Coliiformes',
       'Mesitornithiformes', 'Rheiformes', 'Struthioniformes',
       'Apterygiformes'], dtype=object)
In [12]:
# Number of distinct family values; dropna=False keeps NaN in the count,
# so records without a family contribute one to the total.
df['family'].nunique(dropna=False)
Out[12]:
224
In [13]:
# Number of distinct genus values; dropna=False keeps NaN in the count,
# so records without a genus contribute one to the total.
df['genus'].nunique(dropna=False)
Out[13]:
2024
In [14]:
# Number of distinct species values; dropna=False keeps NaN in the count,
# so records without a species contribute one to the total.
df['species'].nunique(dropna=False)
Out[14]:
8503
In [15]:
# XC_ID is an object-dtype column holding strings, so match against the string id.
df.loc[df['XC_ID'] == '100119'].head()
Out[15]:
gbifID datasetKey occurrenceID kingdom phylum class order family genus species ... dateIdentified license rightsHolder recordedBy typeStatus establishmentMeans lastInterpreted mediaType issue XC_ID
0 1934871083 b1047888-ae52-4179-9dd5-5448ea342a24 http://data.biodiversitydata.nl/xeno-canto/obs... Animalia Chordata Aves Passeriformes Parulidae Seiurus Seiurus aurocapilla ... NaN CC_BY_NC_4_0 Mike Nelson Mike Nelson NaN NaN 2019-01-03T10:18:26.934Z SOUND;STILLIMAGE NaN 100119

1 rows × 46 columns

In [16]:
# Ask the project helper for the fragments directory of recording '100119'.
# Per the output, the path nests the record's taxonomy (kingdom/.../species).
project.get_fragments_path(df, '100119')
Out[16]:
'/Users/clemens/Sites/groningenml/deeptweet/data/fragments/Animalia/Chordata/Aves/Passeriformes/Parulidae/Seiurus/Seiurus aurocapilla/'
In [ ]: