Source code for py4pm.chemutilities

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import pandas as pd
import seaborn as sns
import sqlite3
from py4pm.dateutilities import add_season

MAPPER_METALS_NAME_TO_SYMBOLE = {
    "Potassium": "K",
    "Calcium": "Ca",
    "Titanium": "Ti",
    "Vanadium": "V",
    "Chromium": "Cr",
    "Manganese": "Mn",
    "Iron": "Fe",
    "Cobalt": "Co",
    "Nickel": "Ni",
    "Copper": "Cu",
    "Zinc": "Zn",
    "Gallium": "Ga",
    "Germanium": "Ge",
    "Arsenic": "As",
    "Selenium": "Se",
    "Bromine": "Br",
    "Yttrium": "Yt",
    "Molybdenum": "Mo",
    "Cadmium": "Cd",
    "Tin": "Sn",
    "Antimony": "Sb",
    "Mercury": "Hg",
    "Thallium": "Tl",
    "Lead": "Pb",
    "Bismuth": "Bi",
}

[docs]def replace_QL(dftmp, species=None, conn=None): """Replace the -1 and -2 in the dataframe by the appropriate DL and QL values The change are done inplace. :dftmp: pandas DataFrame """ stations = dftmp.station.unique() if species is None: species = dftmp.columns if conn is None: conn = sqlite3.connect("/home/webersa/Documents/BdD/BdD_PM/aerosols.db") sqlquery = """ SELECT {sp} FROM QL WHERE station IN ("{stations}") AND "sample ID" LIKE "%QL%"; """.format( sp='", "'.join(species), stations='", "'.join(stations) ) print(sqlquery) QLtmp = pd.read_sql(sqlquery, con=conn) print(QLtmp) conn.close() QLtmp = QLtmp.apply(pd.to_numeric, errors='ignore').dropna(how="all", axis=1) for station in stations: QLtmpmean = QLtmp[QLtmp.station==station].mean() to_replace = { c: {-2: QLtmpmean[c]/2, -1: QLtmpmean[c]/2} for c in QLtmpmean.index } for c in dftmp.columns: if c not in species: continue if (c in to_replace.keys()) and (pd.notna(to_replace[c][-1])): idx = dftmp.station == station dftmp.loc[idx, c] = dftmp.loc[idx, c].clip_lower(to_replace[c][-1])
[docs]def get_sourceColor(source=None): """Return the hexadecimal color of the source(s) If no option, then return the whole dictionary Optional Parameters =================== source : str The name of the source """ color = { "Traffic": "#000000", "Traffic 1": "#000000", "Traffic 2": "#102262", "Road traffic": "#000000", "Primary traffic": "#000000", "Traffic_ind": "#000000", "Traffic_exhaust": "#000000", "Traffic_dir": "#444444", "Traffic_non-exhaust": "#444444", "Resuspended_dust": "#444444", "Oil/Vehicular": "#000000", "Road traffic/oil combustion": "#000000", "Biomass_burning": "#92d050", "Biomass burning": "#92d050", "Biomass_burning1": "#92d050", "Biomass_burning2": "#92d050", "Sulfate-rich": "#ff2a2a", "Sulfate_rich": "#ff2a2a", "Sulfate rich": "#ff2a2a", "Nitrate-rich": "#217ecb", # "#ff7f2a", "Nitrate_rich": "#217ecb", # "#ff7f2a", "Nitrate rich": "#217ecb", # "#ff7f2a", "Secondary_inorganics": "#0000cc", "MSA_rich": "#ff7f2a", # 8c564b", "Secondary_oxidation": "#ff87dc", "Marine SOA": "#ff7f2a", # 8c564b", "Biogenic SOA": "#8c564b", "Anthropogenic SOA": "#8c564b", "Marine/HFO": "#a37f15", #8c564b", "Aged seasalt/HFO": "#8c564b", "Marine_biogenic": "#fc564b", "HFO": "#70564b", "HFO (stainless)": "#70564b", "Oil": "#70564b", "Vanadium rich": "#70564b", "Cadmium rich": "#70564b", "Marine": "#33b0f6", "Marin": "#33b0f6", "Salt": "#00b0f0", "Seasalt": "#00b0f0", "Sea-road salt": "#209ecc", "Sea/road salt": "#209ecc", "Fresh seasalt": "#00b0f0", "Aged_salt": "#97bdff", #00b0f0", "Aged seasalt": "#97bdff", #00b0f0", "Fungal spores": "#ffc000", "Primary_biogenic": "#ffc000", "Primary biogenic": "#ffc000", "Biogenique": "#ffc000", "Biogenic": "#ffc000", "Dust": "#dac6a2", "Mineral dust": "#dac6a2", "Crustal_dust": "#dac6a2", "Industrial": "#7030a0", "Industries": "#7030a0", "Indus/veh": "#5c304b", "Industry/traffic": "#5c304b", #7030a0", "Arcellor": "#7030a0", "Siderurgie": "#7030a0", "Plant debris": "#2aff80", "Plant_debris": "#2aff80", "Débris végétaux": "#2aff80", "Choride": "#80e5ff", "PM other": "#cccccc", "Traffic/dust (Mix)": "#333333", "SOA/sulfate (Mix)": "#6c362b", "Sulfate rich/HFO": "#8c56b4", "nan": "#ffffff" } color = pd.DataFrame(index=["color"], data=color) if source: if source not in color.keys(): print("WARNING: no {} found in colors".format(source)) return "#666666" return color.loc["color", source] else: return color
[docs]def get_sourcesCategories(profiles): """Get the sources category according to the sources name. Ex. Aged sea salt → Aged_sea_salt :profiles: list :returns: list """ possible_sources = { "Vehicular": "Traffic", "VEH": "Traffic", "VEH ind": "Traffic_ind", "Traffic_exhaust": "Traffic_exhaust", "Traffic_non-exhaust": "Traffic_non-exhaust", "VEH dir": "Traffic_dir", "Oil/Vehicular": "Traffic", "Oil": "Oil", "Vanadium rich": "Vanadium rich", "Road traffic/oil combustion": "Traffic", "Traffic": "Road traffic", "Traffic 1": "Traffic 1", "Traffic 2": "Traffic 2", "Primary traffic": "Road traffic", "Road traffic": "Road traffic", "Road trafic": "Road traffic", "Road traffic/dust": "Traffic/dust (Mix)", "Bio. burning": "Biomass_burning", "Bio burning": "Biomass_burning", "Comb fossile/biomasse": "Biomass_burning", "BB": "Biomass_burning", "Biomass_burning": "Biomass_burning", "Biomass Burning": "Biomass_burning", "Biomass burning": "Biomass_burning", "BB1": "Biomass_burning1", "BB2": "Biomass_burning2", "Sulfate-rich": "Sulfate_rich", "Sulphate-rich": "Sulfate_rich", "Nitrate-rich": "Nitrate_rich", "Sulfate rich": "Sulfate_rich", "Sulfate_rich": "Sulfate_rich", "Nitrate rich": "Nitrate_rich", "Nitrate_rich": "Nitrate_rich", "Secondary inorganics": "Secondary_inorganics", "Secondaire": "MSA_rich", "Secondary bio": "MSA_rich", "Secondary biogenic": "MSA_rich", "Secondary organic": "MSA_rich", "Secondary oxidation": "Secondary_oxidation", "Secondaire organique": "MSA_rich", # "Marine SOA": "Marine SOA", "Marine SOA": "MSA_rich", "MSA_rich": "MSA_rich", "MSA rich": "MSA_rich", "Secondary biogenic/sulfate": "SOA/sulfate (Mix)", "Marine SOA/SO4": "SOA/sulfate (Mix)", "Marine/HFO": "Marine/HFO", "Marine biogenic/HFO": "Marine/HFO", "Secondary biogenic/HFO": "Marine/HFO", "Marine bio/HFO": "Marine/HFO", "Marin bio/HFO": "Marine/HFO", "Sulfate rich/HFO": "Marine/HFO", "Marine secondary": "MSA_rich", "Marin secondaire": "MSA_rich", "HFO": "HFO", "HFO (stainless)": "HFO", "Marin": "MSA_rich", "Sea/road salt": "Sea-road salt", "Sea-road salt": "Sea-road salt", "sea-road salt": "Sea-road salt", "Road salt": "Salt", "Sea salt": "Salt", "Seasalt": "Salt", "Salt": "Salt", "Fresh seasalt": "Salt", "Sels de mer": "Salt", "Aged_salt": "Aged_salt", "Aged sea salt": "Aged_salt", "Aged seasalt": "Aged_salt", "Aged seasalt": "Aged_salt", "Aged salt": "Aged_salt", "Primary_biogenic": "Primary_biogenic", "Primary bio": "Primary_biogenic", "Primary biogenic": "Primary_biogenic", "Biogénique primaire": "Primary_biogenic", "Biogenique": "Primary_biogenic", "Biogenic": "Primary_biogenic", "Mineral dust": "Dust", "Mineral dust ": "Dust", "Resuspended_dust": "Resuspended_dust", "Resuspended dust": "Resuspended_dust", "Dust": "Dust", "Crustal dust": "Dust", "Dust (mineral)": "Dust", "Dust/biogénique marin": "Dust", "AOS/dust": "Dust", "Industrial": "Industrial", "Industry": "Industrial", "Industrie": "Industrial", "Industries": "Industrial", "Industry/vehicular": "Industry/traffic", "Industry/traffic": "Industry/traffic", "Industries/trafic": "Industry/traffic", "Cadmium rich": "Cadmium rich", "Fioul lourd": "HFO", "Arcellor": "Industrial", "Siderurgie": "Industrial", "Débris végétaux": "Plant_debris", "Chlorure": "Chloride", "PM other": "Other" } s = [possible_sources[k] for k in profiles] return s
[docs]def get_site_typology(): import collections site_typologie = collections.OrderedDict() site_typologie["Urban"] = ["Talence", "Lyon", "Poitiers", "Nice", "MRS-5av", "PdB", "Aix-en-provence", "Nogent", "Poitiers", "Lens-2011-2012", "Lens-2013-2014", "Lens", "Rouen"] site_typologie["Valley"] = ["Chamonix", "Passy", "Marnaz", "GRE-cb", "VIF", "GRE-fr", "Passy_decombio"] site_typologie["Traffic"] = ["Roubaix", "STG-cle"] site_typologie["Rural"] = ["Revin", "Peyrusse", "ANDRA-PM10", "ANDRA-PM2.5"] site_typologie_SOURCES = collections.OrderedDict() site_typologie_SOURCES["Urban"] = [ "LEN", "LY", "MRS", "NGT", "NIC", "POI", "PdB", "PROV", "TAL", "ROU" ] site_typologie_SOURCES["Valley"] = ["CHAM", "GRE"] site_typologie_SOURCES["Traffic"] = ["RBX", "STRAS"] site_typologie_SOURCES["Rural"] = ["REV"] for typo in site_typologie.keys(): site_typologie[typo] += site_typologie_SOURCES[typo] return site_typologie
[docs]def get_OC_from_OC_star_and_organic(df): """ Re-compute OC taking into account the organic species OC = OC* + sum(eqC_sp) """ OC = df.loc['OC*'].copy() equivC = { 'Oxalate': 0.27, 'Arabitol': 0.40, 'Mannitol': 0.40, 'Sorbitol': 0.40, 'Polyols': 0.40, 'Levoglucosan': 0.44, 'Mannosan': 0.44, 'Galactosan': 0.44, 'MSA': 0.12, 'Glucose': 0.44, 'Cellulose': 0.44, 'Maleic': 0.41, 'Succinic': 0.41, 'Citraconic': 0.46, 'Glutaric': 0.45, 'Oxoheptanedioic': 0.48, 'MethylSuccinic': 0.53, 'Adipic': 0.49, 'Methylglutaric': 0.49, '3-MBTCA': 0.47, 'Phtalic': 0.58, 'Pinic': 0.58, 'Suberic': 0.55, 'Azelaic': 0.57, 'Sebacic': 0.59, } for sp in equivC.keys(): if sp in df.index: OC += df.loc[sp] * equivC[sp] return OC
[docs]def get_sample_where(sites=None, date_min=None, date_max=None, species=None, min_sample=None, particle_size=None, con=None): """Get dataframe that meet conditions :sites: TODO :date_min: TODO :date_max: TODO :min_sample: int, minimum samples size :particle_size: :con: sqlite3 connection :returns: TODO """ df = pd.read_sql("SELECT * FROM values_all;", con=con) df["Date"] = pd.to_datetime(df["Date"]) if date_min: df = df.loc[date_min < df["Date"]] if date_max: df = df.loc[df["Date"] < date_max] if species: df = df.loc[df[species].notnull().all(axis=1)] if particle_size: df["Station"] = df["Station"]+"—"+df["Particle_size"] if min_sample: keep_stations = df.groupby("Station").size() keep_stations = list(keep_stations.loc[keep_stations > min_sample].index) df = df.loc[df["Station"].isin(keep_stations)] return df
def _format_ions(text): map_ions = { "Cl-": "Cl$^-$", "Na+": "Na$^+$", "K+": "K$^+$", "NO3-": "NO$_3^-$", "NH4+": "NH$_4^+$", "SO42-": "SO$_4^{2-}$", "Mg2+": "Mg$^{2+}$", "Ca2+": "Ca$^{2+}$", "nss-SO42-": "nss-SO$_4^{2-}$" } if text in map_ions.keys(): return map_ions[text] else: return text
[docs]def format_ions(text): if isinstance(text, list): mapped = [_format_ions(x) for x in text] elif isinstance(text, str): mapped = _format_ions(text) else: raise KeyError( "`text` must be a {x,y}ticklabels, a list of string or string" ) return mapped
[docs]class plot(): def _mainComponentOfPM(dff, station): COLORS = { "OM": "#008000", "EC": "#000000", "Cl-": "#59B2B2", "NO3-": "#0000FF", "SO42-": "#FF0000", "NH4+": "#FF8000", "Ca2+": "#CED770", "Other ions": "#710077", "Metals": "#804000", "Anhydrous monosaccharides": "#004000", "Organic acids": "#CE9E8E", "Polyols": "#A0A015", "Oxalate": "#7D0000", "MSA": "#2D00BB", "Glucose": "#4B8A08", "Cellulose": "#0B3B0B", "HULIS": "#58ACFA" } TEXTCOLORS = { "OM": "#000000", "EC": "#FFFFFF", "Cl-": "#000000", "NO3-": "#FFFFFF", "SO42-": "#000000", "NH4+": "#000000", "Ca2+": "#000000", "Other ions": "#FFFFFF", "Metals": "#FFFFFF", "Anhydrous monosaccharides": "#FFFFFF", "Organic acids": "#000000", "Polyols": "#000000", "Oxalate": "#FFFFFF", "MSA": "#FFFFFF", "Glucose": "#FFFFFF", "Cellulose": "#FFFFFF", "HULIS": "#000000" } ORGANICS = ["HULIS", "Anhydrous monosaccharides", "Polyols", "Organic acids", "Oxalate", "MSA", "Glucose", "Cellulose"] # 2 dataframes: one for the 'main' components, one for the organics df_proportion_perday = pd.DataFrame() nonorganics = list(set(dff.columns)-set(ORGANICS)) for c in nonorganics: df_proportion_perday[c] = dff[c]/dff[nonorganics].sum(axis=1) df_proportion_OM_perday = pd.DataFrame() for c in dff.columns: if c in ORGANICS: df_proportion_OM_perday[c] = dff[c]/dff["OM"] d = pd.DataFrame(index=list(df_proportion_OM_perday.columns) + list(df_proportion_perday.columns)) d["other"] = df_proportion_perday.median() d["organics"] = df_proportion_OM_perday.median() d.loc[ORGANICS, "other"] = pd.np.nan df_mg_per_gOM = df_proportion_OM_perday.median() * 1000 # Plot part order1 = ["OM", "EC", "Cl-", "NO3-", "SO42-", "NH4+", "Ca2+", "Other ions", "Metals"] order2 = ORGANICS.copy() d = d.reindex(order1+order2) d.dropna(axis=0, how="all", inplace=True) OMidentified = df_proportion_OM_perday.median().sum() * 100 dnormalize = d/d.sum() * 100 # d1 = d["other"].reindex(order1, axis=0) # d2 = d["organics"].reindex(order2, axis=0) # d1 = d1/d1.sum() # d2 = d2/d2.sum() f, ax = plt.subplots(figsize=(9.5,7.5)) dnormalize.T.plot.bar( stacked=True, color=dnormalize.index.map(COLORS).dropna(), rot=0, ax=ax, ) xpos = {"other": 0, "organics": 1} texts = {"other": [], "organics": []} for xvar in ["other", "organics"]: val = dnormalize[xvar].reset_index().melt(id_vars=["index"]) cumsum = 0 for i, v in zip(val["index"], val["value"]): if pd.np.isnan(v): continue cumsum += v if xvar == "other": annot = ax.annotate("{}".format(format_ions(i)), (xpos[xvar]-0.28, (cumsum -v/2) ), ha="right", va="center" ) else: text = "{}\n({:.2f} mg.g$_{{OM}}^{{-1}}$)".format(i, df_mg_per_gOM.loc[i]) if len(text)<40: text = text.replace("\n", " ") annot = ax.annotate(text, (xpos[xvar]+0.28, (cumsum -v/2) ), ha="left", va="center" ) texts[xvar].append(annot) ax.annotate("{:.0f}%".format(v), (xpos[xvar], (cumsum - v/2) ), ha="center", va="center", color=TEXTCOLORS[i], fontweight="bold" ) # texts = pd.Series(plt.gcf().get_children()[1].get_children()) # idx = [type(t)==matplotlib.text.Annotation for t in texts] # texts = texts[idx].tolist() # adjust_text( # texts["organics"], # # arrowprops=dict(arrowstyle="->", color='r', lw=0.5), # autoalign='', only_move={'points': 'y', 'text': 'y'} # ) yOMidentified = OMidentified * dnormalize.loc["OM", "other"]/100 ax.annotate("{:.0f}% identified".format(OMidentified), (xpos["other"], yOMidentified/2), ha="center", va="center", color="#FFFFFF", fontweight="bold" ) ax.plot([0.25, 0.75], [yOMidentified, 100], "-k") ax.plot([-0.25, 0.25], [yOMidentified, yOMidentified], '-w') ax.set_title(station, fontsize=16) ax.set_xticklabels([]) f.subplots_adjust(top=0.88, bottom=0.11, left=0.125, right=0.85, hspace=0.2, wspace=0.2) ax.legend('', frameon=False) ax.yaxis.set_major_formatter(FuncFormatter('{0:.0f}%'.format)) sns.despine()
[docs] def mainCompentOfPM(station, dateStart, dateEnd, seasonal=False, savefig=False, savedir=None): """ Plot a stacked bar plot of the different constitutant of the PM Parameters ---------- station : str name of the station dateStart, dateEnd : str starting and ending date seasonal : boolean, default False Either to make separate graph per season savefig : boolean, default False Save the fig in png and pdf savedir : str path, default None Where to save the figures """ TO_GROUP = { "Metals": [ "Al", "As", "Cd", "Cr", "Cu", "Fe", "Mn", "Mo", "Ni", "Pb", "Rb", "Sb", "Se", "Sn", "Ti", "V", "Zn" ], "Anhydrous monosaccharides": ["Levoglucosan", "Mannosan", "Galactosan"], "Polyols": ["Arabitol", "Sorbitol", "Mannitol"], "Organic acids": [ "Maleic", "Succinic", "Citraconic", "Glutaric", "Oxoheptanedioic", "MethylSuccinic", "Adipic", "Methylglutaric", "3-MBTCA", "Phtalic", "Pinic", "Suberic", "Azelaic", "Sebacic" ], "Other ions": [ "Na+", "K+", "Mg2+", ] } TO_MICROGRAMME = ["OM", "EC", "HULIS"] conn = sqlite3.connect("/home/webersa/Documents/BdD/BdD_PM/aerosols.db") df = pd.read_sql( "SELECT * FROM values_all WHERE station IN ('{}');".format(station), con=conn ) df.date = pd.to_datetime(df.date) df.set_index("date", inplace=True, drop=True) df = df[(dateStart < df.index) & (df.index < dateEnd)] if seasonal: df = add_season(df) # Metals = [ # "Al", "As", "Ba", "Cd", "Co", "Cr", "Cs", "Cu", "Fe", "La", "Mn", # "Mo", "Ni", "Pb", "Rb", "Sb", "Se", "Sn", "Sr", "Ti", "V", "Zn" # ] dff = pd.DataFrame() for k in TO_GROUP.keys(): df[k] = df[TO_GROUP[k]].sum(axis=1, min_count=1) # Get only the columns we have dff = df.reindex(TO_GROUP.keys(), axis=1) dff["OM"] = df["OC"]*1.8 to_keep = ["EC", "NO3-", "NH4+", "Cl-", "SO42-", "Ca2+", "Oxalate", "MSA", "Glucose", "Cellulose", "HULIS"] for k in to_keep: if k in df.columns: dff[k] = df[k] dff.apply(pd.to_numeric) if seasonal: dff["season"] = df["season"] # Convert ng to µg for i in TO_MICROGRAMME: dff[i] *= 1000 DF = [] seasonName = [] if seasonal: for season in df["season"].unique(): DF.append(dff[dff["season"] == season].drop("season", axis=1)) seasonName.append(season) else: DF = [dff] seasonName = ["annual"] for dfff, season in zip(DF, seasonName): plot._mainComponentOfPM(dfff, station) ax = plt.gca() if season: title = ax.get_title() plt.title(title+" "+season) if savefig: plt.savefig( "{BDIR}/{station}_{temp}.png".format( BDIR=savedir, station=station, temp=season ) ) plt.savefig( "{BDIR}/{station}_{temp}.pdf".format( BDIR=savedir, station=station, temp=season ) )
[docs] def what_do_we_have(sites=None, date_min=None, date_max=None, species=None, min_sample=None, particle_size=None, con=None): """TODO: Docstring for what_do_we_have. :sites: TODO :date_min: TODO :date_max: TODO :species: TODO :min_sample: TODO :con: TODO :returns: TODO """ df = get_sample_where( sites=sites, date_min=date_min, date_max=date_max, species=species, min_sample=min_sample, particle_size=particle_size, con=con ) df.set_index(["Station", "Date"], inplace=True) stations = df.index.get_level_values("Station").unique() fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 8)) for i, station in enumerate(stations): date = df.loc[station].index ax.plot(date, [i]*len(date), "-o", label=station) ax.set_yticks(range(len(stations))) ax.set_ylim(-0.5, len(stations)-0.5) ax.set_yticklabels(stations)