import os
import pandas as pd
import numpy as np
from zipfile import ZipFile
import matplotlib.pyplot as plt
import matplotlib.patches as mpatch
import scipy.stats as st
import seaborn as sns
from matplotlib.colors import TABLEAU_COLORS
import matplotlib.lines as mlines
import matplotlib.text as mtext
import itertools
from py4pm.chemutilities import *
[docs]class LegendTitle(object):
def __init__(self, text_props=None):
self.text_props = text_props or {}
super(LegendTitle, self).__init__()
[docs] def legend_artist(self, legend, orig_handle, fontsize, handlebox):
x0, y0 = handlebox.xdescent, handlebox.ydescent
title = mtext.Text(x0, y0, orig_handle, **self.text_props)
handlebox.add_artist(title)
return title
[docs]def to_relativeMass(df, totalVar="PM10"):
"""
Normalize the profile df to the relative mass with regard to the totalVar
(=PM10 or PM2.5).
"""
if totalVar not in df.index:
# print("WARNING: totalVar {} not in index.".format(totalVar))
totalVar = df[df.index.str.contains("PM")].index[0]
# print("Taking {}".format(totalVar))
dftmp = df.copy()
if isinstance(dftmp, pd.DataFrame):
dftmp.dropna(axis=1, how="all", inplace=True)
dftmp /= dftmp.loc[totalVar]
dftmp.drop(totalVar, inplace=True)
return dftmp
[docs]def compute_SID(df1, df2, factor1=None, factor2=None, isRelativeMass=True):
"""
Compute the SID of the factors `factor1` and `factor2` in the profile
`df1`and `df2`.
"""
if isinstance(df1, pd.Series) and isinstance(df2, pd.Series):
p1 = df1
p2 = df2
else:
if not(factor2):
factor2 = factor1
if factor1 not in df1.dropna(axis=1, how="all").columns:
return np.nan
if factor2 not in df2.dropna(axis=1, how="all").columns:
return np.nan
p1 = df1.loc[:, factor1]
p2 = df2.loc[:, factor2]
if not isRelativeMass:
p1 = to_relativeMass(p1)
p2 = to_relativeMass(p2)
sp = p1.index.intersection(p2.index)
if len(sp) > 3:
ID = pd.np.abs(p1[sp]-p2[sp])
MAD = p1[sp] + p2[sp]
SID = pd.np.sqrt(2)/len(sp) * (ID/MAD).sum()
else:
SID = pd.np.nan
return SID
[docs]def compute_PD(df1, df2, factor1=None, factor2=None, isRelativeMass=True):
"""
Compute the PD of the factors `factor1` and `factor2` in the profile
`df1`and `df2`.
"""
if isinstance(df1, pd.Series) and isinstance(df2, pd.Series):
p1 = df1
p2 = df2
else:
if not(factor2):
factor2 = factor1
if factor1 not in df1.dropna(axis=1, how="all").columns:
return np.nan
if factor2 not in df2.dropna(axis=1, how="all").columns:
return np.nan
p1 = df1.loc[:, factor1]
p2 = df2.loc[:, factor2]
if not isRelativeMass:
p1 = to_relativeMass(p1)
p2 = to_relativeMass(p2)
if p1.index.str.contains("PM").any():
p1 = p1.loc[~p1.index.str.contains("PM")]
if p2.index.str.contains("PM").any():
p2 = p2.loc[~p2.index.str.contains("PM")]
# Remove this part (impact performance)
# p1.dropna(inplace=True)
# p2.dropna(inplace=True)
sp = p1.index.intersection(p2.index)
if len(sp) > 3:
PD = 1 - pd.np.corrcoef(p1[sp], p2[sp])[1, 0]**2
else:
PD = pd.np.nan
return PD
[docs]def plot_similarityplot(PMF_profile, station1, station2, source1, source2=None,
SID=None, PD=None, isRelativeMass=False, ax=None, plot_kw={}):
"""
Plot the distance in the SID/PD space of 2 profiles for 2 stations.
"""
if not source2:
source2 = source1
for station, source in itertools.product([station1, station2], [source1, source2]):
if PMF_profile.loc[station, source].isnull().all():
print("There is no {} in {}".format(source, station))
return
newFig = False
if not ax:
f = plt.figure()
newFig = True
ax = plt.gca()
if (SID is None) and (PD is None):
SID = compute_SID(
PMF_profile.loc[station1, :], PMF_profile.loc[station2, :],
source1, source2,
isRelativeMass=isRelativeMass
)
PD = compute_PD(
PMF_profile.loc[station1, :], PMF_profile.loc[station2, :],
source1, source2,
isRelativeMass=isRelativeMass
)
ax.plot(SID, PD, "o", **plot_kw)
if newFig:
plot_deltatool_pretty(ax=ax)
[docs]def get_all_SID_PD(PMF_profile, stations, factor2=None, isRelativeMass=False):
"""
Compute the SID and PD for all profiles in PMF_profile for the stations
`stations`.
"""
factors = PMF_profile.dropna(axis=1, how='all').columns
SID = pd.DataFrame(index=pd.MultiIndex.from_product((factors, stations)),
columns=list(stations))
PD = pd.DataFrame(index=pd.MultiIndex.from_product((factors, stations)),
columns=list(stations))
MAD = pd.DataFrame()
list_stations1 = []
if factor2 is None:
sameFactor = True
else:
sameFactor = False
for factor1 in factors:
if sameFactor:
factor2 = factor1
else:
factor2 = factor2
print(factor1)
for station1 in stations:
list_stations1.append(station1)
if all(PMF_profile.loc[station1, factor1].isnull()):
continue
for station2 in stations:
# print(station1, station2)
# if station2 in list_stations1:
# continue
if all(PMF_profile.loc[station2, factor1].isnull()):
continue
profile1 = PMF_profile.loc[station1, factor1]
profile2 = PMF_profile.loc[station2, factor2]
SID.loc[(factor1, station1), station2] = compute_SID(
profile1,
profile2,
factor1=factor1,
factor2=factor2,
isRelativeMass=isRelativeMass
)
PD.loc[(factor1, station1), station2] = compute_PD(
profile1,
profile2,
factor1=factor1,
factor2=factor2,
isRelativeMass=isRelativeMass
)
list_stations1 = []
return (SID, PD)
[docs]def plot_similarity_profile(SID, PD, err="ci", plotAll=False):
"""
Plot a point in the SID/PD space (+/-err) for all profile in SID and PD.
SID : DataFrame with index (factor, station) and column (station) : the SID matrix
PD : DataFrame with index (factor, station) and column (station) : the PD matrix
err : "ci" or "sd", the type of error for xerr and yerr.
plotAll: boolean. Either or not plot each pair of profile.
Returns
-------
similarity : pd.DataFrame
columns: x, y, xerr, yerr, n
index: profiles
handles_labels: tuple of handles and labels
legend of the plot
"""
from py4pm.chemutilities import get_sourceColor
stations = list(SID.index.get_level_values("station").unique())
profiles = list(SID.index.get_level_values("profile").unique())
similarity = pd.DataFrame(columns=["x", "y", "xerr", "yerr", "n"],
index=profiles)
for p in profiles:
if p not in SID.index:
continue
x = SID.loc[(p, stations), stations] # .sort_index(axis=1)
x = x.where(np.triu(x, k=1).astype(bool))
x = x.reset_index().melt(id_vars=["profile", "station"]).dropna()
x = x.round(3)
y = PD.loc[(p, stations), stations] # .sort_index(axis=1)
y = y.where(np.triu(y, k=1).astype(bool))
y = y.reset_index().melt(id_vars=["profile", "station"]).dropna()
y = y.round(3)
similarity.loc[p, "x"] = x["value"].mean()
similarity.loc[p, "y"] = y["value"].mean()
if err == "ci":
similarity.loc[p, "xerr"] = x["value"].mean() \
- st.t.interval(
0.95, len(x["value"])-1,
loc=np.mean(x["value"]),
scale=st.sem(x["value"])
)[0]
similarity.loc[p, "yerr"] = y["value"].mean() \
- st.t.interval(
0.95, len(y["value"])-1,
loc=np.mean(y["value"]),
scale=st.sem(y["value"])
)[0]
elif err == "sd":
similarity.loc[p, "xerr"] = x["value"].std()
similarity.loc[p, "yerr"] = y["value"].std()
similarity.loc[:, "xerr"] = similarity.loc[:, "xerr"].fillna(0)
similarity.loc[:, "yerr"] = similarity.loc[:, "yerr"].fillna(0)
similarity.loc[p, "n"] = x["value"].notnull().sum()
if plotAll:
f = plt.figure(figsize=(7, 5))
ax = plt.gca()
ax.plot(x["value"], y["value"], "o", color=get_sourceColor(p))
ax.set_title(p)
plot_deltatool_pretty(ax)
# ax.set_xlabel("SID")
# ax.set_ylabel("PD")
# rect = mpatch.Rectangle((0, 0), width=1, height=0.4, facecolor="green", alpha=0.1, zorder=-1)
# ax.add_patch(rect)
# ax.set_xlim(0, 1.5)
# ax.set_ylim(0, 1)
plt.savefig("distance_all_profile_{p}.pdf".format(
p=p.replace(" ", "-").replace("/", "-")
))
# ---- plot part
f = plt.figure(figsize=(8,5))
ax = plt.gca()
maxNumber = similarity.loc[:, "n"].max()
for p in profiles:
print(p)
if similarity.loc[p, :].isnull().any():
continue
ax.errorbar(similarity.loc[p, "x"], similarity.loc[p, "y"],
fmt="o", markersize=14*similarity.loc[p, "n"]/maxNumber,
color=get_sourceColor(p), alpha=0.5,
xerr=similarity.loc[p, "xerr"],
yerr=similarity.loc[p, "yerr"],
label=p
)
plot_deltatool_pretty(ax)
ax.set_title('Similarity between all pairs of profile')
plt.subplots_adjust(top=0.88, bottom=0.11, left=0.100, right=0.700)
handles, labels = ax.get_legend_handles_labels()
newLabels = []
for l in labels:
newLabels.append("{l} ({n})".format(l=l.replace("_", " "), n=similarity.loc[l, "n"]))
ax.legend(handles, newLabels, bbox_to_anchor=(1, 1), loc="upper left",
frameon=False, fontsize=12)
return (similarity, (handles, newLabels))
[docs]def plot_all_stations_similarity_by_source(PMF_profile):
"""
Plot all individual pair of profile for each common source.
"""
stations = PMF_profile.index.get_level_values("station").unique()
sources = PMF_profile.columns
for source in sources:
if PMF_profile.loc[:, source].isnull().all():
continue
f, ax = plt.subplots()
color = sourcesColor(source)
for station1, station2 in itertools.product(stations, stations):
if station1 == station2:
continue
plot_similarityplot(
PMF_profile,
station1, station2, source,
ax=ax, plot_kw={"color":color}
)
plot_deltatool_pretty(ax=ax)
f.suptitle(source)
[docs]def plot_relativeMass(PMF_profile, source="Biomass burning",
isRelativeMass=True, totalVar="PM10", naxe=1,
site_typologie=None):
if not site_typologie:
site_typologie = get_site_typology()
# site_typologie = collections.OrderedDict()
# site_typologie["Urban"] = ["Talence", "Lyon", "Poitiers", "Nice", "MRS-5av",
# "PdB", "Aix-en-provence", "Nogent",
# "Lens-2011-2012", "Lens-2013-2014"]
# site_typologie["Valley"] = ["Chamonix", "GRE-fr"]
# site_typologie["Traffic"] = ["Roubaix", "STG-cle"]
# site_typologie["Rural"] = ["Revin"]
carboneous = ["OC*", "EC"]
ions = ["Cl-", "NO3-", "SO42-", "Na+", "NH4+", "K+", "Mg2+", "Ca2+"]
organics = [
"MSA", "Polyols", "Levoglucosan", "Mannosan",
]
metals = [
"Al", "As", "Ba", "Cd", "Co", "Cr", "Cs", "Cu", "Fe", "La", "Mn",
"Mo", "Ni", "Pb", "Rb", "Sb", "Se", "Sn", "Sr", "Ti", "V", "Zn"
]
keep_index = ["PM10"] + carboneous + ions + organics + metals
if naxe==2:
keep_index1 = carboneous + [
"NO3-", "SO42-", "NH4+", "Na+", "K+", "Ca2+", "Mg2+", "Cl-",
"Polyols", "Fe", "Al", "Levoglucosan", "Mannosan", "MSA"
]
keep_index2 = []#"MSA"]
keep_index2tmp = list(set(keep_index) - set(keep_index1) - set(["PM10",
"Mg2+"]))
keep_index2tmp.sort()
keep_index2 += keep_index2tmp
dfperµg = pd.DataFrame(columns=keep_index)
for station, df in PMF_profile.reset_index().groupby("station"):
df = df.set_index("specie").drop("station", axis=1)
if isRelativeMass:
dfperµg.loc[station, :] = df.loc[:, source]
else:
dfperµg.loc[station, :] = to_relativeMass(df.loc[:, source],
totalVar=totalVar).T
dfperµg = dfperµg.convert_objects(convert_numeric=True)
# FIGURE
f, axes = plt.subplots(nrows=naxe, ncols=1, figsize=(12.67, 6.54))
if naxe == 1:
axes = [axes, None]
d = dfperµg.T.copy()
d["specie"] = d.index
# for i, keep_index in enumerate([keep_index1, keep_index2]):
if naxe == 1:
xtick_list = [keep_index]
elif naxe == 2:
xtick_list = [keep_index1, keep_index2]
for i, keep_index in enumerate(xtick_list):
dd = d.reindex(keep_index)
# dd.rename(rename_station, inplace=True, axis="columns")
dd = dd.melt(id_vars=["specie"])
# if not percent:
dd.replace({0: pd.np.nan}, inplace=True)
axes[i].set_yscale("log")
sns.boxplot(data=dd, x="specie", y="value", ax=axes[i], color="white",
showcaps=False,
showmeans=False, meanprops={"marker": "d"})
ntypo = len(site_typologie.keys())
colors = list(TABLEAU_COLORS.values())
# for sp, specie in enumerate(keep_index):
for t, typo in enumerate(site_typologie.keys()):
if typo == "Urban":
marker = "*"
elif (typo == "Valley") or typo == ("Urban+Alps"):
marker = "o"
elif typo == "Traffic":
marker = "p"
else:
marker = "d"
step = 0.1
j = 0
for site in site_typologie[typo]:
if site not in PMF_profile.index.get_level_values("station").unique():
continue
axes[i].scatter(
pd.np.arange(0,len(keep_index))-ntypo*step/2+step/2+t*step,
d.loc[keep_index, site],
marker=marker, color=colors[j], alpha=0.8
)
j += 1
# sns.swarmplot(data=dd, x="specie", y="value", color=".2", alpha=0.5, size=4,
# ax=axes[i])
if naxe == 1:
axes[0].set_ylim([1e-5,2])
else:
axes[0].set_ylim([1e-3,1])
axes[1].set_ylim([1e-5,1e-2])
for ax in axes:
if ax:
ax.set_ylabel("µg/µg of PM$_{10}$")
ax.set_xlabel("")
for tick in ax.get_xticklabels():
tick.set_rotation(90)
# ax.legend(loc="center",
# ncol=(len(dfperµg.columns))//2,
# bbox_to_anchor=(0.5, -.55),
# frameon=False)
# create custom legend
labels = []
artists = []
for typo in site_typologie.keys():
if typo == "Urban":
marker = "*"
elif (typo == "Valley") or (typo == "Urban+Alps"):
marker = "o"
elif typo == "Traffic":
marker = "p"
else:
marker = "d"
noSiteYet = True
j = 0
for site in site_typologie[typo]:
if site not in PMF_profile.index.get_level_values("station").unique():
continue
if noSiteYet:
artist = typo
label = ""
artists.append(artist)
labels.append(label)
noSiteYet = False
artist = mlines.Line2D([], [], ls='', marker=marker, color=colors[j],
label=site)
artists.append(artist)
labels.append(site)
j += 1
axes[0].legend(artists, labels, bbox_to_anchor=(1.1, 1),
handler_map={str: LegendTitle()},
frameon=False
)
# ax.legend('', frameon=False)
f.suptitle(source)
# plt.subplots_adjust(left=0.07, right=0.83, bottom=0.15, top=0.900, hspace=0.5)
plt.subplots_adjust(
top=0.925,
bottom=0.07,
left=0.053,
right=0.899,
hspace=0.489,
wspace=0.2
)
[docs]def get_profile_from_PMF(pmfs):
"""Get a profile matrix from a list of PMF object
:pmfs: TODO
:returns: TODO
"""
profiles = pd.DataFrame()
for pmf in pmfs:
dftmp = pmf.to_relative_mass()
possible_sources = {
p: get_sourcesCategories([p])[0]
for p in dftmp.columns
}
dftmp.rename(columns=possible_sources, inplace=True)
dftmp.sort_index(axis=1, inplace=True)
dftmp["station"] = pmf._site+"_"+pmf._program
profiles = pd.concat([profiles, dftmp])
return profiles
## TO ADAPT! ==================================================================
# conn = sqlite3.connect("/home/webersa/Documents/BdD/BdD_PM/db.sqlite")
# contrib = pd.read_sql(
# "SELECT * FROM PMF_contributions WHERE programme IN ('SOURCES');",
# con=conn,
# parse_dates=["date"]
# )
# contrib.set_index(['station', 'date'], drop=True, inplace=True)
# contrib.drop(["programme", "index"], inplace=True, axis=1)
# profile = pd.read_sql(
# "SELECT * FROM PMF_profiles WHERE programme IN ('SOURCES');",
# con=conn
# )
# profile.set_index(['station', 'specie'], drop=True, inplace=True)
# profile.drop(["programme", "index"], inplace=True, axis=1)
### ===========================================================================
# stations = profile.index.get_level_values('station').unique()