Source code for seismic.ASDFdatabase.plot_data_quality

#!/usr/bin/env python
"""
Description:
    Reads waveform data from a FederatedASDFDataSet and generates data-quality plots into a multi-page PDF file

    For each station, the program reads in all the waveform data over the specified period (usually a year)
    and then performs statistical analysis over the waveform data for subsequent plotting.

    The first page of the PDF shows the stations on a basemap, with each marker colored according to the
    number of usable days ("good" data available on the day: no gaps, not all zeros). The color is
    determined by c=mapper.to_rgba(days): relatively speaking, black/blue/cold colors indicate a higher
    percentage of good data at a station, while red/yellow/warm colors indicate a higher percentage of
    problematic data (an illustrative sketch of the color mapping follows this docstring).

    The following pages of the PDF plot the daily-averaged waveform data for each station, with gaps and
    all-zero days shaded.

CreationDate:   19/09/19
Developer:      rakib.hassan@ga.gov.au

Revision History:
    LastUpdate:     19/09/2019   RH
    LastUpdate:     27/05/2020   FZ     Refactoring and docs run examples etc.

Todo:
    The script currently has a low pylint score (4.3/10).
    Needs refactoring to comply with PEP 8: add the required docstrings and
    use smaller functions to modularize the code better.
    Consider generating an additional page with executive-summary information for the user.
"""

import gc
import logging
from collections import defaultdict

import click
import matplotlib
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.dates import DateFormatter, AutoDateLocator
from mpi4py import MPI
from mpl_toolkits.basemap import Basemap
from obspy import UTCDateTime
from ordered_set import OrderedSet as set
from tqdm import tqdm

from seismic.ASDFdatabase.FederatedASDFDataSet import FederatedASDFDataSet

logging.basicConfig()


def split_list(lst, npartitions):
    k, m = divmod(len(lst), npartitions)
    return [lst[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(npartitions)]
# end func
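
# Example of the partitioning behavior (illustrative, not part of the original script):
# partition sizes differ by at most one element.
#   >>> split_list(list(range(10)), 3)
#   [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]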
def setup_logger(name, log_file, level=logging.INFO):
    """
    Function to setup a logger; adapted from stackoverflow
    """
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    handler = logging.FileHandler(log_file, mode='w')
    handler.setFormatter(formatter)

    logger = logging.getLogger(name + log_file)
    logger.setLevel(level)
    logger.addHandler(handler)
    return logger
# end func
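
# Illustrative usage (file names hypothetical): each run writes its messages to a
# dedicated log file named after the output basename, e.g.
#   logger = setup_logger('data_quality.oa', 'data_quality.oa.log')
#   logger.info('started')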
def process_data(rank, fds, stations, start_time, end_time):
    """
    :param rank: processor rank in parallel runs
    :param fds: FederatedASDFDataSet instance
    :param stations: list containing tuples (net, sta, loc, cha, lon, lat)
    :param start_time: start-time in UTCDateTime format
    :param end_time: end-time in UTCDateTime format
    :return: a list containing, for each station, UTCDateTimes marking the start of each day
             and their corresponding daily means
    """
    results = []
    day = 3600 * 24

    # Discard aux channels not in the following list
    chaSet = {'BH1', 'BH2', 'BHE', 'BHN', 'BHZ', 'BNZ', 'EHE', 'EHN', 'EHZ',
              'HHE', 'HHN', 'HHZ', 'LHE', 'LHN', 'LHZ', 'SHE', 'SHN', 'SHZ'}

    for s in tqdm(stations, desc='Rank: %d' % (rank)):
        # discard aux channels
        if (s[3] not in chaSet):
            results.append(([], []))
            continue
        # end if

        st, et = fds.get_global_time_range(s[0], s[1])

        if (start_time > st):
            st = start_time
        if (end_time < et):
            et = end_time

        # align st to day
        st = UTCDateTime(year=st.year, month=st.month, day=st.day)

        ct = st
        times = []
        means = []
        while (ct < et):
            times.append(ct)

            debug = False
            if not debug:
                stream = fds.get_waveforms(s[0], s[1], s[2], s[3], ct, ct + day,
                                           trace_count_threshold=200)

                if len(stream):
                    try:
                        tr = stream.merge()
                        means.append(np.mean(tr[0].data))
                    except:
                        # empty or streams that cannot be merged are marked by nans
                        means.append(np.nan)
                else:
                    means.append(np.nan)
                # end if
            else:
                # Used for debug purposes only
                if np.int_(np.round(np.random.random())):
                    means.append(np.random.random())
                else:
                    means.append(np.nan)
                # end if
            # end if

            ct += day
        # end while

        results.append([times, means])
    # end for

    return results
# end func
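
# Illustrative shape of the returned list (values hypothetical): one [times, means]
# pair per input station/channel tuple; NaN marks a day with no data or with streams
# that could not be merged, and skipped auxiliary channels contribute ([], []).
#   results[0] == [[UTCDateTime(2019, 1, 1), UTCDateTime(2019, 1, 2), ...],
#                  [12.3, nan, ...]]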
def plot_results(stations, results, output_basename):
    # collate indices for each channel for each station
    assert len(stations) == len(results)

    groupIndices = defaultdict(list)
    for i in np.arange(len(results)):
        groupIndices['%s.%s' % (stations[i][0], stations[i][1])].append(i)
    # end for

    # gather number of days of usable data for each station
    usableStationDays = defaultdict(int)
    maxUsableDays = -1e32
    minUsableDays = 1e32
    for k, v in groupIndices.items():
        for i, index in enumerate(v):
            x, means = results[index]
            means = np.array(means)
            days = np.sum(~np.isnan(means) & np.bool_(means != 0))

            if usableStationDays[k] < days:
                usableStationDays[k] = days
                maxUsableDays = max(maxUsableDays, days)
                minUsableDays = min(minUsableDays, days)
            # end if
        # end for
    # end for

    # Plot station map
    pdf = PdfPages('%s.pdf' % output_basename)

    fig = plt.figure(figsize=(20, 30))
    ax1 = fig.add_axes([0.05, 0.05, 0.9, 0.7])
    ax2 = fig.add_axes([0.05, 0.7, 0.9, 0.3])
    ax2.set_visible(False)

    minLon = 1e32
    maxLon = -1e32
    minLat = 1e32
    maxLat = -1e32
    for s in stations:
        lon, lat = s[4], s[5]

        if lon < 0:
            lon += 360
        # end if

        minLon = min(minLon, lon)
        maxLon = max(maxLon, lon)
        minLat = min(minLat, lat)
        maxLat = max(maxLat, lat)
    # end for

    minLon -= 1
    maxLon += 1
    minLat -= 1
    maxLat += 1

    m = Basemap(ax=ax1, projection='merc', resolution='i',
                llcrnrlat=minLat, urcrnrlat=maxLat,
                llcrnrlon=minLon, urcrnrlon=maxLon,
                lat_0=(minLat + maxLat) / 2., lon_0=(minLon + maxLon) / 2.)

    # draw coastlines.
    m.drawcoastlines()

    # draw grid
    parallels = np.linspace(np.around(minLat / 5) * 5 - 5, np.around(maxLat / 5) * 5 + 5, 6)
    m.drawparallels(parallels, labels=[True, True, False, False], fontsize=20)
    meridians = np.linspace(np.around(minLon / 5) * 5 - 5, np.around(maxLon / 5) * 5 + 5, 6)
    m.drawmeridians(meridians, labels=[False, False, True, True], fontsize=20)

    # plot stations
    norm = matplotlib.colors.Normalize(vmin=minUsableDays, vmax=maxUsableDays, clip=True)
    mapper = cm.ScalarMappable(norm=norm, cmap=cm.jet_r)

    plotted = set()
    for s in stations:
        if s[1] in plotted:
            continue
        else:
            plotted.add(s[1])
        # end if

        lon, lat = s[4], s[5]
        px, py = m(lon, lat)
        pxl, pyl = m(lon, lat - 0.1)
        days = usableStationDays['%s.%s' % (s[0], s[1])]
        m.scatter(px, py, s=400, marker='v', c=mapper.to_rgba(days),
                  edgecolor='none', label='%s: %d' % (s[1], days))
        ax1.annotate(s[1], xy=(px + 0.05, py + 0.05), fontsize=22)
    # end for

    fig.axes[0].set_title("Network Name: %s" % s[0], fontsize=30, y=1.05)
    fig.axes[0].legend(prop={'size': 16}, loc=(0.2, 1.3), ncol=5, fancybox=True,
                       title='No. of Usable Days', title_fontsize=16)

    pdf.savefig()
    plt.close()

    # Plot results
    for k, v in groupIndices.items():
        axesCount = 0
        for i in v:
            assert (k == '%s.%s' % (stations[i][0], stations[i][1]))

            # only need axes for non-null results
            a, b = results[i]
            if len(a) and len(b):
                axesCount += 1
            # end if
        # end for

        fig, axes = plt.subplots(axesCount, sharex=True)
        fig.set_size_inches(20, 15)
        axes = np.atleast_1d(axes)

        if len(axes):
            axesIdx = 0
            for i, index in enumerate(v):
                try:
                    x, means = results[index]
                    if len(x) and len(means):
                        x = [a.matplotlib_date for a in x]

                        d = np.array(means)
                        if len(d):
                            d[0] = np.nanmedian(d)
                        # end if

                        dnorm = d
                        dnormmin = np.nanmin(dnorm)
                        dnormmax = np.nanmax(dnorm)

                        axes[axesIdx].scatter(x, dnorm, marker='.', s=20)
                        axes[axesIdx].plot(x, dnorm, c='k',
                                           label='24 hr mean\n'
                                                 'Gaps indicate no-data',
                                           lw=2, alpha=0.7)
                        axes[axesIdx].grid(axis='x', linestyle=':', alpha=0.3)

                        axes[axesIdx].fill_between(x, dnormmax * np.int_(d == 0),
                                                   dnormmin * np.int_(d == 0),
                                                   where=dnormmax * np.int_(d == 0) -
                                                         dnormmin * np.int_(d == 0) > 0,
                                                   color='r', alpha=0.5, label='All 0 Samples')

                        axes[axesIdx].fill_between(x, dnormmax * np.int_(np.isnan(d)),
                                                   dnormmin * np.int_(np.isnan(d)),
                                                   where=dnormmax * np.int_(np.isnan(d)) -
                                                         dnormmin * np.int_(np.isnan(d)) > 1,
                                                   color='b', alpha=0.5, label='No Data')

                        axes[axesIdx].xaxis.set_major_locator(AutoDateLocator())
                        axes[axesIdx].xaxis.set_major_formatter(DateFormatter('%Y-%m-%d'))
                        for tick in axes[axesIdx].get_xticklabels():
                            tick.set_rotation(45)
                        # end for

                        axes[axesIdx].legend(loc='upper right', prop={'size': 12})
                        axes[axesIdx].tick_params(axis='both', labelsize=16)

                        stn = stations[index]
                        axes[axesIdx].set_title('Channel %s.%s' % (stn[2], stn[3]),
                                                fontsize=18, y=0.95, va='top')
                        axes[axesIdx].set_xlim(xmin=min(x), xmax=max(x))
                        axes[axesIdx].set_ylim(ymin=dnormmin, ymax=dnormmax)
                        axes[axesIdx].set_ylabel('Ampl.', fontsize=16)

                        axesIdx += 1
                    # end if
                except:
                    # plotting fails when each axes contain <2 values; just move on in those instances
                    logging.warning('Plotting failed on station %s' % k)
                # end try
            # end for

            axes[-1].set_xlabel('Days', fontsize=16)
        # end if

        plt.suptitle('%s Data Availability (~%d days)' % (k, usableStationDays[k]),
                     y=0.96, fontsize=20)
        pdf.savefig()
        plt.close()
        gc.collect()
    # end for

    pdf.close()
# end func
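
# Illustrative note (values hypothetical): the red/blue shaded bands in plot_results
# are built by scaling 0/1 masks of problematic days to the data range, e.g.
#   d = np.array([1.2, 0.0, np.nan, 0.9])
#   np.int_(d == 0)        # -> [0, 1, 0, 0]  all-zero day
#   np.int_(np.isnan(d))   # -> [0, 0, 1, 0]  no-data day
# fill_between then shades only where the corresponding mask is non-zero.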
CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])


@click.command(context_settings=CONTEXT_SETTINGS)
@click.argument('asdf-source', required=True, type=click.Path(exists=True))
@click.argument('start-time', required=True, type=str)
@click.argument('end-time', required=True, type=str)
@click.argument('net', required=True, type=str)
@click.argument('sta', required=True, type=str)
@click.argument('cha', required=True, type=str)
@click.argument('output-basename', required=True, type=str)
def process(asdf_source, start_time, end_time, net, sta, cha, output_basename):
    """
    ASDF_SOURCE: Text file containing a list of paths to ASDF files\n
    START_TIME: Start time in UTCDateTime format\n
    END_TIME: End time in UTCDateTime format\n
    NET: Network name\n
    STA: Station name ('*' for all stations; note that * must be in quotation marks)\n
    CHA: Channel name ('*' for all channels; note that * must be in quotation marks)\n
    OUTPUT_BASENAME: Basename of output file

    Example usage:
    mpirun -np 112 python plot_data_quality.py asdf_files.txt 1980:01:01 2020:01:01 OA '*' '*' data_quality.oa
    """
    start_time = UTCDateTime(start_time)
    end_time = UTCDateTime(end_time)

    if (sta == '*'):
        sta = None
    if (cha == '*'):
        cha = None

    comm = MPI.COMM_WORLD
    nproc = comm.Get_size()
    rank = comm.Get_rank()

    l = setup_logger(name=output_basename, log_file='%s.log' % output_basename)
    fds = FederatedASDFDataSet(asdf_source, logger=l)

    stations = []
    if rank == 0:
        stations = fds.get_stations(start_time, end_time, network=net, station=sta, channel=cha)
        stations = split_list(sorted(stations), nproc)
    # end if

    stations = comm.bcast(stations, root=0)
    results = process_data(rank, fds, sorted(stations[rank]), start_time, end_time)
    results = comm.gather(results, root=0)

    if rank == 0:
        results = [item for sublist in results for item in sublist]  # flatten sublists for each proc
        stations = [item for sublist in stations for item in sublist]  # flatten sublists for each proc

        plot_results(stations, results, output_basename)
    # end if
# end func


if (__name__ == '__main__'):
    """
    Example usage:
    mpirun -np 112 python plot_data_quality.py asdf_files.txt 1980:01:01 2020:01:01 OA '*' '*' data_quality.oa

    How to run without MPI, in the background. Example command line:

    (hiperseispy37) fxz547@vdi-n16 /g/data/ha3/fxz547/Githubz/hiperseis (develop)
    $ nohup python seismic/ASDFdatabase/plot_data_quality.py \
        /g/data/ha3/GASeisDataArchive/DevSpace/test_asdf_files.txt \
        2019:01:01 2019:03:01 'AU' '*' '*' test_plot_data_quality_AU &
    """
    process()