# pylot/pylot/core/util/dataprocessing.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import glob
import logging
import os
import sys

import numpy as np
from obspy import UTCDateTime, read_inventory, read
from obspy.io.xseed import Parser

from pylot.core.util.utils import key_for_set_value, find_in_list, \
    gen_Pool


class Metadata(object):
    """
    Registry of station metadata files (dataless SEED, StationXML, RESP) used by PyLoT.

    Folders holding metadata files ("inventories") are registered first. The files inside them are parsed
    lazily when metadata for a seed id is requested, or all at once via read_all().
    """

    def __init__(self, inventory=None, verbosity=1):
        """
        :param inventory: optional path to an inventory folder or to a single metadata file
        :type inventory: str
        :param verbosity: progress messages are printed if this is truthy
        :type verbosity: int
        """
        # has to be set first: reading a file during initialisation already queries self.verbosity
        self.verbosity = verbosity
        self.inventories = []
        # saves read metadata objects (Parser/inventory) for a filename
        self.inventory_files = {}
        # saves filenames holding metadata for a seed_id
        # seed id as key, path to file as value
        self.seed_ids = {}
        self.stations_dict = {}
        # saves which metadata files are from obspy dmt
        self.obspy_dmt_invs = []
        if inventory:
            if os.path.isdir(inventory):
                self.add_inventory(inventory)
            elif os.path.isfile(inventory):
                self.add_inventory_file(inventory)

    def __str__(self):
        # only the first and last two inventories are listed, the middle part is abbreviated by '...'
        text = 'PyLoT Metadata object including the following inventories:\n\n'
        ntotal = len(self.inventories)
        for index, inventory in enumerate(self.inventories):
            if index < 2 or (ntotal - index) < 3:
                text += '{}\n'.format(inventory)
            if ntotal > 4 and ntotal // 2 == index:
                text += '...\n'
        if ntotal > 4:
            text += '\nTotal of {} inventories. Use Metadata.inventories to see all.'.format(ntotal)
        return text

    def __repr__(self):
        return self.__str__()

    def add_inventory(self, path_to_inventory, obspy_dmt_inv=False):
        """
        Add path to list of inventories.
        :param path_to_inventory: Path to a folder
        :type path_to_inventory: str
        :param obspy_dmt_inv: mark the folder as obspy dmt inventory (those are dropped by clear_inventory)
        :type obspy_dmt_inv: bool
        :return: None
        """
        assert (os.path.isdir(path_to_inventory)), '{} is no directory'.format(path_to_inventory)
        if path_to_inventory not in self.inventories:
            self.inventories.append(path_to_inventory)
        # register each obspy dmt folder only once, otherwise clear_inventory tries to remove it repeatedly
        if obspy_dmt_inv and path_to_inventory not in self.obspy_dmt_invs:
            self.obspy_dmt_invs.append(path_to_inventory)

    def add_inventory_file(self, path_to_inventory_file):
        """
        Add the folder in which the file exists to the list of inventories.
        :param path_to_inventory_file: full path including filename
        :type path_to_inventory_file: str
        :return: None
        """
        assert (os.path.isfile(path_to_inventory_file)), '{} is no file'.format(path_to_inventory_file)
        self.add_inventory(os.path.split(path_to_inventory_file)[0])
        if path_to_inventory_file not in self.inventory_files:
            self.read_single_file(path_to_inventory_file)

    def remove_all_inventories(self):
        """
        Reset the object to its empty state. The verbosity setting is kept.
        """
        self.__init__(verbosity=self.verbosity)

    def remove_inventory(self, path_to_inventory):
        """
        Remove a path from inventories list. If path is not in inventories list, do nothing.
        :param path_to_inventory: Path to a folder
        """
        if path_to_inventory not in self.inventories:
            print('Path {} not in inventories list.'.format(path_to_inventory))
            return
        self.inventories.remove(path_to_inventory)
        # iterate over copies of the keys, entries are deleted inside the loops
        for filename in list(self.inventory_files.keys()):
            if filename.startswith(path_to_inventory):
                del self.inventory_files[filename]
        for seed_id in list(self.seed_ids.keys()):
            if self.seed_ids[seed_id].startswith(path_to_inventory):
                del self.seed_ids[seed_id]
        # have to clean self.stations_dict as well
        # this will be rebuilt for the next init of the arraymap anyway, so just reset it
        self.stations_dict = {}

    def clear_inventory(self):
        """
        Remove all inventories that were registered as obspy dmt inventories.
        """
        for inv in self.obspy_dmt_invs:
            self.remove_inventory(inv)
        self.obspy_dmt_invs = []

    def get_metadata(self, seed_id, time=None):
        """
        Get metadata for seed id at time. When time is not specified, metadata for current time is fetched.
        :param seed_id: Seed id such as BW.WETR..HHZ (Network.Station.Location.Channel)
        :type seed_id: str
        :param time: Time for which the metadata should be returned
        :type time: UTCDateTime
        :return: Dictionary with keys data and invtype, or None if no metadata could be found.
        data is a obspy.io.xseed.parser.Parser or an obspy.core.inventory.inventory.Inventory depending on the metadata
        file.
        invtype is a string denoting of which type the value of the data key is. It is the file type the metadata
        was read as: 'dless', 'dataless', 'dseed', 'xml' or 'resp'.
        :rtype: dict
        """
        # try most recent data if no time is specified
        if not time:
            time = UTCDateTime()
        # get metadata for a specific seed_id, if not already read, try to read from inventories
        if seed_id not in self.seed_ids:
            self._read_inventory_data(seed_id)
        # if seed id is not found read all inventories and try to find it there
        if seed_id not in self.seed_ids:
            if self.verbosity:
                print('No data found for seed id {}. Trying to find it in all known inventories...'.format(seed_id))
            self.read_all()
            for inv_fname, metadata_dict in self.inventory_files.items():
                # use get_coordinates to check for seed_id
                try:
                    metadata_dict['data'].get_coordinates(seed_id, time)
                except Exception:
                    # no specific exception defined: any failure means "seed id not in this file"
                    continue
                self.seed_ids[seed_id] = inv_fname
                if self.verbosity:
                    print('Found metadata for station {}!'.format(seed_id))
                return metadata_dict
            print('Could not find metadata for station {}'.format(seed_id))
            return None
        fname = self.seed_ids[seed_id]
        return self.inventory_files[fname]

    def read_all(self):
        """
        Read all metadata files found in all inventories
        """
        # iterate over all inventory folders
        for inventory in self.inventories:
            # iterate over all inventory files in the current folder
            for inv_fname in os.listdir(inventory):
                inv_fname = os.path.join(inventory, inv_fname)
                # sub folders cannot be metadata files
                if not os.path.isfile(inv_fname):
                    continue
                self.read_single_file(inv_fname)

    def read_single_file(self, inv_fname):
        """
        Try to read a single file as Parser/Inventory and add its dictionary to inventory files if reading succeeded.
        :param inv_fname: path/filename of inventory file
        :type inv_fname: str
        :return: True if the file was newly read, None if it was read before or could not be read
        :rtype: bool or None
        """
        # return if it was read already
        if self.inventory_files.get(inv_fname, None):
            return

        try:
            invtype, robj = self._read_metadata_file(inv_fname)
        except Exception:
            print('Could not read file {}'.format(inv_fname))
            return
        if robj is None:
            return
        self.inventory_files[inv_fname] = {'invtype': invtype,
                                           'data': robj}
        return True

    def get_coordinates(self, seed_id, time=None):
        """
        Get coordinates of given seed id.
        :param seed_id: Seed id such as BW.WETR..HHZ (Network.Station.Location.Channel)
        :type seed_id: str
        :param time: Used when a station has data available at multiple time intervals
        :type time: UTCDateTime
        :return: dict containing position information of the station, None if it could not be determined
        :rtype: dict
        """
        # try most recent data if no time is specified
        if not time:
            time = UTCDateTime()
        metadata = self.get_metadata(seed_id, time)
        if not metadata:
            return
        try:
            return metadata['data'].get_coordinates(seed_id, time)
        # no specific exception defined in obspy inventory
        except Exception:
            logging.warning(f'Could not get metadata for {seed_id}')

    def get_all_coordinates(self):
        """
        Read all metadata files of all inventories and collect the coordinates of the stations found.
        :return: dictionary with 'network.station' as key and a dict with the keys latitude, longitude and
        elevation as value
        :rtype: dict
        """

        def add_station(network_name, station_name, source):
            # keys of stations_dict are 'network.station', the first entry found for a station is kept
            st_id = '{}.{}'.format(network_name, station_name)
            if st_id not in self.stations_dict:
                self.stations_dict[st_id] = {'latitude': source.latitude,
                                             'longitude': source.longitude,
                                             'elevation': source.elevation}

        def stat_info_from_parser(parser):
            for station in parser.stations:
                # first entry of each station holds the station information
                add_station(station[0].network_code, station[0].station_call_letters, station[0])

        def stat_info_from_inventory(inventory):
            for network in inventory.networks:
                for station in network.stations:
                    # use coordinates of the first channel, fall back to the station if it has no channels
                    channels = station.channels
                    add_station(network.code, station.code, channels[0] if channels else station)

        # invtype is the file type a file was read as, see _read_metadata_file
        read_stat = {'xml': stat_info_from_inventory,
                     'dless': stat_info_from_parser,
                     'dataless': stat_info_from_parser,
                     'dseed': stat_info_from_parser}

        self.read_all()
        for item in self.inventory_files.values():
            reader = read_stat.get(item['invtype'])
            if reader is None:
                # no coordinate reader available for this type of metadata (e.g. 'resp')
                continue
            reader(item['data'])

        return self.stations_dict

    def get_paz(self, seed_id, time):
        """
        Get poles and zeros for a seed id.
        :param seed_id: Seed id such as BW.WETR..HHZ (Network.Station.Location.Channel)
        :type seed_id: str
        :param time: Used when a station has data available at multiple time intervals
        :type time: UTCDateTime
        :return: result of Parser.get_paz for dataless SEED metadata, result of Response.get_paz for xml/resp
        metadata, None if no metadata was found
        :rtype: dict
        """
        metadata = self.get_metadata(seed_id, time)
        if not metadata:
            return
        if metadata['invtype'] in ['dless', 'dataless', 'dseed']:
            return metadata['data'].get_paz(seed_id, time)
        elif metadata['invtype'] in ['resp', 'xml']:
            resp = metadata['data'].get_response(seed_id, time)
            # the response object already belongs to seed_id, its get_paz takes no arguments
            return resp.get_paz()

    def _read_inventory_data(self, seed_id):
        """
        Search all inventories for metadata of seed_id, stop at the first inventory providing a file.
        """
        for inventory in self.inventories:
            if self._read_metadata_iterator(path_to_inventory=inventory, station_seed_id=seed_id):
                return

    def _read_metadata_iterator(self, path_to_inventory, station_seed_id):
        """
        Search for metadata for a specific station iteratively.
        Filenames are matched against the full seed id first, then the station name and finally the network name.
        The first matching file that can be read is assigned to the seed id.
        :param path_to_inventory: path to an inventory folder
        :type path_to_inventory: str
        :param station_seed_id: Seed id such as BW.WETR..HHZ (Network.Station.Location.Channel)
        :type station_seed_id: str
        :return: True if a file was assigned to the seed id, None otherwise
        """
        network, station, _location, _channel = station_seed_id.split('.')
        # search for station seed id in filenames in inventory
        fnames = glob.glob(os.path.join(path_to_inventory, '*' + station_seed_id + '*'))
        if not fnames:
            # search for station name in filename
            fnames = glob.glob(os.path.join(path_to_inventory, '*' + station + '*'))
        if not fnames:
            # search for network name in filename
            fnames = glob.glob(os.path.join(path_to_inventory, '*' + network + '*'))
        if not fnames:
            if self.verbosity:
                print('Could not find filenames matching station name, network name or seed id')
            return
        for fname in fnames:
            if self.inventory_files.get(fname):
                # file already read
                continue
            # fname returned by glob already contains path_to_inventory, do not join it again
            try:
                invtype, robj = self._read_metadata_file(fname)
            except Exception:
                if self.verbosity:
                    print('Could not read file {}'.format(fname))
                continue
            if robj is None:
                continue
            # TODO: the file is not checked for actually containing station_seed_id
            self.inventory_files[fname] = {'invtype': invtype,
                                           'data': robj}
            if station_seed_id in self.seed_ids:
                print('WARNING: Overwriting metadata for station {}'.format(station_seed_id))
            self.seed_ids[station_seed_id] = fname
            return True
        print('Could not find metadata for station_seed_id {} in path {}'.format(station_seed_id, path_to_inventory))

    def _read_metadata_file(self, path_to_inventory_filename):
        """
        function reading metadata files (either dataless seed, xml or resp)
        :param path_to_inventory_filename:
        :return: file type/ending, inventory object (Parser or Inventory); (None, None) if a file with unknown
        ending could not be read
        :rtype: (str, obspy.io.xseed.Parser or obspy.core.inventory.inventory.Inventory)
        :raises Exception: error of the reading routine, if a file with a known ending could not be read
        """
        # functions used to read metadata for different file endings (or file types)
        read_functions = {'dless': self._read_dless,
                          'dataless': self._read_dless,
                          'dseed': self._read_dless,
                          'xml': self._read_inventory_file,
                          'resp': self._read_inventory_file}
        file_ending = path_to_inventory_filename.split('.')[-1].lower()
        if file_ending in read_functions:
            robj, exc = read_functions[file_ending](path_to_inventory_filename)
            if exc is not None:
                raise exc
            return file_ending, robj
        # in case file endings did not match the above keys, try and error
        for file_type in ['dless', 'xml']:
            robj, exc = read_functions[file_type](path_to_inventory_filename)
            if exc is None:
                if self.verbosity:
                    print('Read file {} as {}'.format(path_to_inventory_filename, file_type))
                return file_type, robj
            if self.verbosity:
                print('Could not read file {} as {}'.format(path_to_inventory_filename, file_type))
        return None, None

    @staticmethod
    def _read_dless(path_to_inventory):
        """
        Read a dataless SEED file.
        :return: (Parser, None) on success, (None, exception) on failure
        """
        try:
            return Parser(path_to_inventory), None
        except Exception as err:
            # the name bound by "as" is deleted when the handler ends, so return from inside the handler
            return None, err

    @staticmethod
    def _read_inventory_file(path_to_inventory):
        """
        Read a metadata file using obspy read_inventory.
        :return: (Inventory, None) on success, (None, exception) on failure
        """
        try:
            return read_inventory(path_to_inventory), None
        except Exception as err:
            # the name bound by "as" is deleted when the handler ends, so return from inside the handler
            return None, err


def check_time(datetime):
    """
    Validate a date/time given as list of integers by trying to build an UTCDateTime object from its values.
    :param datetime: list of integers [year, month, day, hour, minute, second, microsecond]
    :type datetime: list
    :return: True if an UTCDateTime could be created from the values, False if this failed with a ValueError
    :rtype: bool

    >>> check_time([1999, 1, 1, 23, 59, 59, 999000])
    True
    >>> check_time([1999, 1, 1, 23, 59, 60, 999000])
    False
    >>> check_time([1999, 1, 1, 23, 59, 59, 1000000])
    False
    >>> check_time([1999, 1, 1, 23, 60, 59, 999000])
    False
    >>> check_time([1999, 1, 1, 23, 60, 59, 999000])
    False
    >>> check_time([1999, 1, 1, 24, 59, 59, 999000])
    False
    >>> check_time([1999, 1, 31, 23, 59, 59, 999000])
    True
    >>> check_time([1999, 2, 30, 23, 59, 59, 999000])
    False
    >>> check_time([1999, 2, 29, 23, 59, 59, 999000])
    False
    >>> check_time([2000, 2, 29, 23, 59, 59, 999000])
    True
    >>> check_time([2000, 13, 29, 23, 59, 59, 999000])
    False
    """
    try:
        # the constructed object itself is not needed, only whether construction works
        UTCDateTime(*datetime)
    except ValueError:
        return False
    else:
        return True


def restitute_trace(input_tuple):
    """
    Remove the instrument response from a single trace.
    All arguments are packed into one tuple so that the function can be mapped by a worker pool.
    :param input_tuple: tuple (tr, metadata, unit, force) where tr is the trace to correct, metadata an object
    providing get_metadata(seed_id, time=...), unit the output unit of the restitution and force a flag to
    correct traces again that have been corrected before
    :type input_tuple: tuple
    :return: tuple (tr, remove_trace); remove_trace is True if the trace should be dropped by the caller
    :rtype: tuple
    """

    def no_metadata(tr, seed_id):
        print('no metadata file found '
              'for trace {0}'.format(seed_id))
        return tr, True

    # file types read as obspy Parser, 'dataless' and 'dseed' are handled just like 'dless'
    parser_types = ('dless', 'dataless', 'dseed')

    tr, metadata, unit, force = input_tuple

    seed_id = tr.get_id()

    mdata = metadata.get_metadata(seed_id, time=tr.stats.starttime)
    if not mdata:
        return no_metadata(tr, seed_id)

    invtype = mdata['invtype']
    inobj = mdata['data']

    # check, whether this trace has already been corrected
    if 'processing' in tr.stats.keys() \
            and any('remove' in p for p in tr.stats.processing) \
            and not force:
        print("Trace {0} has already been corrected!".format(seed_id))
        return tr, False

    if invtype is None:
        return no_metadata(tr, seed_id)
    if invtype != 'resp' and invtype != 'xml' and invtype not in parser_types:
        # nothing can be done with metadata of unknown type, the trace has to be dropped
        print('Unknown metadata type {0} for trace {1}'.format(invtype, seed_id))
        return tr, True

    stime = tr.stats.starttime
    prefilt = get_prefilt(tr)
    # kwargs for tr.simulate (resp/dataless), stays None if the response is removed using an inventory (xml)
    kwargs = None
    inventory = None
    if invtype == 'resp':
        fresp = find_in_list(inobj, seed_id)
        if not fresp:
            return no_metadata(tr, seed_id)
        seedresp = dict(filename=fresp,
                        date=stime,
                        units=unit)
        kwargs = dict(paz_remove=None, pre_filt=prefilt, seedresp=seedresp)
    elif invtype in parser_types:
        if isinstance(inobj, list):
            parser = Parser(find_in_list(inobj, seed_id))
        else:
            parser = inobj
        paz = parser.get_paz(tr.id, datetime=stime)
        kwargs = dict(pre_filt=prefilt, paz_remove=paz, remove_sensitivity=True)
    else:
        # NOTE(review): inobj is indexed/searched like a list here, confirm that find_in_list works for the
        # object stored by the metadata class if it contains more than one element
        invlist = inobj
        if len(invlist) > 1:
            inventory = find_in_list(invlist, seed_id)
        else:
            inventory = invlist[0]

    remove_trace = False
    # apply restitution to data
    print("Correcting instrument at station %s, channel %s"
          % (tr.stats.station, tr.stats.channel))
    try:
        if kwargs is not None:
            try:
                tr.simulate(**kwargs)
                print("Done")
            except ValueError as e:
                # best effort: the problem is reported, the trace is kept
                print('{0}'.format(e))
        else:
            tr.attach_response(inventory)
            tr.remove_response(output=unit,
                               pre_filt=prefilt)
    except ValueError as e:
        msg0 = 'Response for {0} not found in Parser'.format(seed_id)
        msg1 = 'evalresp failed to calculate response'
        # exceptions have no "message" attribute in Python 3, use the string representation
        message = str(e)
        if msg0 not in message and msg1 not in message:
            # not one of the known failures
            raise
        # restitution done to copies of data thus deleting traces
        # that failed should not be a problem
        remove_trace = True

    return tr, remove_trace


def restitute_data(data, metadata, unit='VEL', force=False, ncores=0):
    """
    takes a data stream and a metadata object and returns the corrected
    waveform data stream
    All traces are taken out of the stream, corrected by a pool of workers using restitute_trace and put back
    into the stream unless restitute_trace marked them for removal. The order of the traces is not preserved.
    :param data: seismic data stream
    :param metadata: object passed on to restitute_trace, which calls its get_metadata method
    :param unit: unit to correct for (default: 'VEL')
    :param force: force restitution for already corrected traces (default:
    False)
    :param ncores: number of cores, passed on to gen_Pool (default: 0)
    :return: corrected data stream
    """

    # data = remove_underscores(data)

    # take a snapshot of the traces first, so the stream is not modified while iterating over it
    traces = list(data)
    input_tuples = [(tr, metadata, unit, force) for tr in traces]
    for tr in traces:
        data.remove(tr)

    pool = gen_Pool(ncores)
    try:
        result = pool.imap_unordered(restitute_trace, input_tuples)
        # no more tasks will be submitted
        pool.close()

        for tr, remove_trace in result:
            if not remove_trace:
                data.traces.append(tr)
    finally:
        # make sure the worker processes are shut down, also if restitution of a trace failed
        # assumes gen_Pool returns a multiprocessing Pool (close/join available) -- TODO confirm
        pool.close()
        pool.join()

    # check if ALL traces could be restituted, take care of large datasets
    # better try restitution for smaller subsets of data (e.g. station by
    # station)

    return data


def get_prefilt(trace, tlow=(0.5, 0.9), thi=(5., 2.), verbosity=0):
    """
    Compute the four corner frequencies of the cosine taper used for pre-filtering a trace.
    The two lower corners are taken from tlow as they are, the two upper corners are obtained by reducing the
    Nyquist frequency of the trace by the percentages given in thi.
    :param trace: seismic data trace
    :type trace: `obspy.core.stream.Trace`
    :param tlow: the two lower corner frequencies of the cosine taper
    :type tlow: tuple or list
    :param thi: percentages of the Nyquist frequency by which the two upper
    corner frequencies lie below the Nyquist frequency
    :type thi: tuple or list
    :param verbosity: verbosity level, a message is printed if it is not zero
    :type verbosity: int
    :return: pre-filt cosine taper corner frequencies
    :rtype: tuple

    ..example::

    >>> st = read()
    >>> get_prefilt(st[0])
    (0.5, 0.9, 47.5, 49.0)
    """
    stats = trace.stats
    if verbosity:
        print("Calculating pre-filter values for %s, %s ..." % (
            stats.station, stats.channel))
    # Nyquist frequency is half the sampling rate
    nyquist = stats.sampling_rate / 2
    # upper corners: Nyquist frequency reduced by the given percentages
    upper = [nyquist - (nyquist * percent / 100.) for percent in (thi[0], thi[1])]
    return tlow[0], tlow[1], upper[0], upper[1]


if __name__ == "__main__":
    # run the examples embedded in the docstrings of this module when it is executed as a script
    from doctest import testmod

    testmod()