Unverified Commit 07f506b3 authored by Lauren Adoram-Kershner's avatar Lauren Adoram-Kershner Committed by GitHub
Browse files

'Convenience of Life' functions and kwargs (#496)

* initial commit

* adding type check bug fix

* addressing comments round 1

* removing iterative kwargs from function calls

* addressing comments round 2

* adding note, fixing typo

* changing name of function results kw for easier parsing
parent 2d7e81b3
Loading
Loading
Loading
Loading
+3 −57
Original line number Diff line number Diff line
@@ -11,7 +11,7 @@ from skimage import transform as tf
from scipy.spatial import ConvexHull
from scipy.spatial import Voronoi
import shapely.geometry
from shapely.geometry import Polygon, Point
from shapely.geometry import Polygon, MultiPolygon, Point
from shapely.affinity import scale
from shapely import wkt

@@ -267,61 +267,6 @@ def nearest(pt, search):
    """
    return np.argmin(np.sum((search - pt)**2, axis=1))

def find_side(side, Session):
    """
    Find the lat/lon coordinates of the eastern- or western-most side of the
    eastern-/western-most image footprint in the database.

    Parameters
    ----------
    side : str
           which extreme of the set of images you want; either 'east' or
           'west' (case-insensitive)

    Session : sqlalchemy.orm.sessionmaker
              session factory used to query the images table

    Returns
    -------
    lon : np.ndarray
          longitudes of the two vertices bounding the requested side

    lat : np.ndarray
          latitudes of the two vertices bounding the requested side

    Raises
    ------
    ValueError
        if side is not 'east' or 'west'
    """
    side = side.lower()
    if side not in ('east', 'west'):
        raise ValueError(f"side must be 'east' or 'west', got {side!r}")

    # choose the PostGIS extremum function and sort order for the given side
    func = {'east': 'st_xmax', 'west': 'st_xmin'}[side]
    order = {'east': 'desc', 'west': 'asc'}[side]

    query = f"""
    select ST_AsText(geom) from images
    order by {func}(geom) {order}
    limit 1 """

    session = Session()
    geom = session.execute(query).first()
    geom = wkt.loads(geom[0])
    session.close()

    # find the eastern-/western-most side of the footprint
    fp = geom.minimum_rotated_rectangle
    coords = np.column_stack(fp.exterior.xy)
    fp_lon, fp_lat = zip(*coords)

    # the exterior is always traversed counter-clockwise, so locate the
    # min/max lon index and use points i and i+1 as the side's endpoints
    if side == 'east':
        i = np.argmax(fp_lon)
    else:
        i = np.argmin(fp_lon)
    lon = fp_lon[i:i+2]
    lat = fp_lat[i:i+2]

    return np.array(lon), np.array(lat)

def create_points_along_line(p1, p2, npts):
    """
    Compute a set of nodes equally spaced between
@@ -393,8 +338,8 @@ def distribute_points_classic(geom, nspts, ewpts, **kwargs):
    valid : list
            of point coordinates in the form [(x1,y1), (x2,y2), ..., (xn, yn)]
    """
    geom_coords = np.column_stack(geom.exterior.xy)

    geom_coords = np.column_stack(geom.envelope.exterior.xy)
    coords = np.array(list(zip(*geom.envelope.exterior.xy))[:-1])

    ll = coords[0]
@@ -458,6 +403,7 @@ def distribute_points_new(geom, nspts, ewpts, Session):
            of point coordinates in the form [(x1,y1), (x2,y2), ..., (xn, yn)]
    """
    coords = np.array(list(zip(*geom.envelope.exterior.xy))[:-1])

    ll = coords[0]
    lr = coords[1]
    ur = coords[2]
+123 −5
Original line number Diff line number Diff line
@@ -1002,10 +1002,12 @@ class NetworkEdge(Edge):
        mask = [i for i in range(len(points)) if self.intersection.contains(points[i])]
        return mask

    def measures(self, filters={}):
        """
        Query all measures attached to either the source or the destination
        image of this edge.

        Parameters
        ----------
        filters : dict, optional
                  mapping of Measures attribute name -> value; only measures
                  whose attributes equal the given values are returned

        Returns
        -------
        res : list
              of Measures objects, expunged (detached) from the session so
              they remain usable after it closes
        """
        # NOTE: callers must not mutate the default; copy defensively so the
        # shared mutable default argument cannot accumulate state.
        filters = dict(filters)
        with self.parent.session_scope() as session:
            query = session.query(Measures).filter(sqlalchemy.or_(Measures.imageid == self.source['node_id'], Measures.imageid == self.destination['node_id']))
            # apply each attribute == value filter from the caller
            for attr, value in filters.items():
                query = query.filter(getattr(Measures, attr) == value)
            res = query.all()
            session.expunge_all()
        return res

@@ -1113,3 +1115,119 @@ class NetworkEdge(Edge):
                bad[o.source_measure_id] = 1
                bad[o.destin_measure_id] = 1
        return Counter(bad)

    def find_IQR_outliers(self,
                          scaling=1.5,
                          filters=None,
                          n_tolerance=10):
        """
        Based on the interquartile range (IQR), find the measure outliers
        from line and sample shifts.

        Parameters
        ----------
        scaling : float
                  scaling factor applied to the IQR when determining the
                  outlier range

        filters : dict, optional
                  filters applied to a match's source measure; only source
                  measures which satisfy the filter are used in the outlier
                  calculation. Defaults to
                  {'template_metric': 1, 'template_shift': 0}.

        n_tolerance : int
                      minimum number of measures needed to calculate outliers

        Returns
        -------
        measure_ids : list or None
                      measure ids corresponding to outliers; None when there
                      are too few filtered measures

        resultlog : list
                    of dicts describing the status of the outlier search
        """
        # avoid a shared mutable default argument
        if filters is None:
            filters = {'template_metric': 1, 'template_shift': 0}

        resultlog = []
        currentlog = {'edge': f'({self.source}, {self.destination})',
                      'status': ''}

        ref_measures = self.measures(filters=filters)
        ref_measure_ids = {m.id for m in ref_measures}

        # use matches where the source measure is a reference measure
        matches = self.matches
        matches = matches[matches['source_measure_id'].isin(ref_measure_ids)]

        # check there is enough data to statistically calculate outliers
        if len(matches) == 0:
            currentlog['status'] = 'no filtered measures in edge.'
            resultlog.append(currentlog)
            return None, resultlog
        elif len(matches) < n_tolerance:
            currentlog['status'] = f'{len(matches)} < {n_tolerance} filtered measures are not statistically significant.'
            resultlog.append(currentlog)
            return None, resultlog

        # shift between the measured and a priori destination coordinates
        sampleShift = matches['destination_x'] - matches['destination_apriori_x']
        lineShift = matches['destination_y'] - matches['destination_apriori_y']

        sample_q1, sample_q3 = np.percentile(sampleShift, [25, 75])
        line_q1, line_q3 = np.percentile(lineShift, [25, 75])
        sample_iqr = sample_q3 - sample_q1
        line_iqr = line_q3 - line_q1

        # Tukey-style fences: [q1 - scaling*IQR, q3 + scaling*IQR]
        sample_lowerBound = sample_q1 - (scaling * sample_iqr)
        sample_upperBound = sample_q3 + (scaling * sample_iqr)
        line_lowerBound = line_q1 - (scaling * line_iqr)
        line_upperBound = line_q3 + (scaling * line_iqr)

        sample_outlier_matches = sampleShift[(sampleShift <= sample_lowerBound) | (sampleShift >= sample_upperBound)]
        line_outlier_matches = lineShift[(lineShift <= line_lowerBound) | (lineShift >= line_upperBound)]
        outlier_match_idx = np.concatenate((sample_outlier_matches.index.values, line_outlier_matches.index.values))

        measure_ids = matches.loc[outlier_match_idx]['destin_measure_id'].values
        currentlog['status'] = f'{len(measure_ids)} outliers found.'
        resultlog.append(currentlog)

        return measure_ids, resultlog


    def ignore_outliers(self, outlier_method='IQR', **kwargs):
        """
        Find outlier measures with the given method and mark them as ignored
        in the database.

        Parameters
        ----------
        outlier_method : str
                         method used to determine outliers.
                         Current methods:
                            - interquartile range ('IQR') of line/sample shift

        kwargs : dict
                 keyword arguments passed through to the outlier function
                 (e.g. scaling, filters, n_tolerance for 'IQR')

        Returns
        -------
        resultlog : list
                    of dicts describing the status of finding and ignoring
                    outliers

        Raises
        ------
        ValueError
            if outlier_method is not a supported method
        """
        outlier_dict = {'IQR': self.find_IQR_outliers}
        try:
            outlier_func = outlier_dict[outlier_method]
        except KeyError:
            raise ValueError(f"Unknown outlier_method {outlier_method!r}; "
                             f"supported methods: {sorted(outlier_dict)}")

        outlier_destination_mids, resultlog = outlier_func(**kwargs)

        # nothing to ignore (e.g. too few filtered measures); return the log
        if outlier_destination_mids is None:
            return resultlog

        # flag each outlier measure as ignored in the database
        with self.parent.session_scope() as session:
            for m in outlier_destination_mids:
                currentlog = {'measure id': m,
                              'status': 'Ignored'}
                session.query(Measures).filter(Measures.id == m).update({'ignore': True})
                resultlog.append(currentlog)

        return resultlog

+28 −8
Original line number Diff line number Diff line
@@ -1351,7 +1351,11 @@ class NetworkCandidateGraph(CandidateGraph):
                'overlaps': Overlay,
                'overlap' : Overlay,
                'o' :Overlay,
                4: Overlay
                4: Overlay,
                'image': Images,
                'images': Images,
                'i': Images,
                5: Images
            }

    def config_from_file(self, filepath):
@@ -1568,7 +1572,18 @@ class NetworkCandidateGraph(CandidateGraph):
                                   json.dumps(msg, cls=JsonEncoder))
        return job_counter + 1

    def apply(self, function, on='edge', args=(), walltime='01:00:00', chunksize=1000, arraychunk=25, filters={}, query_string='', reapply=False, **kwargs):
    def apply(self,
            function,
            on='edge',
            args=(),
            walltime='01:00:00',
            chunksize=1000,
            arraychunk=25,
            filters={},
            query_string='',
            reapply=False,
            log_dir=None,
            **kwargs):
        """
        A mirror of the apply function from the standard CandidateGraph object. This implementation
        dispatches the job to the cluster as an independent operation instead of applying an arbitrary function
@@ -1622,6 +1637,9 @@ class NetworkCandidateGraph(CandidateGraph):
        reapply : bool
                  Flag indicating whether you want to resubmit jobs that are still on the queue
                  after an initial apply due to slurm launching errors.
        log_dir: str
                 absolute path of directory used to store the jobs logs, defaults to location
                 indicated in the configuration file.

        kwargs : dict
                 Of keyword arguments passed to the function being applied
@@ -1648,6 +1666,8 @@ class NetworkCandidateGraph(CandidateGraph):
        >>> njobs = ncg.apply('spatial.overlap.place_points_in_overlap',\
            on='overlaps', distribute_points_kwargs=distribute_points_kwargs)
        """
        if log_dir is None:
            log_dir=self.config['cluster']['cluster_log_dir']

        job_counter = self.queue_length

@@ -1669,7 +1689,7 @@ class NetworkCandidateGraph(CandidateGraph):
                job_counter = self._push_row_messages(onobj, on, function, walltime, filters, query_string, args, kwargs)
            elif isinstance(onobj, list):
                job_counter = self._push_iterable_message(onobj, function, walltime, args, kwargs)
            elif isinstance(onobj, (Node, NetworkNode, Edge, NetworkEdge)):
            elif isinstance(onobj, (nx.classes.reportviews.EdgeView, nx.classes.reportviews.NodeView)):
                job_counter = self._push_obj_messages(onobj, function, walltime, args, kwargs)
            else:
                raise TypeError('The type of the `on` argument is not understood. Must be a database model, iterable, Node or Edge.')
@@ -1695,7 +1715,7 @@ class NetworkCandidateGraph(CandidateGraph):
                     mem_per_cpu=self.config['cluster']['processing_memory'],
                     time=walltime,
                     partition=self.config['cluster']['queue'],
                     output=self.config['cluster']['cluster_log_dir']+f'/autocnet.{function_name}-%j')
                     output=log_dir+f'/autocnet.{function}-%j')
        submitter.submit(array='1-{}%{}'.format(job_counter,arraychunk), chunksize=chunksize)
        return job_counter

+55 −0
Original line number Diff line number Diff line
@@ -145,3 +145,58 @@ def update_measure_from_jigsaw(point, path, ncg=None, **kwargs):

        session.commit()
    return resultlog


# This is not a permanent placement for this function
# TO DO: create a new module for parsing/cleaning points from a controlnetwork
from scipy.stats import zscore
from plio.io.io_gdal import GeoDataset
from autocnet.io.db.model import Images
import pvl
def null_measure_ignore(point, size_x, size_y, valid_tol, verbose=False, ncg=None, **kwargs):
    """
    Ignore measures whose surrounding image window is mostly invalid pixels.

    For every measure on the given point, read a (2*size_x) x (2*size_y)
    window centered on the measure from the measure's image, z-score it, and
    compute the percentage of non-NaN pixels. Measures whose valid
    percentage falls below valid_tol are flagged with ignore=True.

    Parameters
    ----------
    point : obj
            a point record with an .id attribute; its measures are examined

    size_x : int
             half-width (samples) of the window read around each measure

    size_y : int
             half-height (lines) of the window read around each measure

    valid_tol : float
                minimum percentage (0-100) of valid pixels required to keep
                a measure

    verbose : bool
              when True, print progress information

    ncg : NetworkCandidateGraph
          graph providing the database session; required

    Returns
    -------
    resultlog : list
                of dicts with each measure id and its resulting status

    Raises
    ------
    BrokenPipeError
        if no database-backed NetworkCandidateGraph is supplied
    """
    # guard ncg itself first: the default is None, and ncg.Session would
    # otherwise raise AttributeError instead of the intended error
    if ncg is None or not ncg.Session:
        raise BrokenPipeError('This func requires a database session from a NetworkCandidateGraph.')

    # map ISIS pixel types onto numpy dtypes
    isis2np_types = {
            "UnsignedByte": "uint8",
            "SignedWord": "int16",
            "Real": "float64"}

    resultlog = []
    with ncg.session_scope() as session:
        pid = point.id
        if verbose:
            print('point id: ', pid)
        measures = session.query(Measures).filter(Measures.pointid == pid).order_by(Measures.id).all()
        if verbose:
            print('number of measures: ', len(measures))
        for measure in measures:
            currentlog = {'measureid': measure.id,
                          'status': 'No change'}
            m_image = session.query(Images).filter(Images.id == measure.imageid).one()
            cube = GeoDataset(m_image.path)

            # window bounds centered on the measure's (sample, line)
            start_x = int(measure.sample - size_x)
            start_y = int(measure.line - size_y)
            stop_x = int(measure.sample + size_x)
            stop_y = int(measure.line + size_y)

            # GDAL-style window: x offset, y offset, x size, y size
            pixels = [start_x, start_y, stop_x - start_x, stop_y - start_y]
            dtype = isis2np_types[pvl.load(cube.file_name)["IsisCube"]["Core"]["Pixels"]["Type"]]
            arr = cube.read_array(pixels=pixels, dtype=dtype)

            # z-score each column; NaNs mark invalid (null) pixels
            z = zscore(arr, axis=0)
            n_nan = np.isnan(z).sum()
            percent_valid = (1 - n_nan / z.size) * 100
            if percent_valid < valid_tol:
                session.query(Measures).\
                        filter(Measures.pointid == pid, Measures.id == measure.id).\
                        update({'ignore': True})
                currentlog['status'] = 'Ignored'

            resultlog.append(currentlog)
    return resultlog
+88 −93

File changed.

Preview size limit exceeded, changes collapsed.

Loading