import datetime
import logging
import re
from typing import Any
from collections import defaultdict, OrderedDict
from math import isnan
from mimetypes import guess_type
from zipfile import ZipFile, BadZipFile

import geopandas as gpd
import numpy as np
import pandas as pd
from shapely.wkb import dumps as dumps_wkb
from sqlalchemy import and_
from sqlalchemy.sql.schema import Column

from gisaf.config import conf
#from .models.admin import FileImport
from gisaf.models.admin import FeatureImportData, BasketImportResult
from gisaf.models.raw_survey import RawSurveyModel
from gisaf.models.survey import Surveyor, Accuracy, Equipment, AccuracyEquimentSurveyorMapping
from gisaf.models.tags import Tags
from gisaf.redis_tools import store as redis_store
from gisaf.registry import registry
from gisaf.utils import (
    delete_df, upsert_df,
    SHAPELY_TYPE_TO_MAPBOX_TYPE, DEFAULT_MAPBOX_LAYOUT,
    MAPBOX_COLOR_ATTRIBUTE_NAME, MAPBOX_OPACITY_ATTRIBUTE_NAME,
    DEFAULT_MAPBOX_PAINT, gisTypeSymbolMap,
)

logger = logging.getLogger(__name__)


class ImportError(Exception):
    pass


class Importer:
    """
    Base class for all importers.
    The main process is executed by do_import(file).
    Subclasses should define read_file, import_df and get_model.
    """
    basket: Any = None  # type hint: baskets.Basket

    async def do_import(self, file_record, dry_run=False, **kwargs) -> BasketImportResult:
        """
        Return: a BasketImportResult instance, or a message string,
        or a tuple or a list like (message, details for user feedback).
        """
        df = await self.read_file(file_record)
        model = self.get_model(file_record)
        return await self.import_df(df, model, file_record, dry_run=dry_run, **kwargs)

    async def read_file(self, file_record):
        """
        Subclasses should define the operation for reading the file into a DataFrame.
        """
        raise NotImplementedError('Nothing defined for reading the file')

    async def import_df(self, df, model, file_record, dry_run=False, **kwargs):
        """
        Subclasses should define the operation for importing the DataFrame (save to DB).
        """
        raise NotImplementedError('Nothing defined for importing the file')

    def get_model(self, file_record):
        """
        Subclasses should define the operation for returning the model for the file.
        """
        raise NotImplementedError('Nothing defined for identifying the model')

    def preprocess(self, df):
        """
        Hook for pre-processing the dataframe read from the file.
        """
        pass


class RawSurveyImporter(Importer):
    """
    Importer for raw survey points
    """
    def get_model(self, file_record):
        return RawSurveyModel

    async def read_file(self, file_record):
        try:
            df = pd.read_csv(file_record.path, header=None)
        except pd.errors.ParserError:
            raise ImportError('File format error (cannot parse)')

        ## Take only the first 5 columns
        if len(df.columns) < 5:
            raise ImportError('File format error (at least 5 columns needed)')
        df = df[range(5)]
        df.dropna(inplace=True)
        df.columns = ['orig_id', 'easting', 'northing', 'elevation', 'category']

        ## Strip extra characters
        df['category'] = df['category'].str.strip()
        df['orig_id'] = df.orig_id.astype(str)

        ## Create a GeoDataFrame
        gdf = gpd.GeoDataFrame(
            df,
            geometry=gpd.points_from_xy(df.easting, df.northing, df.elevation),
            crs=conf.raw_survey['spatial_sys_ref']
        )
        return gdf

    async def import_df(self, gdf, model, file_record, dry_run=False,
                        remove_misplaced=False, **kwargs):
        user_feedback = OrderedDict()
        user_feedback['Points in file'] = len(gdf)

        ## Date from the file name
        ## TODO: should be added to FileImport (or other metadata)
        fname_search = re.match(r'^(\S+)-(\d\d\d\d)-(\d\d)-(\d\d).*$', file_record['name'])
        if not fname_search:
            raise ImportError('The file name is not OK '
                              '(format should be: "PPP-DESCRIPTION-YYYY-MM-DD", '
                              'PPP being the project name, DESCRIPTION is optional and discarded)')
        date = datetime.date(day=int(fname_search.group(4)),
                             month=int(fname_search.group(3)),
                             year=int(fname_search.group(2)))
        ## TODO: Check if date == file_record.date

        accuracies = await AccuracyEquimentSurveyorMapping.get_df(with_related=True)
        ## Cleanup
        accuracies.rename(columns={
            'gisaf_survey_surveyor_id': 'surveyor_id',
            'gisaf_survey_surveyor_name': 'surveyor',
            'gisaf_survey_equipment_id': 'equipment_id',
            'gisaf_survey_equipment_name': 'equipment',
            'gisaf_survey_accuracy_id': 'accuracy_id',
            'gisaf_survey_accuracy_name': 'accuracy_name',
            'gisaf_survey_accuracy_accuracy': 'accuracy',
            'geometry__3': 'type',
        }, inplace=True)
        accuracies.drop(columns=['surveyor__1', 'equipment__2', 'accuracy__4'], inplace=True)
        accuracy = accuracies.loc[
            (accuracies.surveyor==file_record.surveyor)
            & (accuracies.equipment==file_record.equipment)
            & (accuracies.type=='Point')
        ]
        if len(accuracy) == 0:
            raise ImportError(f'No accuracy defined for surveyor {file_record.surveyor} '
                              f'and equipment {file_record.equipment}')
        elif len(accuracy) > 1:
            raise ImportError(f'More than 1 accuracy defined for surveyor {file_record.surveyor} '
                              f'and equipment {file_record.equipment}')
        accuracy = accuracy.iloc[0]

        def _apply_point_hash_function(rec):
            return point_hash_function(
                date,
                file_record.project_id,
                file_record.surveyor_id,
                file_record.equipment_id,
                rec.name + 1  ## Index in the file
            )
        gdf['id'] = gdf.apply(_apply_point_hash_function, axis=1)

        ## Add information to all records in the gdf
        gdf['date'] = pd.to_datetime(date)
        gdf['project_id'] = file_record.project_id
        gdf['srvyr_id'] = file_record.surveyor_id
        gdf['equip_id'] = file_record.equipment_id
        gdf['accur_id'] = accuracy.accuracy_id

        df_raw_survey = await model.get_df(
            where=and_(
                model.date==date,
                model.project_id==file_record.project_id,
                model.srvyr_id==file_record.surveyor_id,
                model.equip_id==file_record.equipment_id,
            )
        )
        user_feedback['Existing raw_survey points'] = len(df_raw_survey)

        ## Import to raw_survey_data table
        ## PostGis specific: add SRID
        gdf['geom'] = gdf['geometry'].apply(
            lambda g: dumps_wkb(g, srid=conf.geo.raw_survey.srid, hex=True))
        if not dry_run:
            await upsert_df(gdf, model)

        ## Keep track of points without known categories for user feedback
        df_unknown_categories = gdf[~gdf.category.isin(registry.categories.index)]
        unknown_categories = df_unknown_categories[['category', 'orig_id']].groupby('category').count()
        for row in unknown_categories.itertuples():
            user_feedback[f'Category {row[0]} (unknown)'] = row[1]

        gdf = gdf.merge(registry.categories, left_on='category', right_index=True)

        ## Import to raw survey models tables
        for category_name, category_gdf in gdf.groupby('category'):
            category = registry.categories.loc[category_name]
            user_feedback[f"Category {category.name} ({category.store_name})"] = len(category_gdf)
            if not dry_run:
                await upsert_df(category_gdf, category.raw_survey_model)

        ## Auto import: import from raw survey models tables to geometry tables
        gdf['geom'] = gdf.to_crs(conf.srid).geometry.apply(
            lambda g: dumps_wkb(g, srid=conf.srid, hex=True))
        for category_name, category_gdf in gdf.groupby('category'):
            category = registry.categories.loc[category_name]
            if category.auto_import and category.gis_type == 'Point':
                if not dry_run:
                    await upsert_df(category_gdf, category.model)
                    if remove_misplaced:
                        gdf_db = await category.model.get_geo_df()
                        gdf_db.reset_index(inplace=True)
                        ## Finding by id does not work since points might
                        ## have been placed in a different category
                        #matches = gdf_db.loc[gdf_db['id'].isin(category_gdf.id)]
                        matches = gdf_db.loc[
                            gdf_db.orig_id.isin(category_gdf.orig_id)
                            & (gdf_db.srvyr_id == file_record.surveyor_id)
                            & (gdf_db.equip_id == file_record.equipment_id)
                            & (gdf_db.date.dt.date == date)
                        ]
                        matches_merged = matches.merge(
                            category_gdf.to_crs(conf.srid),
                            on='orig_id',
                            suffixes=('', '_new')
                        )
                        ## Filter matches with orig_id and different geometry
                        misplaced = matches_merged.loc[
                            matches_merged['geometry'] != matches_merged['geometry_new']
                        ]
                        if len(misplaced) > 0:
                            await delete_df(misplaced, category.model)
            pass

        user_feedback['Points imported'] = len(gdf)

        return BasketImportResult(
            message=f"Import successful{' (dry run)' if dry_run else ''}",
            details=user_feedback,
        )
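

## The two definitions below are illustrative sketches only; they are not used by
## the Gisaf import pipeline. The first shows the hooks a concrete Importer
## subclass is expected to provide (get_model, read_file, import_df); reading a
## plain CSV and looking the model up by store are assumptions made for the
## example. The second shows how an import is typically triggered as a dry run;
## `file_record` is assumed to be a FileImport-like record with the attributes
## read by RawSurveyImporter (path, name like "PPP-YYYY-MM-DD", project_id,
## surveyor, surveyor_id, equipment, equipment_id).

class ExampleCsvImporter(Importer):
    """
    Example only: import a plain CSV file into the geometry model registered
    for the file's store.
    """
    def get_model(self, file_record):
        return registry.geom.get(file_record.store)

    async def read_file(self, file_record):
        return pd.read_csv(file_record.path)

    async def import_df(self, df, model, file_record, dry_run=False, **kwargs):
        user_feedback = OrderedDict()
        user_feedback['Rows in file'] = len(df)
        if not dry_run:
            await upsert_df(df, model)
        return BasketImportResult(
            message=f"Import successful{' (dry run)' if dry_run else ''}",
            details=user_feedback,
        )


async def example_raw_survey_dry_run(file_record):
    """
    Example only: run a raw survey import as a dry run (nothing is written to
    the DB) and log the user feedback collected by the importer.
    """
    importer = RawSurveyImporter()
    result = await importer.do_import(file_record, dry_run=True)
    for key, value in result.details.items():
        logger.info('%s: %s', key, value)
    return result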


class GeoDataImporter(Importer):
    """
    Base class for geo data importers
    """
    def get_model(self, file_record):
        return registry.geom.get(file_record.store)

    async def read_file(self, file_record):
        ## TODO: guess_type supports pathlib.Path from Python 3.8, so it should be:
        #if guess_type(file_record.path)[0] == 'application/zip':
        ## For Python 3.7 compatibility:
        if guess_type(str(file_record.path.absolute()))[0] == 'application/zip':
            return gpd.read_file(f'zip://{file_record.path}')
        else:
            return gpd.read_file(file_record.path)

    async def get_metadata_from_raw_points(self, gdf, model, file_record):
        pass

    async def import_df(self, gdf_file_input, model, file_record, dry_run=False,
                        live_publish_results=False):
        ## XXX: Removed qml processing (Mapbox)
        #qml_file_ = glob.glob(path.join(tmpdir, '*.qml'))
        #if qml_file_:
        #    process_qml(qml_file_[0], model)
        new_or_updated_items = {}
        now = datetime.datetime.now()
        bad_geometry_types = {}
        user_feedback = OrderedDict()
        to_keep = []

        ## If the model has a project (e.g. all survey models, see BaseSurveyModel),
        ## get only the features with the project given in the fileImport object
        if hasattr(model, 'project_id'):
            if np.isnan(file_record.project_id):
                raise ImportError('No project defined for that file')
            query = model.project_id==file_record.project_id
        else:
            query = None

        if file_record.status and isinstance(model.status, Column):
            query = and_(query, model.status==file_record.status)

        ## Explode the input (in case of multi-geometries)
        gdf_file = gdf_file_input.explode(ignore_index=True)

        gdf_db = await model.get_geo_df(query, with_related=False)
        gdf_db.reset_index(inplace=True)

        ## Cleanup input: drop empty geometries and duplicates
        nb_all = len(gdf_file)
        gdf_file.dropna(subset=['geometry'], inplace=True)
        gdf_file = gdf_file[~gdf_file.geometry.is_empty]
        nb_empty_geom = nb_all - len(gdf_file)
        gdf_file.drop_duplicates(inplace=True)
        nb_duplicate = nb_all - nb_empty_geom - len(gdf_file)

        ## Reproject
        gdf_file = gdf_file.copy().to_crs(conf.crs['db'])

        synonyms = getattr(model, 'synonyms', {'ID': 'id'})
        ## Apply synonyms
        for k, v in synonyms.items():
            if k in gdf_file.columns:
                gdf_file[v] = gdf_file[k]

        if 'orig_id' in gdf_file.columns:
            gdf_file.drop(columns='orig_id', inplace=True)

        ## Make sure the types of the columns of gdf_file and gdf_db match
        for col_name, col_type in gdf_db.dtypes.items():
            if col_name in gdf_file.columns:
                try:
                    coerce = gdf_file[col_name].dtypes != col_type
                except TypeError:
                    ## Workaround for strange bug in Pandas: data type not understood
                    coerce = gdf_file[col_name].dtypes.name != col_type.name
                if coerce:
                    gdf_file[col_name] = gdf_file[col_name].astype(col_type)

        ## Add status
        if 'status' not in gdf_file.columns:
            gdf_file['status'] = file_record.status

        ## TODO: define exactly the meaning of the 'id' attribute in shapefiles,
        ## it gets quite confusing.
        ## Drop the id, as it might conflict with the one in the DB.
        ## For now in this import process only, rename eventual 'id' to 'orig_id'
        if 'id' not in gdf_file.columns:
            gdf_file['id'] = None
        gdf_file.rename(columns={'id': 'orig_id'}, inplace=True)
        gdf_file['orig_id'] = gdf_file['orig_id'].astype(str).astype(object)

        ## Get geometries as EWKB (hex) for gdf_file and gdf_db
        gdf_file['ewkb'] = gdf_file.geometry.apply(lambda g: dumps_wkb(g, srid=conf.srid, hex=True))
        gdf_db['ewkb'] = gdf_db.geometry.apply(lambda g: dumps_wkb(g, srid=conf.srid, hex=True))

        self.preprocess(gdf_file)
        await self.get_metadata_from_raw_points(gdf_file, model, file_record)

        ## Diffs: using attribute and spatial merges, see https://geopandas.org/mergingdata.html
        ## Attribute diff: identify identical rows,
        ## so they won't be touched and don't need a spatial join
        df_identical = gdf_db.merge(gdf_file, how='inner')

        ## Need an id
        if 'id' not in gdf_file.columns:
            gdf_file['id'] = gdf_file.index

        gdf_diff = gdf_db.merge(gdf_file, on='ewkb', how='outer',
                                indicator=True, suffixes=('', '__file'))

        to_update = gdf_diff.loc[gdf_diff._merge=='both']
        ## Filter out the identical rows from to_update: update non-geometry attributes,
        ## keeping the id
        to_update = to_update.loc[~to_update.id.isin(df_identical.id)]

        to_delete = gdf_diff.loc[gdf_diff._merge=='left_only']

        ## TODO?: try with geometric operations to determine nearest geometry
        ## from the file's to the DB's.
        ## See:
        # - _binary_predicate in geopandas.array, which returns True/False,
        #   when it could return the id of the matching geometry
        ## Meanwhile, just wait for sjoin to be implemented:
        # - https://github.com/geopandas/geopandas/pull/1271
        #almost_equals = to_delete.geom_almost_equals(gdf_db.geometry)
        #equals = to_delete.geom_equals(gdf_db.geometry)
        #equals_exact = to_delete.geom_equals_exact(gdf_db.geometry, tolerance=0.1)

        to_insert = gdf_diff.loc[gdf_diff._merge=='right_only']

        ## Take column names from the gdf_file (*__file) and rename to the gdf_db.
        ## Geometry is a bit of an exception.
        to_insert_columns = [col for col in to_insert.columns
                             if col.endswith('__file') and col != 'geometry__file']
        to_insert = to_insert.drop(columns={col[:-6] for col in to_insert_columns})\
            .rename(columns={col: col[:-6] for col in to_insert_columns})\
            .merge(gdf_diff[['orig_id', 'ewkb']], left_index=True, right_index=True)\
            .drop(columns=['id'])

        to_update_columns = [col for col in to_update.columns
                             if col.endswith('__file')
                             and col not in ('geometry__file', 'id__file')]
        to_update = to_update.drop(columns={col[:-6] for col in to_update_columns})\
            .rename(columns={col: col[:-6] for col in to_update_columns})

        to_insert['geometry'] = to_insert['geometry__file']
        to_insert.rename(columns={'orig_id_x': 'orig_id', 'ewkb_x': 'ewkb'}, inplace=True)
        to_insert.drop(columns=['orig_id_y', 'ewkb_y'], inplace=True)

        ## Add project to all records in the new records
        to_insert['project_id'] = file_record.project_id

        ## Write to DB
        if not dry_run:
            for _gdf in [to_insert, to_update]:
                _gdf['geom'] = _gdf.geometry.apply(
                    lambda g: dumps_wkb(g, srid=conf.srid, hex=True)
                )
            if len(to_insert) > 0:
                inserted = await upsert_df(to_insert, model)
                to_insert['id'] = inserted['id']
            await upsert_df(to_update, model)
            await delete_df(to_delete, model)

            ## Add feature_import_data
            feature_import_data = await FeatureImportData.get_df(
                where=FeatureImportData.store==file_record.store
            )
            fid_updated = feature_import_data[['id', 'feature_id']]\
                .merge(to_update, left_on='feature_id', right_on='id')
            fid_updated['file_path'] = str(file_record.path)
            fid_updated['store'] = file_record.store
            fid_updated['origin'] = 'shapefile'  ## TODO: gpkg
            fid_updated['time'] = now
            fid_updated['file_md5'] = file_record.md5
            fid_updated.drop(columns=['orig_id'], inplace=True)
            fid_updated.rename(columns={'orig_id__file': 'orig_id'}, inplace=True)
            ## Write to DB
            await upsert_df(fid_updated, FeatureImportData)
            ## Same for to_insert
            if len(to_insert) > 0:
                fid_inserted = pd.DataFrame(columns=fid_updated.columns)
                fid_inserted['feature_id'] = inserted['id']
                ## FIXME: fid_inserted['orig_id']
                #fid_inserted['orig_id'] = inserted['orig_id']
                fid_inserted['store'] = file_record.store
                fid_inserted['file_path'] = str(file_record.path)
                fid_inserted['origin'] = 'shapefile'  ## TODO: gpkg
                fid_inserted['time'] = now
                fid_inserted['file_md5'] = file_record.md5
                await upsert_df(fid_inserted, FeatureImportData)

            ## Find and update tags
            if len(to_insert) > 0:
                tags = await Tags.get_geo_df(where=Tags.store==file_record.store)
                if len(tags) > 0:
                    matching_tags = gpd.sjoin(to_insert, tags.reset_index())
                    new_ref_ids = tags.merge(matching_tags, left_on='ref_id',
                                             right_on='ref_id')[['ref_id', 'id_left']]
                    new_tags = tags.merge(new_ref_ids, on='ref_id')
                    new_tags.drop(columns='ref_id', inplace=True)
                    new_tags.rename(columns={'id_left': 'ref_id'}, inplace=True)
                    await upsert_df(new_tags, Tags)

        ## Publish on Gisaf live
        gis_type = gdf_file.gis_type.iloc[0]
        mapbox_type = SHAPELY_TYPE_TO_MAPBOX_TYPE.get(gis_type, None)
        mapbox_paint = DEFAULT_MAPBOX_PAINT.get(mapbox_type, {}).copy()
        mapbox_layout = DEFAULT_MAPBOX_LAYOUT.get(mapbox_type, {}).copy()
        symbol = gisTypeSymbolMap.get(gis_type, '\ue02e')

        live_layers = {
            'insert': {
                'data': to_insert,
                'color': 'blue',
                'opacity': 0.3,
            },
            'update': {
                'data': to_update,
                'color': 'green',
                'opacity': 0.3,
            },
            'delete': {
                'data': to_delete,
                'color': 'red',
                'opacity': 0.3,
            },
        }
        if live_publish_results:
            for name, defn in live_layers.items():
                mapbox_paint[MAPBOX_COLOR_ATTRIBUTE_NAME[mapbox_type]] = defn['color']
                mapbox_paint[MAPBOX_OPACITY_ATTRIBUTE_NAME[mapbox_type]] = defn['opacity']
                await redis_store.publish_gdf(
                    f'Last import: {name}',
                    defn['data'],
                    status='*',
                    mapbox_paint=mapbox_paint,
                    mapbox_layout=mapbox_layout,
                    viewable_role='reviewer',
                )

        user_feedback['Total in file'] = nb_all
        if nb_empty_geom > 0:
            user_feedback['Empty single geoms in file'] = nb_empty_geom
        if nb_duplicate > 0:
            user_feedback['Duplicates in file'] = nb_duplicate
        user_feedback['Total in database'] = len(gdf_db)
        user_feedback['Identical (no change)'] = len(df_identical)
        user_feedback['Update attributes'] = len(to_update)
        user_feedback['Insert'] = len(to_insert)
        user_feedback['Delete'] = len(to_delete)
        #summary = len(gdf_db) + len(to_insert) - len(to_delete) - len(gdf_file)
        #user_feedback['Summary (+/-)'] = f'{summary:d}'

        return BasketImportResult(
            time=now,
            message=f"Import successful{' (dry run)' if dry_run else ''}",
            details=user_feedback,
        )


class LineWorkImporter(GeoDataImporter):
    """
    Importer for line work geo data files (from survey)
    """
    async def get_metadata_from_raw_points(self, gdf, model, file_record):
        """
        Find raw points in the shape and set equip_id, srvyr_id, accur_id
        """
        raw_points = await model.raw_model.get_geo_df(
            where=model.raw_model.project_id==file_record.project_id)
        raw_points.drop(columns='orig_id', inplace=True)
        raw_points.to_crs(conf.crs['db'], inplace=True)

        ## Take a small buffer around the points, due to approximations
        ## from reprojection, inaccurate line work...
        buf_raw_points = raw_points.copy()
        buf_raw_points['geometry'] = raw_points.buffer(0.0000005, resolution=3)

        ## Drop columns which interfere with metadata coming from raw points
        ## In future, the importer might detect and use such metadata in the input file
        gdf.drop(
            columns=set(gdf.columns).intersection(
                {'date', 'accur_id', 'srvyr_id', 'equip_id', 'project_id'}
            ),
            inplace=True
        )

        sjoin = gpd.sjoin(gdf, buf_raw_points, op='intersects', how='left')
        sjoin.drop(columns=['accur_id', 'status_left', 'status_right', 'ewkb'], inplace=True)
        sjoin.rename(columns={'index_right': 'raw_point_id'}, inplace=True)
        sjoin['raw_point_id'] = sjoin['raw_point_id'].astype('Int64')

        ## For surveyor info
        ## Get the last surveyor, equipment (by date)
        ## Sort raw points by date, maintaining the index (raw points)
        cols_surveyor_info = ['srvyr_id', 'equip_id', 'date']
        sjoin.set_index([sjoin.index, 'date'], inplace=True)
        sjoin.sort_index(ascending=False, inplace=True)
        sjoin.reset_index(inplace=True)
        sjoin.set_index('level_0', inplace=True)
        sjoin.index.name = 'feature_id'
        gdf[cols_surveyor_info] = sjoin.groupby(sjoin.index).head(1)[cols_surveyor_info]

        ## Fix issue when all dates are NaT (reset_index converts that column to float,
        ## see https://github.com/pandas-dev/pandas/issues/30517)
        gdf.date.fillna(pd.NaT, inplace=True)

        ## A bit the same for accuracies: take the lowest one of the raw points for each feature
        accuracies = await Accuracy.get_df()
        accuracies.set_index('id', inplace=True)
        accuracy_mappings = await AccuracyEquimentSurveyorMapping.get_df(
            where=AccuracyEquimentSurveyorMapping.geometry_type == 'Line_work'
        )
        accuracy_mappings.drop(columns='geometry_type', inplace=True)
        sjoin = sjoin.reset_index().merge(
            accuracy_mappings,
            left_on=['srvyr_id', 'equip_id'],
            right_on=['surveyor_id', 'equipment_id']
        )
        sjoin = sjoin.merge(accuracies[['accuracy']], left_on='accuracy_id', right_index=True)
        sjoin.set_index(['feature_id', 'accuracy'], inplace=True)
        sjoin.sort_index(ascending=False, inplace=True)
        sjoin.reset_index(inplace=True)
        sjoin.set_index('feature_id', inplace=True)
        gdf['accur_id'] = sjoin.groupby(sjoin.index).head(1)['accuracy_id']


hash_max = 2**32
hash_date_min = datetime.date(year=1900, month=1, day=1).toordinal()


def point_hash_function(date, project_id, surveyor_id, equipment_id, i):
    """
    A function which takes the given arguments and generates a "unique",
    positive integer hash, reasonably speaking.
    We could just use hash() % 2**32: that would be quite reasonable,
    but hard to reverse.
    A simple reversible hash would ideally be a number generated
    with human readability in mind, like:

    return int('{:02d}{:03d}{:03d}{:01d}{:d}'.format(
        project_id % hash_project_max,
        date.toordinal() - hash_date_min,
        surveyor_id % hash_surveyor_max,
        equipment_id % hash_equipment_max,
        i
    )) % 2**32

    But I don't think it's reliable enough, and it would actually get super
    confusing as soon as one of the parameters overflows its limits.
    Using https://stackoverflow.com/questions/4273466/reversible-hash-function,
    comment 7, try a reversible function with some mathematical base.
    Choose 2 big primes from https://www.bigprimes.net/archive/prime/14000000/ :
    32416190039 and 32416190071

    num = '{:03d}{:05d}{:03d}{:02d}{:05d}'.format(
        project_id % hash_project_max,
        date.toordinal() - hash_date_min,
        surveyor_id % hash_surveyor_max,
        equipment_id % hash_equipment_max,
        i
    )

    The reverse hash function, for the record, if we ever use a reversible hash
    based on coprimes, would look like, maybe:

    return (h_ * 32416190071) % 2**32
    # Then we would return the inverse of the format function in point_hash_function, easy ;)

    No good, so hash() and no smart reverse: have to use brute force to reverse.
    """
    h_ = hash((
        project_id,
        date.toordinal() - hash_date_min,
        surveyor_id,
        equipment_id,
        i
    ))
    return h_ % hash_max
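

## Illustrative sketch only, not called anywhere in Gisaf: show how a raw survey
## point id is derived with point_hash_function, mirroring the call made in
## RawSurveyImporter.import_df. The project, surveyor and equipment ids below are
## made-up values for the example.
def example_point_hash():
    """
    Example only: compute the id of the first point (i=1) of a hypothetical
    survey file named like 'PRJ-2021-03-15.csv'.
    """
    example_id = point_hash_function(
        datetime.date(2021, 3, 15),  ## date parsed from the file name
        project_id=12,               ## hypothetical project
        surveyor_id=3,               ## hypothetical surveyor
        equipment_id=2,              ## hypothetical equipment
        i=1,                         ## 1-based index of the point in the file
    )
    ## The result is a non-negative integer below hash_max (2**32)
    assert 0 <= example_id < hash_max
    return example_id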