gisaf-backend/src/gisaf/models/models_base.py

from typing import Any, Dict, Tuple, Type, ClassVar
import logging
from sqlmodel import Field, SQLModel, MetaData, JSON, TEXT, Relationship, Column, select
from pydantic import computed_field
import numpy as np
import pandas as pd
import geopandas as gpd
import shapely
from sqlalchemy.sql import sqltypes
from geoalchemy2.types import Geometry
from gisaf.database import BaseModel
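
## Map SQLAlchemy column types to Pandas dtypes, used by the cast step of
## the (commented-out) get_df below. Note: 'Int64' (capital I) is Pandas'
## nullable integer dtype, which keeps integer columns intact when they
## contain NaN.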
pandas_cast_map = {
    sqltypes.Integer: 'Int64',
    sqltypes.Float: 'float64',
}

logger = logging.getLogger('model_base_base')


class Model(BaseModel):
    """
    Base mixin class for models that can be converted to a Pandas dataframe with get_df
    """
    # status: ClassVar[str] = 'E'

    def __new__(cls, *args, **kwargs):
        ## Lazily attach a default select() query to the class at first
        ## instantiation, unless a subclass already defines its own
        if not hasattr(cls, 'query'):
            cls.query = select(cls)
        return super().__new__(cls, *args, **kwargs)

    class Meta:
        filtered_columns_on_map: list[str] = []
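
    ## Note (assumption from the attribute name, not verified in this repo):
    ## subclasses override Meta.filtered_columns_on_map with the names of
    ## columns that can be filtered on the web map; the default is none.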

    @classmethod
    def get_store_name(cls):
        if hasattr(cls, '__table__'):
            return cls.__table__.fullname
        elif hasattr(cls, '__table_args__') and 'schema' in cls.__table_args__:
            ## __table_args__ is a mapping here, so use item access
            ## ('.schema' attribute access on a dict was a bug)
            return f"{cls.__table_args__['schema']}.{cls.__tablename__}"
        else:
            return f'{cls.metadata.schema}.{cls.__tablename__}'

    @classmethod
    def get_table_name_prefix(cls):
        return f"{cls.metadata.schema}_{cls.__tablename__}"

    # @classmethod
    # async def get_df(cls, where=None,
    #                  with_related=None, recursive=True,
    #                  cast=True,
    #                  with_only_columns=None,
    #                  geom_as_ewkt=False,
    #                  **kwargs):
    #     """
    #     Return a Pandas dataframe of all records
    #     Optional arguments:
    #     * where: an SQLAlchemy where clause
    #     * with_related: automatically get data from related columns, following the foreign keys in the model definitions
    #     * cast: automatically transform various data in their best python types (eg. with date, time...)
    #     * with_only_columns: fetch only these columns (list of column names)
    #     * geom_as_ewkt: convert geometry columns to EWKB (handy for eg. using upsert_df)
    #     :return:
    #     """
    #     breakpoint()
    #     if hasattr(cls, 'query'):
    #         query = cls.query
    #     else:
    #         query = select(cls)
    #     # if with_related is not False:
    #     #     if with_related or getattr(cls, 'get_gdf_with_related', False):
    #     #         joins = get_join_with(cls, recursive)
    #     #         model_loader = cls.load(**joins)
    #     #         query = _get_query_with_table_names(model_loader)
    #     if where is not None:
    #         query.append_whereclause(where)
    #     if with_only_columns:
    #         query = query.with_only_columns([getattr(cls, colname) for colname in with_only_columns])
    #     ## Got idea from https://github.com/MagicStack/asyncpg/issues/173.
    #     breakpoint()
    #     async with query.bind.raw_pool.acquire() as conn:
    #         ## Convert hstore fields to dict
    #         await conn.set_builtin_type_codec('hstore', codec_name='pg_contrib.hstore')
    #         compiled = query.compile()
    #         stmt = await conn.prepare(compiled.string)
    #         columns = [a.name for a in stmt.get_attributes()]
    #         data = await stmt.fetch(*[compiled.params.get(param) for param in compiled.positiontup])
    #         df = pd.DataFrame(data, columns=columns)
    #     ## Convert primary key columns to Int64:
    #     ## allows NaN, fixing type conversion to float with merge
    #     for pk in [c.name for c in cls.__table__.primary_key.columns]:
    #         if pk in df.columns and df[pk].dtype == 'int64':
    #             df[pk] = df[pk].astype('Int64')
    #     if cast:
    #         ## Cast the type for known types (datetime, ...)
    #         for column_name in df.columns:
    #             col = getattr(query.columns, column_name, None)
    #             if col is None:
    #                 logger.debug(f'Cannot get column {column_name} in query for model {cls.__name__}')
    #                 continue
    #             column_type = getattr(query.columns, column_name).type
    #             ## XXX: Needs refinement, eg. nullable -> Int64 ...
    #             if column_type.__class__ in pandas_cast_map:
    #                 df[column_name] = df[column_name].astype(pandas_cast_map[column_type.__class__])
    #             elif isinstance(column_type, (sqltypes.Date, sqltypes.DateTime)):
    #                 ## Dates, times
    #                 df[column_name] = pd.to_datetime(df[column_name])
    #             #elif isinstance(column_type, (sqltypes.Integer, sqltypes.Float)):
    #             #    ## Numeric
    #             #    df[column_name] = pd.to_numeric(df[column_name], errors='coerce')
    #             ## XXX: keeping this note about the "char" SQL type, but the fix of #9694 makes it unnecessary
    #             #elif isinstance(column_type, sqltypes.CHAR) or (isinstance(column_type, sqltypes.String) and column_type.length == 1):
    #             #    ## Workaround for bytes being used for string of length 1 (not sure - why???)
    #             #    df[column_name] = df[column_name].str.decode('utf-8')
    #     ## Rename the columns, removing the schema_table prefix for the columns in that model
    #     prefix = cls.get_table_name_prefix()
    #     prefix_length = len(prefix) + 1
    #     rename_map = {colname: colname[prefix_length:] for colname in df.columns if colname.startswith(prefix)}
    #     df.rename(columns=rename_map, inplace=True)
    #     ## Optionally convert geometry columns to EWKB
    #     if geom_as_ewkt:
    #         geometry_columns = [col.name for col in cls.__table__.columns if isinstance(col.type, Geometry)]
    #         for column in geometry_columns:
    #             ## Fixed from df.geom: convert each geometry column, not only 'geom'
    #             df[column] = shapely.to_wkb(shapely.from_wkb(df[column]), hex=True, include_srid=True)
    #     return df
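
    ## ------------------------------------------------------------------
    ## A minimal, hedged sketch of what a modern get_df could look like,
    ## using SQLAlchemy 2.x result mappings instead of the raw asyncpg pool
    ## used by the commented-out version above. This is NOT the original
    ## implementation: `session_factory` (an async sessionmaker) is an
    ## assumed parameter, and the related-model joins, type casting and
    ## EWKB conversion of the original are intentionally left out.
    @classmethod
    async def get_df_sketch(cls, session_factory, where=None) -> pd.DataFrame:
        """Illustrative only: fetch records as a Pandas dataframe."""
        ## Select the table rather than the ORM class, to get plain rows
        ## keyed by column name
        query = select(cls.__table__)
        if where is not None:
            query = query.where(where)
        async with session_factory() as session:
            result = await session.execute(query)
            ## RowMapping objects are dict-like, so DataFrame accepts them
            df = pd.DataFrame(result.mappings().all())
        ## Primary keys as nullable Int64, as in the original get_df:
        ## avoids int -> float coercion when a merge introduces NaN
        for pk in [c.name for c in cls.__table__.primary_key.columns]:
            if pk in df.columns and df[pk].dtype == 'int64':
                df[pk] = df[pk].astype('Int64')
        return df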