stac-utils/pgstac

Support for the STAC API free-text extension

hrodmn opened this issue · 2 comments

The STAC API free-text extension enables a user to provide a parameter q with some free-text search query parameters. This is perhaps redundant with the filter extension, but it is much easier to write a text query using the q parameter. e.g. landsat OR sentinel to get records that contain the term sentinel or landsat, landsat AND sentinel to get records that contain both terms.

Adding support for the q parameter would be particularly useful for improving the collection search functionality!

Postgres has a rich set of functions for text search operations: https://www.postgresql.org/docs/current/functions-textsearch.html

websearch_to_tsquery comes really close to doing all of the work for us, but it can't handle AND operators or parenthesized queries. To solve this we could write a function that converts a q query into a tsquery object:

CREATE OR REPLACE FUNCTION q_to_tsquery (input text)
    RETURNS tsquery
    AS $$
DECLARE
    processed_text text;
BEGIN
    -- replace commas outside quoted text with " | "
    processed_text := regexp_replace(input, ',(?=(?:[^"]*"[^"]*")*[^"]*$)', ' | ', 'g');
    -- replace the logical operators with tsquery equivalents
    processed_text := regexp_replace(processed_text, '\s+AND\s+', ' & ', 'g');
    processed_text := regexp_replace(processed_text, '\s+OR\s+', ' | ', 'g');
    -- surround quoted text contents with single quotes for to_tsquery syntax
    processed_text := regexp_replace(processed_text, '"([^"]+)"', '''\1''', 'g');
    RETURN to_tsquery(processed_text);
END;
$$
LANGUAGE plpgsql;

Then we could do a query on this simulated collections table like this:

SELECT
    collections.id,
    collections.title,
    collections.description,
    ts_rank(
        setweight(
            to_tsvector('english', coalesce(collections.title, '')), 'A'
        ) || setweight(
            to_tsvector('english', coalesce(collections.description, '')), 'B'
        ),
        query
    ) AS rank
FROM
    collections,
    q_to_tsquery (q) AS query
WHERE (
    setweight(to_tsvector('english', coalesce(collections.title, '')), 'A') || 
    setweight(to_tsvector('english', coalesce(collections.description, '')), 'B')
) @@ query
ORDER BY   rank DESC;
CREATE OR REPLACE FUNCTION stac_search_to_where(j jsonb) RETURNS text AS $$
DECLARE
    where_segments text[];
    _where text;
    dtrange tstzrange;
    collections text[];
    geom geometry;
    sdate timestamptz;
    edate timestamptz;
    filterlang text;
    filter jsonb = j->'filter';
    ft_query tsquery;
BEGIN
    RAISE DEBUG 'STAC SEARCH_TO_WHERE: %', j;
    IF j ? 'ids' THEN
        where_segments := where_segments || format('id = ANY (%L) ', to_text_array(j->'ids'));
    END IF;

    IF j ? 'collections' THEN
        collections := to_text_array(j->'collections');
        where_segments := where_segments || format('collection = ANY (%L) ', collections);
    END IF;

    IF j ? 'datetime' THEN
        dtrange := parse_dtrange(j->'datetime');
        sdate := lower(dtrange);
        edate := upper(dtrange);

        where_segments := where_segments || format(' datetime <= %L::timestamptz AND end_datetime >= %L::timestamptz ',
            edate,
            sdate
        );
    END IF;

    IF j ? 'q' THEN
        ft_query := q_to_tsquery(j->>'q');
        where_segments := where_segments || format( $quote$
         ( to_tsvector('english', content->'properties'->>'description') ||
           to_tsvector('english', content->'properties'->>'title') ||
           to_tsvector('english', content->'properties'->'keywords')
         ) @@ %L
         $quote$,
         ft_query
        );
    END IF;

    geom := stac_geom(j);
    IF geom IS NOT NULL THEN
        where_segments := where_segments || format('st_intersects(geometry, %L)',geom);
    END IF;

    filterlang := COALESCE(
        j->>'filter-lang',
        get_setting('default_filter_lang', j->'conf')
    );
    IF NOT filter @? '$.**.op' THEN
        filterlang := 'cql-json';
    END IF;

    IF filterlang NOT IN ('cql-json','cql2-json') AND j ? 'filter' THEN
        RAISE EXCEPTION '% is not a supported filter-lang. Please use cql-json or cql2-json.', filterlang;
    END IF;

    IF j ? 'query' AND j ? 'filter' THEN
        RAISE EXCEPTION 'Can only use either query or filter at one time.';
    END IF;

    IF j ? 'query' THEN
        filter := query_to_cql2(j->'query');
    ELSIF filterlang = 'cql-json' THEN
        filter := cql1_to_cql2(filter);
    END IF;
    RAISE DEBUG 'FILTER: %', filter;
    where_segments := where_segments || cql2_query(filter);
    IF cardinality(where_segments) < 1 THEN
        RETURN ' TRUE ';
    END IF;

    _where := array_to_string(array_remove(where_segments, NULL), ' AND ');

    IF _where IS NULL OR BTRIM(_where) = '' THEN
        RETURN ' TRUE ';
    END IF;
    RETURN _where;

END;
$$ LANGUAGE PLPGSQL STABLE;