Skip to content

filters

All filter conversion functions. Validation is performed in the UI, but conversion to Expressions is handled by these functions.

check_flags(flags, filters) #

Converts flags & values to boolean vaex expression for use as a filter.

Note

Registered as a vaex.expression.Expression method via the register_function decorator.

Parameters:

Name Type Description Default
flags Expression

bit flags expressions

required
filters Expression

which filters to apply

required

Returns: Boolean expression of filters

Source code in src/sdss_explorer/util/filters.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
@vx.register_function(multiprocessing=True)
def check_flags(flags: vx.Expression, filters: vx.Expression) -> vx.Expression:
    """Reduces bit flags against a filter mask into a boolean filter expression.

    Note:
        Registered as a `vaex.expression.Expression` method via the
        `register_function` decorator.

    Args:
        flags: bit flags expressions
        filters: which filters to apply

    Returns:
        Boolean expression of filters
    """
    # a row passes when any of its flag words shares at least one set bit
    # with the corresponding filter mask entry
    matched = np.logical_and(flags, filters)
    return np.any(matched, axis=1)

filter_carton_mapper(df, mapping, carton, mapper, combotype='AND', invert=False) #

Filters a list of cartons and mappers

Based on code written by Andy Casey for github.com/sdss/semaphore

Source code in src/sdss_explorer/util/filters.py
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
def filter_carton_mapper(
    df: vx.DataFrame,
    mapping: vx.DataFrame,
    carton: list[str],
    mapper: list[str],
    combotype: str = "AND",
    invert: bool = False,
) -> vx.Expression | None:
    """
    Filters a list of cartons and mappers

    Based on code written by Andy Casey for github.com/sdss/semaphore
    """
    want_mapper = len(mapper) != 0
    want_carton = len(carton) != 0

    # nothing selected -> no filter at all
    if not (want_mapper or want_carton):
        return None

    # build the row mask over the mapping table
    if not want_mapper:
        mask = mapping["alt_name"].isin(carton).values
    elif not want_carton:
        mask = mapping["mapper"].isin(mapper).values
    else:
        mask = operator_map[combotype](
            mapping["mapper"].isin(mapper).values,
            mapping["alt_name"].isin(carton).values,
        )

    # translate masked rows into (flag word, bit offset) pairs
    # NOTE: hardcoded nbits as 8, and nflags as 57
    active_bits = np.flatnonzero(mask)
    word, offset = np.divmod(active_bits, 8)
    valid = word < 57  # ensure bits fit within the flags array

    # pack each unique flag word's offsets into a single uint8 mask
    packed = np.zeros(57, dtype="uint8")
    unique_words, inverse = np.unique(word[valid], return_inverse=True)
    for k, w in enumerate(unique_words):
        packed[w] = np.bitwise_or.reduce(1 << offset[valid][inverse == k])

    cmp_filter = df.func.check_flags(df["sdss5_target_flags"], packed)

    if invert:
        logger.debug("inverting cmpfilter")
        return ~cmp_filter
    return cmp_filter

filter_crossmatch(df, crossmatch, cmtype) #

Generates a filter from a crossmatch list of identifiers

Parameters:

Name Type Description Default
df DataFrame

dataframe to filter

required
crossmatch str

multiline string of identifiers

required
cmtype str

identifier type

required

Returns:

Name Type Description
None Expression | None

if nothing parsed to crossmatch

Expression | None

vx.Expression: if there is a valid filter

Raises:

Type Description
ValueError

if crossmatch fails to convert all to integers

TypeError

if tic_v8 with spall (not supported)

AssertionError

if users pass an unsupported crossmatch type

Source code in src/sdss_explorer/util/filters.py
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
def filter_crossmatch(df: vx.DataFrame, crossmatch: str,
                      cmtype: str) -> vx.Expression | None:
    """
    Generates a filter from a crossmatch list of identifiers

    Args:
        df: dataframe to filter
        crossmatch: multiline string of identifiers
        cmtype: identifier type

    Returns:
        None: if nothing passed to crossmatch
        vx.Expression: if there is a valid filter

    Raises:
        ValueError: if crossmatch fails to convert all to integers
        TypeError: if tic_v8 with spall (not supported)
        AssertionError: if users pass an unsupported crossmatch type

    """
    assert cmtype in crossmatchList, "unsupported crossmatch column passed"

    # bhm doesnt fetch tic_v8's so flag
    if (cmtype == "tic_v8") and (df["pipeline"].unique()[0] == "spall"):
        raise TypeError("tic_v8 not supported with spall dataset")

    if len(crossmatch) == 0:
        return None

    # NOTE: you can rate limit, but it adds O(n) to every crossmatch operation,
    # where n is length of str
    #
    # assert len(crossmatch.count('\n')) < 100_000, 'too many identifiers!'

    # for checking our dtype
    col = df[crossmatchList[cmtype]]
    try:
        if col.dtype == "string":
            identifiers = crossmatch.strip().split("\n")
        else:
            # non-string columns require integer identifiers
            identifiers = [int(i) for i in crossmatch.strip().split("\n")]
    except Exception as e:
        # makes errors more informative; chain cause for debuggability
        raise ValueError("failed to convert to integer identifiers") from e

    return col.isin(identifiers)

filter_expression(df, columns, expression, invert=False) #

Converts expression to valid filter

Source code in src/sdss_explorer/util/filters.py
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
def filter_expression(
    df: vx.DataFrame,
    columns: list[str],
    expression: str,
    invert: bool = False,
):
    """Converts expression to valid filter

    Args:
        df: dataframe to filter
        columns: valid data column names the expression may reference
        expression: user-supplied filter string, e.g. "a < col <= b" or "col == x"
        invert: whether to invert the resulting filter

    Returns:
        vx.Expression: boolean filter expression over `df`

    Raises:
        AssertionError: if the expression is malformed, references unknown
            columns, or contains ACE-like tokens
    """
    # first, remove all spaces
    expr = expression.replace(" ", "")
    logger.debug(f"expr: {expr}")
    num_regex = r"^-?[0-9]+(?:\.[0-9]+)?(?:e-?\d+)?$"

    # guardrail ace here -- very important
    # checked once up front (loop-invariant) instead of per subexpression
    illegals = ["eval", "exec", "import", "__main__"]
    for illegal in illegals:
        if illegal in expression:
            logger.critical(
                "this user attempted to use ACE-like expressions!")
            assert False, "Your session and IP has been logged."

    # get expression in parts, saving split via () regex
    subexpressions = re.split(r"(&|\||\)|\()", expr)
    n = 1
    for i, sub in enumerate(subexpressions):
        # saved regex info -> skip w/o enumerating
        if sub in ["", "&", "(", ")", "|"]:
            continue

        parts = re.split(r"(>=|<=|<|>|==|!=)", sub)
        if len(parts) == 1:
            assert False, f"expression {n} is invalid: no comparator"
        elif len(parts) == 5:
            # first, check that parts 2 & 4 are lt or lte comparators
            assert (
                re.fullmatch(r"<=|<", parts[1]) is not None
                and re.fullmatch(r"<=|<", parts[3]) is not None
            ), (f"expression {n} is invalid: not a proper 3-part inequality (a < col <= b)"
                )

            # check middle
            assert parts[2] in columns, (
                f"expression {n} is invalid: must be comparing a data column (a < col <= b)"
            )

            # check a and b are numeric & if a < b; validating b here avoids
            # an uninformative ValueError from float() below
            assert re.match(num_regex, parts[0]) is not None, (
                f"expression {n} is invalid: must be numeric for numerical data column"
            )
            assert re.match(num_regex, parts[4]) is not None, (
                f"expression {n} is invalid: must be numeric for numerical data column"
            )
            assert float(parts[0]) < float(parts[-1]), (
                f"expression {n} is invalid: invalid inequality (a > b for a < col < b)"
            )

            # change the expression to valid format
            subexpressions[i] = (
                f"(({parts[0]}{parts[1]}{parts[2]})&({parts[2]}{parts[3]}{parts[4]}))"
            )

        elif len(parts) == 3:
            check = (parts[0] in columns, parts[2] in columns)
            if np.any(check):
                if check[0]:
                    col = parts[0]
                    num = parts[2]
                elif check[1]:
                    col = parts[2]
                    num = parts[0]
                dtype = str(df[col].dtype)
                if "float" in dtype or "int" in dtype:
                    assert re.match(num_regex, num) is not None, (
                        f"expression {n} is invalid: must be numeric for numerical data column"
                    )
            else:
                assert False, f"expression {n} is invalid: one part must be column"
            assert re.match(r">=|<=|<|>|==|!=", parts[1]) is not None, (
                f"expression {n} is invalid: middle is not comparator")

            # change the expression in subexpression
            subexpressions[i] = "(" + sub + ")"
        else:
            assert False, f"expression {n} is invalid: too many comparators"

        # enumerate the expr counter
        n = n + 1

    # create expression as str
    expr = "(" + "".join(subexpressions) + ")"
    logger.debug(f"expr final: {expr}")

    # set filter corresponding to inverts & exit
    if invert:  # NOTE: df will never be None unless something horrible happens
        logger.debug("inverting expression")
        return ~df[expr]
    else:
        return df[expr]

filter_flags(df, flags, dataset, invert=False) #

Generates a filter for flags

Parameters:

Name Type Description Default
df DataFrame

dataframe to filter

required
flags list[str]

list of flags to update

required
dataset str

specific dataset to filter on, used to check whether result_flags is present

required
invert bool

whether to invert

False
Source code in src/sdss_explorer/util/filters.py
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
def filter_flags(df: vx.DataFrame,
                 flags: list[str],
                 dataset: str,
                 invert: bool = False) -> vx.Expression | None:
    """
    Generates a filter for flags

    Args:
        df: dataframe to filter
        flags: list of flags to update
        dataset: specific dataset to filter on, used to check whether result_flags is present
        invert: whether to invert

    Returns:
        Boolean filter expression, or None when no flag filters apply.
    """
    filters = []
    for flag in flags:
        if flag == "purely non-flagged":
            # skip: 'mwmlite' lacks the flag column this relies on (see docstring)
            if dataset == "mwmlite":
                continue
            # boss-only pipeline exceptions use zwarning_flags filtering instead
            if dataset in ("spall", "lineforest"):
                filters.append("zwarning_flags!=0")
                continue
        filters.append(flagList[flag])

    # Determine the final concatenated filter
    if not filters:
        return None

    # Join the filters with ")&(" and wrap them in outer parentheses
    concat_filter: vx.Expression = df[f"(({')&('.join(filters)}))"]
    if invert:
        logger.debug("inverting flagfilter")
        concat_filter = ~concat_filter
    return concat_filter