snputils.stats

from .fstats import f2, f3, f4, d_stat, f4_ratio, fst

__all__ = [
    "f2",
    "f3",
    "f4",
    "d_stat",
    "f4_ratio",
    "fst",
]
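
For orientation only (not part of the module source), the import below pulls in the public helpers listed in `__all__`; it assumes the package is installed under the name `snputils`.

    # Public f-statistics API re-exported by snputils.stats
    from snputils.stats import f2, f3, f4, d_stat, f4_ratio, fst
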
def f2( data: Union[Any, Tuple[numpy.ndarray, numpy.ndarray, List[str]]], pop1: Optional[Sequence[str]] = None, pop2: Optional[Sequence[str]] = None, sample_labels: Optional[Sequence[str]] = None, apply_correction: bool = True, block_size: int = 5000, blocks: Optional[numpy.ndarray] = None, ancestry: Union[int, str, NoneType] = None, laiobj: Optional[Any] = None, include_self: bool = False) -> pandas.core.frame.DataFrame:
def f2(
    data: Union[Any, Tuple[np.ndarray, np.ndarray, List[str]]],
    pop1: Optional[Sequence[str]] = None,
    pop2: Optional[Sequence[str]] = None,
    sample_labels: Optional[Sequence[str]] = None,
    apply_correction: bool = True,
    block_size: int = 5000,
    blocks: Optional[np.ndarray] = None,
    ancestry: Optional[Union[str, int]] = None,
    laiobj: Optional[Any] = None,
    include_self: bool = False,
) -> pd.DataFrame:
    """
    Compute f2-statistics with block-jackknife standard errors.

    Args:
        data: Either a SNPObject or a tuple (afs, counts, pops), where `afs` and `counts` are arrays of
              shape (n_snps, n_pops). If a SNPObject is provided, `sample_labels` are used to aggregate samples to populations.
        pop1, pop2: Populations to compute f2 for.
            If None:
                - with include_self=False (default), compute only off-diagonal pairs i<j.
                - with include_self=True, compute all pairs including diagonals (i<=j).
        sample_labels: Population label per sample (aligned with SNPObject.samples) when `data` is a SNPObject.
        apply_correction: Apply small-sample correction p*(1-p)/(n-1) per population.
            When True, SNPs with n<=1 in either population are excluded at that SNP.
        block_size: Number of SNPs per jackknife block (default 5000 SNPs). Ignored if `blocks` is provided.
        blocks: Optional explicit block id per SNP. If provided, overrides `block_size`.
        ancestry: Optional ancestry code to mask genotypes to a specific ancestry before aggregation. Requires LAI.
        laiobj: Optional `LocalAncestryObject` used to derive SNP-level LAI if it is not already present on the SNPObject.
        include_self: When pop1 and pop2 are None, also compute diagonal pairs (a population with itself). Defaults to False.

    Returns:
        Pandas DataFrame with columns: pop1, pop2, est, se, z, p, n_blocks, n_snps
    """
    afs, counts, pops = _prepare_inputs(data, sample_labels, ancestry=ancestry, laiobj=laiobj)
    n_snps, n_pops = afs.shape
    block_ids, block_lengths = _build_blocks(n_snps, blocks, block_size)
    n_blocks = block_lengths.size

    if pop1 is None and pop2 is None:
        if include_self:
            pop_pairs = [(pops[i], pops[j]) for i in range(n_pops) for j in range(i, n_pops)]
        else:
            pop_pairs = [(pops[i], pops[j]) for i in range(n_pops) for j in range(i + 1, n_pops)]
    else:
        if pop1 is None or pop2 is None:
            raise ValueError("Both pop1 and pop2 must be provided when one is provided")
        if len(pop1) != len(pop2):
            raise ValueError("pop1 and pop2 must have the same length")
        pop_pairs = list(zip(pop1, pop2))

    name_to_idx = {p: i for i, p in enumerate(pops)}

    rows: List[Dict[str, Union[str, float, int]]] = []
    # Precompute per-block index lists for efficiency
    block_bins = [np.where(block_ids == b)[0] for b in range(n_blocks)]

    for p1, p2 in pop_pairs:
        i = name_to_idx[p1]
        j = name_to_idx[p2]
        p_i = afs[:, i]
        p_j = afs[:, j]
        n_i = counts[:, i].astype(float)
        n_j = counts[:, j].astype(float)

        # per-SNP f2 with optional small-sample correction.
        # If apply_correction is True, SNPs with n<=1 for either population are excluded.
        with np.errstate(divide="ignore", invalid="ignore"):
            num = (p_i - p_j) ** 2
            if apply_correction:
                corr_i = np.where(n_i > 1, (p_i * (1.0 - p_i)) / (n_i - 1.0), np.nan)
                corr_j = np.where(n_j > 1, (p_j * (1.0 - p_j)) / (n_j - 1.0), np.nan)
                num = num - corr_i - corr_j
        # SNPs contribute only if the correction was defined (finite) and AFs present
        if apply_correction:
            snp_mask = np.isfinite(num)
        else:
            snp_mask = np.isfinite(num) & (n_i > 0) & (n_j > 0)

        # Aggregate per block: sums of num and counts. Use NaN to mark empty blocks.
        num_block_sums = np.full(n_blocks, np.nan, dtype=float)
        den_block_sums = np.full(n_blocks, np.nan, dtype=float)
        for b, idx in enumerate(block_bins):
            if idx.size == 0:
                continue
            idx2 = idx[snp_mask[idx]]
            if idx2.size == 0:
                continue
            num_block_sums[b] = float(np.nansum(num[idx2]))
            den_block_sums[b] = float(idx2.size)

        res = _jackknife_ratio_from_block_sums(num_block_sums, den_block_sums)
        rows.append(
            {
                "pop1": p1,
                "pop2": p2,
                "est": res.est,
                "se": res.se,
                "z": res.z,
                "p": res.p,
                "n_blocks": res.n_blocks,
                "n_snps": res.n_snps,
            }
        )

    return pd.DataFrame(rows)

Compute f2-statistics with block-jackknife standard errors.

Arguments:
  • data: Either a SNPObject or a tuple (afs, counts, pops), where afs and counts are arrays of shape (n_snps, n_pops). If a SNPObject is provided, sample_labels are used to aggregate samples to populations.
  • pop1, pop2: Populations to compute f2 for. If None: with include_self=False (default), compute only off-diagonal pairs i<j; with include_self=True, compute all pairs including diagonals (i<=j).
  • sample_labels: Population label per sample (aligned with SNPObject.samples) when data is a SNPObject.
  • apply_correction: Apply small-sample correction p*(1-p)/(n-1) per population. When True, SNPs with n<=1 in either population are excluded at that SNP.
  • block_size: Number of SNPs per jackknife block (default 5000 SNPs). Ignored if blocks is provided.
  • blocks: Optional explicit block id per SNP. If provided, overrides block_size.
  • ancestry: Optional ancestry code to mask genotypes to a specific ancestry before aggregation. Requires LAI.
  • laiobj: Optional LocalAncestryObject used to derive SNP-level LAI if it is not already present on the SNPObject.
  • include_self: When pop1 and pop2 are None, also compute diagonal pairs (a population with itself) if True; defaults to False.
Returns:

Pandas DataFrame with columns: pop1, pop2, est, se, z, p, n_blocks, n_snps
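
Example (illustrative sketch, not from the library source; it assumes the tuple input form (afs, counts, pops) documented above, and the population labels and random data are made up):

    import numpy as np
    from snputils.stats import f2

    rng = np.random.default_rng(0)
    pops = ["YRI", "CEU", "CHB"]      # hypothetical population labels
    n_snps = 20_000

    afs = rng.uniform(0.05, 0.95, size=(n_snps, len(pops)))   # allele frequency per SNP x pop
    counts = np.full((n_snps, len(pops)), 100)                # haplotype counts per SNP x pop

    # All off-diagonal pairs with the default 5000-SNP jackknife blocks.
    all_pairs = f2((afs, counts, pops))
    print(all_pairs[["pop1", "pop2", "est", "se", "z", "p"]])

    # A single explicit pair; pop1 and pop2 must have equal length.
    one_pair = f2((afs, counts, pops), pop1=["YRI"], pop2=["CEU"])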

def f3( data: Union[Any, Tuple[numpy.ndarray, numpy.ndarray, List[str]]], target: Optional[Sequence[str]] = None, ref1: Optional[Sequence[str]] = None, ref2: Optional[Sequence[str]] = None, sample_labels: Optional[Sequence[str]] = None, apply_correction: bool = True, block_size: int = 5000, blocks: Optional[numpy.ndarray] = None, ancestry: Union[int, str, NoneType] = None, laiobj: Optional[Any] = None) -> pandas.core.frame.DataFrame:
def f3(
    data: Union[Any, Tuple[np.ndarray, np.ndarray, List[str]]],
    target: Optional[Sequence[str]] = None,
    ref1: Optional[Sequence[str]] = None,
    ref2: Optional[Sequence[str]] = None,
    sample_labels: Optional[Sequence[str]] = None,
    apply_correction: bool = True,
    block_size: int = 5000,
    blocks: Optional[np.ndarray] = None,
    ancestry: Optional[Union[str, int]] = None,
    laiobj: Optional[Any] = None,
) -> pd.DataFrame:
    """
    Compute f3-statistics f3(target; ref1, ref2) with block-jackknife SE.

    - `block_size` is the number of SNPs per jackknife block (default 5000 SNPs). Ignored if `blocks` is provided.
    - If `target`, `ref1`, and `ref2` are all None, compute f3 for all combinations where each role can be any population.
    - If `ancestry` is provided, genotypes will be masked to the specified ancestry using LAI before aggregation.
    - If `apply_correction` is True, subtract the finite sample term p_t*(1-p_t)/(n_t-1) from the per-SNP product.
        When True, SNPs with n_t<=1 are excluded.
    """
    afs, counts, pops = _prepare_inputs(data, sample_labels, ancestry=ancestry, laiobj=laiobj)
    n_snps, _ = afs.shape
    block_ids, block_lengths = _build_blocks(n_snps, blocks, block_size)
    n_blocks = block_lengths.size

    if target is None and ref1 is None and ref2 is None:
        triples = [(a, b, c) for a in pops for b in pops for c in pops]
    else:
        if target is None or ref1 is None or ref2 is None:
            raise ValueError("target, ref1, and ref2 must all be provided if any is provided")
        if not (len(target) == len(ref1) == len(ref2)):
            raise ValueError("target, ref1, ref2 must have the same length")
        triples = list(zip(target, ref1, ref2))

    name_to_idx = {p: i for i, p in enumerate(pops)}
    block_bins = [np.where(block_ids == b)[0] for b in range(n_blocks)]
    rows: List[Dict[str, Union[str, float, int]]] = []

    for t, r1, r2 in triples:
        it = name_to_idx[t]
        i1 = name_to_idx[r1]
        i2 = name_to_idx[r2]
        pt = afs[:, it]
        p1 = afs[:, i1]
        p2 = afs[:, i2]
        nt = counts[:, it].astype(float)
        n1 = counts[:, i1].astype(float)
        n2 = counts[:, i2].astype(float)

        with np.errstate(invalid="ignore", divide="ignore"):
            num = (pt - p1) * (pt - p2)
            if apply_correction:
                corr_t = np.where(nt > 1, (pt * (1.0 - pt)) / (nt - 1.0), np.nan)
                num = num - corr_t
        if apply_correction:
            # Require a valid correction on the target (via finite num); references only need n > 0
            snp_mask = np.isfinite(num) & (n1 > 0) & (n2 > 0)
        else:
            snp_mask = np.isfinite(num) & (nt > 0) & (n1 > 0) & (n2 > 0)

        num_block_sums = np.full(n_blocks, np.nan, dtype=float)
        den_block_sums = np.full(n_blocks, np.nan, dtype=float)
        for b, idx in enumerate(block_bins):
            if idx.size == 0:
                continue
            idx2 = idx[snp_mask[idx]]
            if idx2.size == 0:
                continue
            num_block_sums[b] = float(np.nansum(num[idx2]))
            den_block_sums[b] = float(idx2.size)

        res = _jackknife_ratio_from_block_sums(num_block_sums, den_block_sums)
        rows.append(
            {
                "target": t,
                "ref1": r1,
                "ref2": r2,
                "est": res.est,
                "se": res.se,
                "z": res.z,
                "p": res.p,
                "n_blocks": res.n_blocks,
                "n_snps": res.n_snps,
            }
        )

    return pd.DataFrame(rows)

Compute f3-statistics f3(target; ref1, ref2) with block-jackknife SE.

  • block_size is the number of SNPs per jackknife block (default 5000 SNPs). Ignored if blocks is provided.
  • If target, ref1, and ref2 are all None, compute f3 for all combinations where each role can be any population.
  • If ancestry is provided, genotypes will be masked to the specified ancestry using LAI before aggregation.
  • If apply_correction is True, subtract the finite sample term p_t*(1-p_t)/(n_t-1) from the per-SNP product. When True, SNPs with n_t<=1 are excluded.
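
Example (hedged usage sketch, not from the library; it uses the tuple input form documented above with invented population names and data):

    import numpy as np
    from snputils.stats import f3

    rng = np.random.default_rng(1)
    pops = ["Admixed", "SourceA", "SourceB"]    # hypothetical labels
    afs = rng.uniform(0.05, 0.95, size=(50_000, len(pops)))
    counts = np.full((50_000, len(pops)), 60)

    # f3(target; ref1, ref2): a clearly negative estimate (z well below 0) is the
    # usual signal that the target is admixed between groups related to the refs.
    res = f3((afs, counts, pops), target=["Admixed"], ref1=["SourceA"], ref2=["SourceB"])
    print(res[["target", "ref1", "ref2", "est", "z", "p"]])
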
def f4( data: Union[Any, Tuple[numpy.ndarray, numpy.ndarray, List[str]]], a: Optional[Sequence[str]] = None, b: Optional[Sequence[str]] = None, c: Optional[Sequence[str]] = None, d: Optional[Sequence[str]] = None, sample_labels: Optional[Sequence[str]] = None, block_size: int = 5000, blocks: Optional[numpy.ndarray] = None, ancestry: Union[int, str, NoneType] = None, laiobj: Optional[Any] = None) -> pandas.core.frame.DataFrame:
def f4(
    data: Union[Any, Tuple[np.ndarray, np.ndarray, List[str]]],
    a: Optional[Sequence[str]] = None,
    b: Optional[Sequence[str]] = None,
    c: Optional[Sequence[str]] = None,
    d: Optional[Sequence[str]] = None,
    sample_labels: Optional[Sequence[str]] = None,
    block_size: int = 5000,
    blocks: Optional[np.ndarray] = None,
    ancestry: Optional[Union[str, int]] = None,
    laiobj: Optional[Any] = None,
) -> pd.DataFrame:
    """
    Compute f4-statistics f4(a, b; c, d) with block-jackknife SE.

    - `block_size` is the number of SNPs per jackknife block (default 5000 SNPs). Ignored if `blocks` is provided.
    - If `ancestry` is provided, genotypes will be masked to the specified ancestry using LAI before aggregation.
    """
    afs, counts, pops = _prepare_inputs(data, sample_labels, ancestry=ancestry, laiobj=laiobj)
    n_snps, _ = afs.shape
    block_ids, block_lengths = _build_blocks(n_snps, blocks, block_size)
    n_blocks = block_lengths.size

    if a is None and b is None and c is None and d is None:
        quads = [(w, x, y, z) for w in pops for x in pops for y in pops for z in pops]
    else:
        if a is None or b is None or c is None or d is None:
            raise ValueError("a, b, c, d must all be provided if any is provided")
        if not (len(a) == len(b) == len(c) == len(d)):
            raise ValueError("a, b, c, d must have the same length")
        quads = list(zip(a, b, c, d))

    name_to_idx = {p: i for i, p in enumerate(pops)}
    block_bins = [np.where(block_ids == b)[0] for b in range(n_blocks)]
    rows: List[Dict[str, Union[str, float, int]]] = []

    for pa, pb, pc, dpop in quads:
        ia = name_to_idx[pa]
        ib = name_to_idx[pb]
        ic = name_to_idx[pc]
        id_ = name_to_idx[dpop]

        A = afs[:, ia]
        B = afs[:, ib]
        C = afs[:, ic]
        D = afs[:, id_]
        na = counts[:, ia].astype(float)
        nb = counts[:, ib].astype(float)
        nc = counts[:, ic].astype(float)
        nd = counts[:, id_].astype(float)

        with np.errstate(invalid="ignore"):
            num = (A - B) * (C - D)
        snp_mask = np.isfinite(num) & (na > 0) & (nb > 0) & (nc > 0) & (nd > 0)

        num_block_sums = np.full(n_blocks, np.nan, dtype=float)
        den_block_sums = np.full(n_blocks, np.nan, dtype=float)
        for b_idx, idx in enumerate(block_bins):
            if idx.size == 0:
                continue
            idx2 = idx[snp_mask[idx]]
            if idx2.size == 0:
                continue
            num_block_sums[b_idx] = float(np.nansum(num[idx2]))
            den_block_sums[b_idx] = float(idx2.size)

        res = _jackknife_ratio_from_block_sums(num_block_sums, den_block_sums)
        rows.append(
            {
                "a": pa,
                "b": pb,
                "c": pc,
                "d": dpop,
                "est": res.est,
                "se": res.se,
                "z": res.z,
                "p": res.p,
                "n_blocks": res.n_blocks,
                "n_snps": res.n_snps,
            }
        )

    return pd.DataFrame(rows)

Compute f4-statistics f4(a, b; c, d) with block-jackknife SE.

  • block_size is the number of SNPs per jackknife block (default 5000 SNPs). Ignored if blocks is provided.
  • If ancestry is provided, genotypes will be masked to the specified ancestry using LAI before aggregation.
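
Example (minimal sketch of an explicit f4 call, assuming the tuple input form; names and data are synthetic):

    import numpy as np
    from snputils.stats import f4

    rng = np.random.default_rng(2)
    pops = ["A", "B", "C", "D"]
    afs = rng.uniform(0.05, 0.95, size=(50_000, len(pops)))
    counts = np.full((50_000, len(pops)), 80)

    # One explicit quadruple f4(A, B; C, D); with independent random frequencies
    # the estimate should be consistent with zero.
    res = f4((afs, counts, pops), a=["A"], b=["B"], c=["C"], d=["D"])
    print(res[["a", "b", "c", "d", "est", "se", "z", "p"]])
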
def d_stat( data: Union[Any, Tuple[numpy.ndarray, numpy.ndarray, List[str]]], a: Optional[Sequence[str]] = None, b: Optional[Sequence[str]] = None, c: Optional[Sequence[str]] = None, d: Optional[Sequence[str]] = None, sample_labels: Optional[Sequence[str]] = None, block_size: int = 5000, blocks: Optional[numpy.ndarray] = None, ancestry: Union[int, str, NoneType] = None, laiobj: Optional[Any] = None) -> pandas.core.frame.DataFrame:
def d_stat(
    data: Union[Any, Tuple[np.ndarray, np.ndarray, List[str]]],
    a: Optional[Sequence[str]] = None,
    b: Optional[Sequence[str]] = None,
    c: Optional[Sequence[str]] = None,
    d: Optional[Sequence[str]] = None,
    sample_labels: Optional[Sequence[str]] = None,
    block_size: int = 5000,
    blocks: Optional[np.ndarray] = None,
    ancestry: Optional[Union[str, int]] = None,
    laiobj: Optional[Any] = None,
) -> pd.DataFrame:
    """
    Compute D-statistics D(a, b; c, d) as ratio of sums:
        D = sum_l (A-B)(C-D)  /  sum_l (A+B-2AB)(C+D-2CD)
    with delete-one block jackknife SE.

    - `block_size` is the number of SNPs per jackknife block (default 5000 SNPs). Ignored if `blocks` is provided.
    - If `ancestry` is provided, genotypes will be masked to the specified ancestry using LAI before aggregation.
    """
    afs, counts, pops = _prepare_inputs(data, sample_labels, ancestry=ancestry, laiobj=laiobj)
    n_snps, _ = afs.shape
    block_ids, block_lengths = _build_blocks(n_snps, blocks, block_size)
    n_blocks = block_lengths.size

    if a is None and b is None and c is None and d is None:
        quads = [(w, x, y, z) for w in pops for x in pops for y in pops for z in pops]
    else:
        if a is None or b is None or c is None or d is None:
            raise ValueError("a, b, c, d must all be provided if any is provided")
        if not (len(a) == len(b) == len(c) == len(d)):
            raise ValueError("a, b, c, d must have the same length")
        quads = list(zip(a, b, c, d))

    name_to_idx = {p: i for i, p in enumerate(pops)}
    block_bins = [np.where(block_ids == b)[0] for b in range(n_blocks)]
    rows: List[Dict[str, Union[str, float, int]]] = []

    for pa, pb, pc, dpop in quads:
        ia = name_to_idx[pa]
        ib = name_to_idx[pb]
        ic = name_to_idx[pc]
        id_ = name_to_idx[dpop]

        A = afs[:, ia]
        B = afs[:, ib]
        C = afs[:, ic]
        D = afs[:, id_]
        na = counts[:, ia].astype(float)
        nb = counts[:, ib].astype(float)
        nc = counts[:, ic].astype(float)
        nd = counts[:, id_].astype(float)

        with np.errstate(invalid="ignore"):
            num = (A - B) * (C - D)
            den = (A + B - 2 * A * B) * (C + D - 2 * C * D)

        snp_mask = np.isfinite(num) & np.isfinite(den) & (na > 0) & (nb > 0) & (nc > 0) & (nd > 0)

        num_block_sums = np.full(n_blocks, np.nan, dtype=float)
        den_block_sums = np.full(n_blocks, np.nan, dtype=float)
        ct_block_sums = np.zeros(n_blocks, dtype=int)
        for b_idx, idx in enumerate(block_bins):
            if idx.size == 0:
                continue
            idx2 = idx[snp_mask[idx]]
            if idx2.size == 0:
                continue
            num_block_sums[b_idx] = float(np.nansum(num[idx2]))
            # drop near-zero denominators within a block for stability
            block_den = float(np.nansum(den[idx2]))
            den_block_sums[b_idx] = block_den if abs(block_den) > 1e-12 else np.nan
            ct_block_sums[b_idx] = int(idx2.size)

        res = _jackknife_ratio_from_block_sums(num_block_sums, den_block_sums)
        rows.append(
            {
                "a": pa,
                "b": pb,
                "c": pc,
                "d": dpop,
                "est": res.est,
                "se": res.se,
                "z": res.z,
                "p": res.p,
                "n_blocks": res.n_blocks,
                "n_snps": int(ct_block_sums.sum()),
            }
        )

    return pd.DataFrame(rows)

Compute D-statistics D(a, b; c, d) as ratio of sums: D = sum_l (A-B)(C-D) / sum_l (A+B-2AB)(C+D-2CD) with delete-one block jackknife SE.

  • block_size is the number of SNPs per jackknife block (default 5000 SNPs). Ignored if blocks is provided.
  • If ancestry is provided, genotypes will be masked to the specified ancestry using LAI before aggregation.
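
Example (illustrative sketch with user-supplied jackknife blocks; it assumes the tuple input form and, per the docstring, that `blocks` is a per-SNP array of integer block ids):

    import numpy as np
    from snputils.stats import d_stat

    rng = np.random.default_rng(3)
    pops = ["P1", "P2", "P3", "Outgroup"]
    n_snps = 40_000
    afs = rng.uniform(0.05, 0.95, size=(n_snps, len(pops)))
    counts = np.full((n_snps, len(pops)), 50)

    # Explicit per-SNP block ids override block_size (here: 20 equal-sized blocks).
    blocks = np.repeat(np.arange(20), n_snps // 20)

    res = d_stat(
        (afs, counts, pops),
        a=["P1"], b=["P2"], c=["P3"], d=["Outgroup"],
        blocks=blocks,
    )
    print(res[["est", "se", "z", "p", "n_blocks", "n_snps"]])
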
def f4_ratio( data: Union[Any, Tuple[numpy.ndarray, numpy.ndarray, List[str]]], num: Sequence[Tuple[str, str, str, str]], den: Sequence[Tuple[str, str, str, str]], sample_labels: Optional[Sequence[str]] = None, block_size: int = 5000, blocks: Optional[numpy.ndarray] = None, ancestry: Union[int, str, NoneType] = None, laiobj: Optional[Any] = None) -> pandas.core.frame.DataFrame:
def f4_ratio(
    data: Union[Any, Tuple[np.ndarray, np.ndarray, List[str]]],
    num: Sequence[Tuple[str, str, str, str]],
    den: Sequence[Tuple[str, str, str, str]],
    sample_labels: Optional[Sequence[str]] = None,
    block_size: int = 5000,
    blocks: Optional[np.ndarray] = None,
    ancestry: Optional[Union[str, int]] = None,
    laiobj: Optional[Any] = None,
) -> pd.DataFrame:
    """
    Compute f4-ratio statistics as ratio of two f4-statistics with block-jackknife SE.

    Args:
        num: Sequence of quadruples (a, b, c, d) for numerator f4(a, b; c, d)
        den: Sequence of quadruples (a, b, c, d) for denominator f4(a, b; c, d)

    Notes:
        - `block_size` is the number of SNPs per jackknife block (default 5000 SNPs). Ignored if `blocks` is provided.
        - If `ancestry` is provided, genotypes will be masked to the specified ancestry using LAI before aggregation.
    """
    if len(num) != len(den):
        raise ValueError("'num' and 'den' must have the same length")

    afs, counts, pops = _prepare_inputs(data, sample_labels, ancestry=ancestry, laiobj=laiobj)
    n_snps, _ = afs.shape
    block_ids, block_lengths = _build_blocks(n_snps, blocks, block_size)
    n_blocks = block_lengths.size
    block_bins = [np.where(block_ids == b)[0] for b in range(n_blocks)]
    name_to_idx = {p: i for i, p in enumerate(pops)}

    rows: List[Dict[str, Union[str, float, int]]] = []
    for (na, nb, nc, nd), (da, db, dc, dd) in zip(num, den):
        ia, ib, ic, id_ = name_to_idx[na], name_to_idx[nb], name_to_idx[nc], name_to_idx[nd]
        ja, jb, jc, jd = name_to_idx[da], name_to_idx[db], name_to_idx[dc], name_to_idx[dd]

        A, B, C, D = afs[:, ia], afs[:, ib], afs[:, ic], afs[:, id_]
        E, F, G, H = afs[:, ja], afs[:, jb], afs[:, jc], afs[:, jd]
        nA, nB, nC, nD = counts[:, ia], counts[:, ib], counts[:, ic], counts[:, id_]
        nE, nF, nG, nH = counts[:, ja], counts[:, jb], counts[:, jc], counts[:, jd]

        with np.errstate(invalid="ignore"):
            num_snp = (A - B) * (C - D)
            den_snp = (E - F) * (G - H)
        mask_num = np.isfinite(num_snp) & (nA > 0) & (nB > 0) & (nC > 0) & (nD > 0)
        mask_den = np.isfinite(den_snp) & (nE > 0) & (nF > 0) & (nG > 0) & (nH > 0)
        mask_both = mask_num & mask_den

        num_block_sums = np.full(n_blocks, np.nan, dtype=float)
        den_block_sums = np.full(n_blocks, np.nan, dtype=float)
        ct_block_sums = np.zeros(n_blocks, dtype=int)
        for b_idx, idx in enumerate(block_bins):
            if idx.size == 0:
                continue
            idx2 = idx[mask_both[idx]]
            if idx2.size == 0:
                continue
            num_block_sums[b_idx] = float(np.nansum(num_snp[idx2]))
            block_den = float(np.nansum(den_snp[idx2]))
            den_block_sums[b_idx] = block_den if abs(block_den) > 1e-12 else np.nan
            ct_block_sums[b_idx] = int(idx2.size)

        res = _jackknife_ratio_from_block_sums(num_block_sums, den_block_sums)
        rows.append(
            {
                "num": f"({na},{nb};{nc},{nd})",
                "den": f"({da},{db};{dc},{dd})",
                "est": res.est,
                "se": res.se,
                "z": res.z,
                "p": res.p,
                "n_blocks": res.n_blocks,
                "n_snps": int(ct_block_sums.sum()),
            }
        )

    return pd.DataFrame(rows)

Compute f4-ratio statistics as ratio of two f4-statistics with block-jackknife SE.

Arguments:
  • num: Sequence of quadruples (a, b, c, d) for numerator f4(a, b; c, d)
  • den: Sequence of quadruples (a, b, c, d) for denominator f4(a, b; c, d)
Notes:
  • block_size is the number of SNPs per jackknife block (default 5000 SNPs). Ignored if blocks is provided.
  • If ancestry is provided, genotypes will be masked to the specified ancestry using LAI before aggregation.
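
Example (illustration only; the quadruples follow the num/den layout documented above, the population names are placeholders, and the data are synthetic):

    import numpy as np
    from snputils.stats import f4_ratio

    rng = np.random.default_rng(4)
    pops = ["A", "O", "X", "C", "B"]
    afs = rng.uniform(0.05, 0.95, size=(60_000, len(pops)))
    counts = np.full((60_000, len(pops)), 40)

    # Ratio of f4(A, O; X, C) over f4(A, O; B, C); each entry of num/den is one quadruple.
    res = f4_ratio(
        (afs, counts, pops),
        num=[("A", "O", "X", "C")],
        den=[("A", "O", "B", "C")],
    )
    print(res[["num", "den", "est", "se", "z"]])
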
def fst( data: Union[Any, Tuple[numpy.ndarray, numpy.ndarray, List[str]]], pop1: Optional[Sequence[str]] = None, pop2: Optional[Sequence[str]] = None, *, method: str = 'hudson', sample_labels: Optional[Sequence[str]] = None, block_size: int = 5000, blocks: Optional[numpy.ndarray] = None, ancestry: Union[int, str, NoneType] = None, laiobj: Optional[Any] = None, include_self: bool = False) -> pandas.core.frame.DataFrame:
def fst(
    data: Union[Any, Tuple[np.ndarray, np.ndarray, List[str]]],
    pop1: Optional[Sequence[str]] = None,
    pop2: Optional[Sequence[str]] = None,
    *,
    method: str = "hudson",
    sample_labels: Optional[Sequence[str]] = None,
    block_size: int = 5000,
    blocks: Optional[np.ndarray] = None,
    ancestry: Optional[Union[str, int]] = None,
    laiobj: Optional[Any] = None,
    include_self: bool = False,
) -> pd.DataFrame:
    """
    Pairwise F_ST with delete-one block jackknife SE.

    Methods:
      - 'hudson' (a.k.a. "ratio of averages" following Hudson 1992 / Bhatia 2013):
            per-SNP num = d_xy - 0.5*(pi_x + pi_y)
            per-SNP den = d_xy
        where d_xy = p_x*(1-p_y) + p_y*(1-p_x) and
              pi_x = 2*p_x*(1-p_x) * n_x/(n_x - 1) (unbiased within-pop diversity on haplotypes).
      - 'weir_cockerham' (Weir & Cockerham 1984; θ):
            compute per-SNP variance components a,b,c (for r=2) from allele freqs and haplotype counts:
                n1, n2 = haplotype counts; p1, p2 = allele freqs
                n  = n1 + n2
                n_bar = n / 2
                p_bar = (n1*p1 + n2*p2) / n
                s2    = (n1*(p1 - p_bar)**2 + n2*(p2 - p_bar)**2) / n_bar
                h1, h2 = 2*p1*(1-p1), 2*p2*(1-p2)
                h_bar = 0.5*(h1 + h2)
                n_c   = (n - (n1**2 + n2**2)/n)  # equals 2*n1*n2/n for r=2
                a = (n_bar / n_c) * (s2 - (p_bar*(1 - p_bar) - 0.5*s2 - 0.25*h_bar) / (n_bar - 1))
                b = (n_bar / (n_bar - 1)) * (p_bar*(1 - p_bar) - 0.5*s2 - ((2*n_bar - 1)/(4*n_bar))*h_bar)
                c = 0.5 * h_bar
            and use num=a, den=(a+b+c), then ratio-of-sums jackknife.

    Notes:
      * Inputs are the same as f2/f3/f4: either SNPObject or (afs, counts, pops).
      * For WC we use expected heterozygosity h_i = 2 p_i (1 - p_i) from allele freqs.
      * SNPs with n<=1 in either pop or with invalid denominators are ignored.
    """
    method = str(method).strip().lower().replace(" ", "_").replace("-", "_")
    if method in ("wc", "weir", "weir_cockerham", "weir-cockerham"):
        method = "weir_cockerham"
    elif method in ("h", "hudson", "bhatia", "ratio", "ratio_of_averages", "ratio-of-averages"):
        method = "hudson"
    elif method not in ("hudson", "weir_cockerham"):
        # only raise if it's neither a known alias nor a canonical name
        raise ValueError(f"Unknown method for fst: {method!r}")

    afs, counts, pops = _prepare_inputs(data, sample_labels, ancestry=ancestry, laiobj=laiobj)
    n_snps, n_pops = afs.shape
    block_ids, block_lengths = _build_blocks(n_snps, blocks, block_size)
    n_blocks = block_lengths.size

    if pop1 is None and pop2 is None:
        if include_self:
            pairs = [(pops[i], pops[j]) for i in range(n_pops) for j in range(i, n_pops)]
        else:
            pairs = [(pops[i], pops[j]) for i in range(n_pops) for j in range(i + 1, n_pops)]
    else:
        if pop1 is None or pop2 is None or len(pop1) != len(pop2):
            raise ValueError("pop1 and pop2 must both be provided and of equal length")
        pairs = list(zip(pop1, pop2))

    name_to_idx = {p: i for i, p in enumerate(pops)}
    block_bins = [np.where(block_ids == b)[0] for b in range(n_blocks)]

    out_rows: List[Dict[str, Union[str, float, int]]] = []

    for pA, pB in pairs:
        i = name_to_idx[pA]
        j = name_to_idx[pB]
        p1 = afs[:, i].astype(float)
        p2 = afs[:, j].astype(float)
        n1 = counts[:, i].astype(float)
        n2 = counts[:, j].astype(float)

        valid = np.isfinite(p1) & np.isfinite(p2)

        if method == "hudson":
            # d_xy and within-pop diversities (unbiased, haplotype-based)
            d_xy = p1 * (1.0 - p2) + p2 * (1.0 - p1)
            with np.errstate(divide="ignore", invalid="ignore"):
                pi1 = 2.0 * p1 * (1.0 - p1) * (n1 / (n1 - 1.0))
                pi2 = 2.0 * p2 * (1.0 - p2) * (n2 / (n2 - 1.0))
            num_snp = d_xy - 0.5 * (pi1 + pi2)
            den_snp = d_xy
            snp_mask = valid & (n1 > 1) & (n2 > 1) & np.isfinite(num_snp) & np.isfinite(den_snp)
        else:
            # Weir-Cockerham θ components (r=2)
            n = n1 + n2
            with np.errstate(divide="ignore", invalid="ignore"):
                n_bar = n / 2.0
                p_bar = np.where(n > 0, (n1 * p1 + n2 * p2) / n, np.nan)
                s2 = (n1 * (p1 - p_bar) ** 2 + n2 * (p2 - p_bar) ** 2) / n_bar
                h1 = 2.0 * p1 * (1.0 - p1)
                h2 = 2.0 * p2 * (1.0 - p2)
                h_bar = 0.5 * (h1 + h2)
                n_c = n - (n1 * n1 + n2 * n2) / np.where(n > 0, n, np.nan)  # == 2*n1*n2/n
                # components
                a = (n_bar / n_c) * (s2 - (p_bar * (1.0 - p_bar) - 0.5 * s2 - 0.25 * h_bar) / (n_bar - 1.0))
                b = (n_bar / (n_bar - 1.0)) * (p_bar * (1.0 - p_bar) - 0.5 * s2 - ((2.0 * n_bar - 1.0) / (4.0 * n_bar)) * h_bar)
                c = 0.5 * h_bar
                num_snp = a
                den_snp = a + b + c
            # Need at least 2 haplotypes per pop and well-defined denominators
            snp_mask = valid & (n1 > 1) & (n2 > 1) & np.isfinite(num_snp) & np.isfinite(den_snp)

        # Aggregate by blocks
        num_block_sums = np.full(n_blocks, np.nan, dtype=float)
        den_block_sums = np.full(n_blocks, np.nan, dtype=float)
        ct_block_sums = np.zeros(n_blocks, dtype=int)

        for b_idx, idx in enumerate(block_bins):
            if idx.size == 0:
                continue
            idx2 = idx[snp_mask[idx]]
            if idx2.size == 0:
                continue
            ns = float(np.nansum(num_snp[idx2]))
            ds = float(np.nansum(den_snp[idx2]))
            # drop numerically-zero denominators for stability
            den_block_sums[b_idx] = ds if abs(ds) > 1e-12 else np.nan
            num_block_sums[b_idx] = ns
            ct_block_sums[b_idx] = int(idx2.size)

        res = _jackknife_ratio_from_block_sums(num_block_sums, den_block_sums)
        out_rows.append(
            {
                "pop1": pA,
                "pop2": pB,
                "method": method,
                "est": res.est,
                "se": res.se,
                "z": res.z,
                "p": res.p,
                "n_blocks": res.n_blocks,
                "n_snps": int(ct_block_sums.sum()),
            }
        )

    return pd.DataFrame(out_rows)

Pairwise F_ST with delete-one block jackknife SE.

Methods:
  • 'hudson' (a.k.a. "ratio of averages" following Hudson 1992 / Bhatia 2013):
        per-SNP num = d_xy - 0.5*(pi_x + pi_y)
        per-SNP den = d_xy
    where d_xy = p_x*(1-p_y) + p_y*(1-p_x) and pi_x = 2*p_x*(1-p_x) * n_x/(n_x - 1) (unbiased within-pop diversity on haplotypes).
  • 'weir_cockerham' (Weir & Cockerham 1984; θ): compute per-SNP variance components a, b, c (for r=2) from allele freqs and haplotype counts:
        n1, n2 = haplotype counts; p1, p2 = allele freqs
        n     = n1 + n2
        n_bar = n / 2
        p_bar = (n1*p1 + n2*p2) / n
        s2    = (n1*(p1 - p_bar)**2 + n2*(p2 - p_bar)**2) / n_bar
        h1, h2 = 2*p1*(1-p1), 2*p2*(1-p2)
        h_bar = 0.5*(h1 + h2)
        n_c   = n - (n1**2 + n2**2)/n   # equals 2*n1*n2/n for r=2
        a = (n_bar / n_c) * (s2 - (p_bar*(1 - p_bar) - 0.5*s2 - 0.25*h_bar) / (n_bar - 1))
        b = (n_bar / (n_bar - 1)) * (p_bar*(1 - p_bar) - 0.5*s2 - ((2*n_bar - 1)/(4*n_bar))*h_bar)
        c = 0.5 * h_bar
    and use num=a, den=(a+b+c), then ratio-of-sums jackknife.
Notes:
  • Inputs are the same as f2/f3/f4: either SNPObject or (afs, counts, pops).
  • For WC we use expected heterozygosity h_i = 2 p_i (1 - p_i) from allele freqs.
  • SNPs with n<=1 in either pop or with invalid denominators are ignored.
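
Example (hedged sketch on the same kind of synthetic tuple input, comparing the default Hudson estimator with Weir-Cockerham; note that `method` is keyword-only):

    import numpy as np
    from snputils.stats import fst

    rng = np.random.default_rng(5)
    pops = ["POP1", "POP2", "POP3"]
    afs = rng.uniform(0.05, 0.95, size=(30_000, len(pops)))
    counts = np.full((30_000, len(pops)), 120)

    # Hudson (default) for all off-diagonal pairs.
    hudson = fst((afs, counts, pops))

    # Weir-Cockerham for a single explicit pair.
    wc = fst((afs, counts, pops), pop1=["POP1"], pop2=["POP2"], method="weir_cockerham")

    print(hudson[["pop1", "pop2", "method", "est", "se"]])
    print(wc[["pop1", "pop2", "method", "est", "se"]])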