snputils.stats
def
f2( data: Union[Any, Tuple[numpy.ndarray, numpy.ndarray, List[str]]], pop1: Optional[Sequence[str]] = None, pop2: Optional[Sequence[str]] = None, sample_labels: Optional[Sequence[str]] = None, apply_correction: bool = True, block_size: int = 5000, blocks: Optional[numpy.ndarray] = None, ancestry: Union[int, str, NoneType] = None, laiobj: Optional[Any] = None, include_self: bool = False) -> pandas.core.frame.DataFrame:
def f2(
    data: Union[Any, Tuple[np.ndarray, np.ndarray, List[str]]],
    pop1: Optional[Sequence[str]] = None,
    pop2: Optional[Sequence[str]] = None,
    sample_labels: Optional[Sequence[str]] = None,
    apply_correction: bool = True,
    block_size: int = 5000,
    blocks: Optional[np.ndarray] = None,
    ancestry: Optional[Union[str, int]] = None,
    laiobj: Optional[Any] = None,
    include_self: bool = False,
) -> pd.DataFrame:
    """
    Compute f2-statistics with block-jackknife standard errors.

    Args:
        data: Either a SNPObject or a tuple (afs, counts, pops), where `afs` and `counts` are arrays of
            shape (n_snps, n_pops). If a SNPObject is provided, `sample_labels` are used to aggregate samples to populations.
        pop1, pop2: Populations to compute f2 for.
            If None:
            - with include_self=False (default), compute only off-diagonal pairs i<j.
            - with include_self=True, compute all pairs including diagonals (i<=j).
        sample_labels: Population label per sample (aligned with SNPObject.samples) when `data` is a SNPObject.
        apply_correction: Apply small-sample correction p*(1-p)/(n-1) per population.
            When True, SNPs with n<=1 in either population are excluded at that SNP.
        block_size: Number of SNPs per jackknife block (default 5000 SNPs). Ignored if `blocks` is provided.
        blocks: Optional explicit block id per SNP. If provided, overrides `block_size`.
        ancestry: Optional ancestry code to mask genotypes to a specific ancestry before aggregation. Requires LAI.
        laiobj: Optional `LocalAncestryObject` used to derive SNP-level LAI if it is not already present on the SNPObject.

    Returns:
        Pandas DataFrame with columns: pop1, pop2, est, se, z, p, n_blocks, n_snps
    """
    afs, counts, pops = _prepare_inputs(data, sample_labels, ancestry=ancestry, laiobj=laiobj)
    n_snps, n_pops = afs.shape
    block_ids, block_lengths = _build_blocks(n_snps, blocks, block_size)
    n_blocks = block_lengths.size

    if pop1 is None and pop2 is None:
        if include_self:
            pop_pairs = [(pops[i], pops[j]) for i in range(n_pops) for j in range(i, n_pops)]
        else:
            pop_pairs = [(pops[i], pops[j]) for i in range(n_pops) for j in range(i + 1, n_pops)]
    else:
        if pop1 is None or pop2 is None:
            raise ValueError("Both pop1 and pop2 must be provided when one is provided")
        if len(pop1) != len(pop2):
            raise ValueError("pop1 and pop2 must have the same length")
        pop_pairs = list(zip(pop1, pop2))

    name_to_idx = {p: i for i, p in enumerate(pops)}

    rows: List[Dict[str, Union[str, float, int]]] = []
    # Precompute per-block index lists for efficiency
    block_bins = [np.where(block_ids == b)[0] for b in range(n_blocks)]

    for p1, p2 in pop_pairs:
        i = name_to_idx[p1]
        j = name_to_idx[p2]
        p_i = afs[:, i]
        p_j = afs[:, j]
        n_i = counts[:, i].astype(float)
        n_j = counts[:, j].astype(float)

        # per-SNP f2 with optional small-sample correction.
        # If apply_correction is True, SNPs with n<=1 for either population are excluded.
        with np.errstate(divide="ignore", invalid="ignore"):
            num = (p_i - p_j) ** 2
            if apply_correction:
                corr_i = np.where(n_i > 1, (p_i * (1.0 - p_i)) / (n_i - 1.0), np.nan)
                corr_j = np.where(n_j > 1, (p_j * (1.0 - p_j)) / (n_j - 1.0), np.nan)
                num = num - corr_i - corr_j
        # SNPs contribute only if the correction was defined (finite) and AFs present
        if apply_correction:
            snp_mask = np.isfinite(num)
        else:
            snp_mask = np.isfinite(num) & (n_i > 0) & (n_j > 0)

        # Aggregate per block: sums of num and counts. Use NaN to mark empty blocks.
        num_block_sums = np.full(n_blocks, np.nan, dtype=float)
        den_block_sums = np.full(n_blocks, np.nan, dtype=float)
        for b, idx in enumerate(block_bins):
            if idx.size == 0:
                continue
            idx2 = idx[snp_mask[idx]]
            if idx2.size == 0:
                continue
            num_block_sums[b] = float(np.nansum(num[idx2]))
            den_block_sums[b] = float(idx2.size)

        res = _jackknife_ratio_from_block_sums(num_block_sums, den_block_sums)
        rows.append(
            {
                "pop1": p1,
                "pop2": p2,
                "est": res.est,
                "se": res.se,
                "z": res.z,
                "p": res.p,
                "n_blocks": res.n_blocks,
                "n_snps": res.n_snps,
            }
        )

    return pd.DataFrame(rows)
Compute f2-statistics with block-jackknife standard errors.
Arguments:
- data: Either a SNPObject or a tuple (afs, counts, pops), where afs and counts are arrays of shape (n_snps, n_pops). If a SNPObject is provided, sample_labels are used to aggregate samples to populations.
- pop1, pop2: Populations to compute f2 for. If None:
  - with include_self=False (default), compute only off-diagonal pairs i<j.
  - with include_self=True, compute all pairs including diagonals (i<=j).
- sample_labels: Population label per sample (aligned with SNPObject.samples) when data is a SNPObject.
- apply_correction: Apply small-sample correction p*(1-p)/(n-1) per population. When True, SNPs with n<=1 in either population are excluded at that SNP.
- block_size: Number of SNPs per jackknife block (default 5000 SNPs). Ignored if blocks is provided.
- blocks: Optional explicit block id per SNP. If provided, overrides block_size.
- ancestry: Optional ancestry code to mask genotypes to a specific ancestry before aggregation. Requires LAI.
- laiobj: Optional LocalAncestryObject used to derive SNP-level LAI if it is not already present on the SNPObject.
Returns:
Pandas DataFrame with columns: pop1, pop2, est, se, z, p, n_blocks, n_snps
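A minimal usage sketch (not part of the library docs): it assumes f2 is importable from snputils.stats as shown on this page and that the tuple form (afs, counts, pops) described above is passed directly. Population labels, frequencies, and counts are placeholders.

import numpy as np
from snputils.stats import f2

rng = np.random.default_rng(42)
pops = ["POP_A", "POP_B", "POP_C"]                          # placeholder population labels
n_snps = 20_000
afs = rng.uniform(0.05, 0.95, size=(n_snps, len(pops)))     # allele frequencies, shape (n_snps, n_pops)
counts = np.full((n_snps, len(pops)), 50)                   # haplotype counts per SNP and population

# All off-diagonal pairs (include_self=False by default), 1000-SNP jackknife blocks
df = f2((afs, counts, pops), block_size=1000)
print(df[["pop1", "pop2", "est", "se", "z"]])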
def
f3( data: Union[Any, Tuple[numpy.ndarray, numpy.ndarray, List[str]]], target: Optional[Sequence[str]] = None, ref1: Optional[Sequence[str]] = None, ref2: Optional[Sequence[str]] = None, sample_labels: Optional[Sequence[str]] = None, apply_correction: bool = True, block_size: int = 5000, blocks: Optional[numpy.ndarray] = None, ancestry: Union[int, str, NoneType] = None, laiobj: Optional[Any] = None) -> pandas.core.frame.DataFrame:
def f3(
    data: Union[Any, Tuple[np.ndarray, np.ndarray, List[str]]],
    target: Optional[Sequence[str]] = None,
    ref1: Optional[Sequence[str]] = None,
    ref2: Optional[Sequence[str]] = None,
    sample_labels: Optional[Sequence[str]] = None,
    apply_correction: bool = True,
    block_size: int = 5000,
    blocks: Optional[np.ndarray] = None,
    ancestry: Optional[Union[str, int]] = None,
    laiobj: Optional[Any] = None,
) -> pd.DataFrame:
    """
    Compute f3-statistics f3(target; ref1, ref2) with block-jackknife SE.

    - `block_size` is the number of SNPs per jackknife block (default 5000 SNPs). Ignored if `blocks` is provided.
    - If `target`, `ref1`, and `ref2` are all None, compute f3 for all combinations where each role can be any population.
    - If `ancestry` is provided, genotypes will be masked to the specified ancestry using LAI before aggregation.
    - If `apply_correction` is True, subtract the finite sample term p_t*(1-p_t)/(n_t-1) from the per-SNP product.
      When True, SNPs with n_t<=1 are excluded.
    """
    afs, counts, pops = _prepare_inputs(data, sample_labels, ancestry=ancestry, laiobj=laiobj)
    n_snps, _ = afs.shape
    block_ids, block_lengths = _build_blocks(n_snps, blocks, block_size)
    n_blocks = block_lengths.size

    if target is None and ref1 is None and ref2 is None:
        triples = [(a, b, c) for a in pops for b in pops for c in pops]
    else:
        if target is None or ref1 is None or ref2 is None:
            raise ValueError("target, ref1, and ref2 must all be provided if any is provided")
        if not (len(target) == len(ref1) == len(ref2)):
            raise ValueError("target, ref1, ref2 must have the same length")
        triples = list(zip(target, ref1, ref2))

    name_to_idx = {p: i for i, p in enumerate(pops)}
    block_bins = [np.where(block_ids == b)[0] for b in range(n_blocks)]
    rows: List[Dict[str, Union[str, float, int]]] = []

    for t, r1, r2 in triples:
        it = name_to_idx[t]
        i1 = name_to_idx[r1]
        i2 = name_to_idx[r2]
        pt = afs[:, it]
        p1 = afs[:, i1]
        p2 = afs[:, i2]
        nt = counts[:, it].astype(float)
        n1 = counts[:, i1].astype(float)
        n2 = counts[:, i2].astype(float)

        with np.errstate(invalid="ignore", divide="ignore"):
            num = (pt - p1) * (pt - p2)
            if apply_correction:
                corr_t = np.where(nt > 1, (pt * (1.0 - pt)) / (nt - 1.0), np.nan)
                num = num - corr_t
        if apply_correction:
            # Require a valid correction on the target, references can be n>0
            snp_mask = np.isfinite(num) & (n1 > 0) & (n2 > 0)
        else:
            snp_mask = np.isfinite(num) & (nt > 0) & (n1 > 0) & (n2 > 0)

        num_block_sums = np.full(n_blocks, np.nan, dtype=float)
        den_block_sums = np.full(n_blocks, np.nan, dtype=float)
        for b, idx in enumerate(block_bins):
            if idx.size == 0:
                continue
            idx2 = idx[snp_mask[idx]]
            if idx2.size == 0:
                continue
            num_block_sums[b] = float(np.nansum(num[idx2]))
            den_block_sums[b] = float(idx2.size)

        res = _jackknife_ratio_from_block_sums(num_block_sums, den_block_sums)
        rows.append(
            {
                "target": t,
                "ref1": r1,
                "ref2": r2,
                "est": res.est,
                "se": res.se,
                "z": res.z,
                "p": res.p,
                "n_blocks": res.n_blocks,
                "n_snps": res.n_snps,
            }
        )

    return pd.DataFrame(rows)
Compute f3-statistics f3(target; ref1, ref2) with block-jackknife SE.
- block_size is the number of SNPs per jackknife block (default 5000 SNPs). Ignored if blocks is provided.
- If target, ref1, and ref2 are all None, compute f3 for all combinations where each role can be any population.
- If ancestry is provided, genotypes will be masked to the specified ancestry using LAI before aggregation.
- If apply_correction is True, subtract the finite sample term p_t*(1-p_t)/(n_t-1) from the per-SNP product. When True, SNPs with n_t<=1 are excluded.
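A minimal sketch of a single explicit f3 call, under the same assumptions as the f2 example above (tuple input, placeholder labels and random frequencies):

import numpy as np
from snputils.stats import f3

rng = np.random.default_rng(0)
pops = ["TARGET", "REF1", "REF2"]                           # placeholder labels
afs = rng.uniform(0.05, 0.95, size=(10_000, len(pops)))
counts = np.full((10_000, len(pops)), 40)

# One explicit triple f3(TARGET; REF1, REF2); target/ref1/ref2 must have equal length
df = f3((afs, counts, pops), target=["TARGET"], ref1=["REF1"], ref2=["REF2"], block_size=500)
print(df.loc[0, ["target", "ref1", "ref2", "est", "z"]])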
def
f4( data: Union[Any, Tuple[numpy.ndarray, numpy.ndarray, List[str]]], a: Optional[Sequence[str]] = None, b: Optional[Sequence[str]] = None, c: Optional[Sequence[str]] = None, d: Optional[Sequence[str]] = None, sample_labels: Optional[Sequence[str]] = None, block_size: int = 5000, blocks: Optional[numpy.ndarray] = None, ancestry: Union[int, str, NoneType] = None, laiobj: Optional[Any] = None) -> pandas.core.frame.DataFrame:
def f4(
    data: Union[Any, Tuple[np.ndarray, np.ndarray, List[str]]],
    a: Optional[Sequence[str]] = None,
    b: Optional[Sequence[str]] = None,
    c: Optional[Sequence[str]] = None,
    d: Optional[Sequence[str]] = None,
    sample_labels: Optional[Sequence[str]] = None,
    block_size: int = 5000,
    blocks: Optional[np.ndarray] = None,
    ancestry: Optional[Union[str, int]] = None,
    laiobj: Optional[Any] = None,
) -> pd.DataFrame:
    """
    Compute f4-statistics f4(a, b; c, d) with block-jackknife SE.

    - `block_size` is the number of SNPs per jackknife block (default 5000 SNPs). Ignored if `blocks` is provided.
    - If `ancestry` is provided, genotypes will be masked to the specified ancestry using LAI before aggregation.
    """
    afs, counts, pops = _prepare_inputs(data, sample_labels, ancestry=ancestry, laiobj=laiobj)
    n_snps, _ = afs.shape
    block_ids, block_lengths = _build_blocks(n_snps, blocks, block_size)
    n_blocks = block_lengths.size

    if a is None and b is None and c is None and d is None:
        quads = [(w, x, y, z) for w in pops for x in pops for y in pops for z in pops]
    else:
        if a is None or b is None or c is None or d is None:
            raise ValueError("a, b, c, d must all be provided if any is provided")
        if not (len(a) == len(b) == len(c) == len(d)):
            raise ValueError("a, b, c, d must have the same length")
        quads = list(zip(a, b, c, d))

    name_to_idx = {p: i for i, p in enumerate(pops)}
    block_bins = [np.where(block_ids == b)[0] for b in range(n_blocks)]
    rows: List[Dict[str, Union[str, float, int]]] = []

    for pa, pb, pc, dpop in quads:
        ia = name_to_idx[pa]
        ib = name_to_idx[pb]
        ic = name_to_idx[pc]
        id_ = name_to_idx[dpop]

        A = afs[:, ia]
        B = afs[:, ib]
        C = afs[:, ic]
        D = afs[:, id_]
        na = counts[:, ia].astype(float)
        nb = counts[:, ib].astype(float)
        nc = counts[:, ic].astype(float)
        nd = counts[:, id_].astype(float)

        with np.errstate(invalid="ignore"):
            num = (A - B) * (C - D)
        snp_mask = np.isfinite(num) & (na > 0) & (nb > 0) & (nc > 0) & (nd > 0)

        num_block_sums = np.full(n_blocks, np.nan, dtype=float)
        den_block_sums = np.full(n_blocks, np.nan, dtype=float)
        for b_idx, idx in enumerate(block_bins):
            if idx.size == 0:
                continue
            idx2 = idx[snp_mask[idx]]
            if idx2.size == 0:
                continue
            num_block_sums[b_idx] = float(np.nansum(num[idx2]))
            den_block_sums[b_idx] = float(idx2.size)

        res = _jackknife_ratio_from_block_sums(num_block_sums, den_block_sums)
        rows.append(
            {
                "a": pa,
                "b": pb,
                "c": pc,
                "d": dpop,
                "est": res.est,
                "se": res.se,
                "z": res.z,
                "p": res.p,
                "n_blocks": res.n_blocks,
                "n_snps": res.n_snps,
            }
        )

    return pd.DataFrame(rows)
Compute f4-statistics f4(a, b; c, d) with block-jackknife SE.
- block_size is the number of SNPs per jackknife block (default 5000 SNPs). Ignored if blocks is provided.
- If ancestry is provided, genotypes will be masked to the specified ancestry using LAI before aggregation.
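A minimal sketch of one explicit f4 configuration, under the same assumptions as the examples above (tuple input, placeholder labels):

import numpy as np
from snputils.stats import f4

rng = np.random.default_rng(1)
pops = ["A", "B", "C", "O"]                                 # placeholder labels; "O" stands in for an outgroup
afs = rng.uniform(0.05, 0.95, size=(10_000, len(pops)))
counts = np.full((10_000, len(pops)), 60)

# Single configuration f4(A, B; C, O); a/b/c/d must be equal-length sequences
df = f4((afs, counts, pops), a=["A"], b=["B"], c=["C"], d=["O"], block_size=500)
print(df[["a", "b", "c", "d", "est", "se", "z", "p"]])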
def
d_stat( data: Union[Any, Tuple[numpy.ndarray, numpy.ndarray, List[str]]], a: Optional[Sequence[str]] = None, b: Optional[Sequence[str]] = None, c: Optional[Sequence[str]] = None, d: Optional[Sequence[str]] = None, sample_labels: Optional[Sequence[str]] = None, block_size: int = 5000, blocks: Optional[numpy.ndarray] = None, ancestry: Union[int, str, NoneType] = None, laiobj: Optional[Any] = None) -> pandas.core.frame.DataFrame:
def d_stat(
    data: Union[Any, Tuple[np.ndarray, np.ndarray, List[str]]],
    a: Optional[Sequence[str]] = None,
    b: Optional[Sequence[str]] = None,
    c: Optional[Sequence[str]] = None,
    d: Optional[Sequence[str]] = None,
    sample_labels: Optional[Sequence[str]] = None,
    block_size: int = 5000,
    blocks: Optional[np.ndarray] = None,
    ancestry: Optional[Union[str, int]] = None,
    laiobj: Optional[Any] = None,
) -> pd.DataFrame:
    """
    Compute D-statistics D(a, b; c, d) as ratio of sums:
        D = sum_l (A-B)(C-D) / sum_l (A+B-2AB)(C+D-2CD)
    with delete-one block jackknife SE.

    - `block_size` is the number of SNPs per jackknife block (default 5000 SNPs). Ignored if `blocks` is provided.
    - If `ancestry` is provided, genotypes will be masked to the specified ancestry using LAI before aggregation.
    """
    afs, counts, pops = _prepare_inputs(data, sample_labels, ancestry=ancestry, laiobj=laiobj)
    n_snps, _ = afs.shape
    block_ids, block_lengths = _build_blocks(n_snps, blocks, block_size)
    n_blocks = block_lengths.size

    if a is None and b is None and c is None and d is None:
        quads = [(w, x, y, z) for w in pops for x in pops for y in pops for z in pops]
    else:
        if a is None or b is None or c is None or d is None:
            raise ValueError("a, b, c, d must all be provided if any is provided")
        if not (len(a) == len(b) == len(c) == len(d)):
            raise ValueError("a, b, c, d must have the same length")
        quads = list(zip(a, b, c, d))

    name_to_idx = {p: i for i, p in enumerate(pops)}
    block_bins = [np.where(block_ids == b)[0] for b in range(n_blocks)]
    rows: List[Dict[str, Union[str, float, int]]] = []

    for pa, pb, pc, dpop in quads:
        ia = name_to_idx[pa]
        ib = name_to_idx[pb]
        ic = name_to_idx[pc]
        id_ = name_to_idx[dpop]

        A = afs[:, ia]
        B = afs[:, ib]
        C = afs[:, ic]
        D = afs[:, id_]
        na = counts[:, ia].astype(float)
        nb = counts[:, ib].astype(float)
        nc = counts[:, ic].astype(float)
        nd = counts[:, id_].astype(float)

        with np.errstate(invalid="ignore"):
            num = (A - B) * (C - D)
            den = (A + B - 2 * A * B) * (C + D - 2 * C * D)

        snp_mask = np.isfinite(num) & np.isfinite(den) & (na > 0) & (nb > 0) & (nc > 0) & (nd > 0)

        num_block_sums = np.full(n_blocks, np.nan, dtype=float)
        den_block_sums = np.full(n_blocks, np.nan, dtype=float)
        ct_block_sums = np.zeros(n_blocks, dtype=int)
        for b_idx, idx in enumerate(block_bins):
            if idx.size == 0:
                continue
            idx2 = idx[snp_mask[idx]]
            if idx2.size == 0:
                continue
            num_block_sums[b_idx] = float(np.nansum(num[idx2]))
            # drop near-zero denominators within a block for stability
            block_den = float(np.nansum(den[idx2]))
            den_block_sums[b_idx] = block_den if abs(block_den) > 1e-12 else np.nan
            ct_block_sums[b_idx] = int(idx2.size)

        res = _jackknife_ratio_from_block_sums(num_block_sums, den_block_sums)
        rows.append(
            {
                "a": pa,
                "b": pb,
                "c": pc,
                "d": dpop,
                "est": res.est,
                "se": res.se,
                "z": res.z,
                "p": res.p,
                "n_blocks": res.n_blocks,
                "n_snps": int(ct_block_sums.sum()),
            }
        )

    return pd.DataFrame(rows)
Compute D-statistics D(a, b; c, d) as ratio of sums:
    D = sum_l (A-B)(C-D) / sum_l (A+B-2AB)(C+D-2CD)
with delete-one block jackknife SE.
- block_size is the number of SNPs per jackknife block (default 5000 SNPs). Ignored if blocks is provided.
- If ancestry is provided, genotypes will be masked to the specified ancestry using LAI before aggregation.
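A minimal sketch of a single D-statistic call, again assuming the tuple input form with placeholder labels and data:

import numpy as np
from snputils.stats import d_stat

rng = np.random.default_rng(2)
pops = ["P1", "P2", "P3", "OUT"]                            # placeholder labels
afs = rng.uniform(0.05, 0.95, size=(10_000, len(pops)))
counts = np.full((10_000, len(pops)), 60)

# D(P1, P2; P3, OUT); a |z| around 3 or more is commonly read as evidence against treeness
df = d_stat((afs, counts, pops), a=["P1"], b=["P2"], c=["P3"], d=["OUT"], block_size=500)
print(df[["a", "b", "c", "d", "est", "z", "p", "n_snps"]])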
def
f4_ratio( data: Union[Any, Tuple[numpy.ndarray, numpy.ndarray, List[str]]], num: Sequence[Tuple[str, str, str, str]], den: Sequence[Tuple[str, str, str, str]], sample_labels: Optional[Sequence[str]] = None, block_size: int = 5000, blocks: Optional[numpy.ndarray] = None, ancestry: Union[int, str, NoneType] = None, laiobj: Optional[Any] = None) -> pandas.core.frame.DataFrame:
def f4_ratio(
    data: Union[Any, Tuple[np.ndarray, np.ndarray, List[str]]],
    num: Sequence[Tuple[str, str, str, str]],
    den: Sequence[Tuple[str, str, str, str]],
    sample_labels: Optional[Sequence[str]] = None,
    block_size: int = 5000,
    blocks: Optional[np.ndarray] = None,
    ancestry: Optional[Union[str, int]] = None,
    laiobj: Optional[Any] = None,
) -> pd.DataFrame:
    """
    Compute f4-ratio statistics as ratio of two f4-statistics with block-jackknife SE.

    Args:
        num: Sequence of quadruples (a, b, c, d) for numerator f4(a, b; c, d)
        den: Sequence of quadruples (a, b, c, d) for denominator f4(a, b; c, d)

    Notes:
        - `block_size` is the number of SNPs per jackknife block (default 5000 SNPs). Ignored if `blocks` is provided.
        - If `ancestry` is provided, genotypes will be masked to the specified ancestry using LAI before aggregation.
    """
    if len(num) != len(den):
        raise ValueError("'num' and 'den' must have the same length")

    afs, counts, pops = _prepare_inputs(data, sample_labels, ancestry=ancestry, laiobj=laiobj)
    n_snps, _ = afs.shape
    block_ids, block_lengths = _build_blocks(n_snps, blocks, block_size)
    n_blocks = block_lengths.size
    block_bins = [np.where(block_ids == b)[0] for b in range(n_blocks)]
    name_to_idx = {p: i for i, p in enumerate(pops)}

    rows: List[Dict[str, Union[str, float, int]]] = []
    for (na, nb, nc, nd), (da, db, dc, dd) in zip(num, den):
        ia, ib, ic, id_ = name_to_idx[na], name_to_idx[nb], name_to_idx[nc], name_to_idx[nd]
        ja, jb, jc, jd = name_to_idx[da], name_to_idx[db], name_to_idx[dc], name_to_idx[dd]

        A, B, C, D = afs[:, ia], afs[:, ib], afs[:, ic], afs[:, id_]
        E, F, G, H = afs[:, ja], afs[:, jb], afs[:, jc], afs[:, jd]
        nA, nB, nC, nD = counts[:, ia], counts[:, ib], counts[:, ic], counts[:, id_]
        nE, nF, nG, nH = counts[:, ja], counts[:, jb], counts[:, jc], counts[:, jd]

        with np.errstate(invalid="ignore"):
            num_snp = (A - B) * (C - D)
            den_snp = (E - F) * (G - H)
        mask_num = np.isfinite(num_snp) & (nA > 0) & (nB > 0) & (nC > 0) & (nD > 0)
        mask_den = np.isfinite(den_snp) & (nE > 0) & (nF > 0) & (nG > 0) & (nH > 0)
        mask_both = mask_num & mask_den

        num_block_sums = np.full(n_blocks, np.nan, dtype=float)
        den_block_sums = np.full(n_blocks, np.nan, dtype=float)
        ct_block_sums = np.zeros(n_blocks, dtype=int)
        for b_idx, idx in enumerate(block_bins):
            if idx.size == 0:
                continue
            idx2 = idx[mask_both[idx]]
            if idx2.size == 0:
                continue
            num_block_sums[b_idx] = float(np.nansum(num_snp[idx2]))
            block_den = float(np.nansum(den_snp[idx2]))
            den_block_sums[b_idx] = block_den if abs(block_den) > 1e-12 else np.nan
            ct_block_sums[b_idx] = int(idx2.size)

        res = _jackknife_ratio_from_block_sums(num_block_sums, den_block_sums)
        rows.append(
            {
                "num": f"({na},{nb};{nc},{nd})",
                "den": f"({da},{db};{dc},{dd})",
                "est": res.est,
                "se": res.se,
                "z": res.z,
                "p": res.p,
                "n_blocks": res.n_blocks,
                "n_snps": int(ct_block_sums.sum()),
            }
        )

    return pd.DataFrame(rows)
Compute f4-ratio statistics as ratio of two f4-statistics with block-jackknife SE.
Arguments:
- num: Sequence of quadruples (a, b, c, d) for numerator f4(a, b; c, d)
- den: Sequence of quadruples (a, b, c, d) for denominator f4(a, b; c, d)
Notes:
- block_size is the number of SNPs per jackknife block (default 5000 SNPs). Ignored if blocks is provided.
- If ancestry is provided, genotypes will be masked to the specified ancestry using LAI before aggregation.
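A minimal sketch of an f4-ratio call, under the same tuple-input assumptions; the quadruple layout below is one common admixture-proportion arrangement and is only illustrative, since the function accepts any quadruples the caller supplies:

import numpy as np
from snputils.stats import f4_ratio

rng = np.random.default_rng(3)
pops = ["A", "B", "X", "C", "O"]                            # placeholder labels
afs = rng.uniform(0.05, 0.95, size=(10_000, len(pops)))
counts = np.full((10_000, len(pops)), 60)

# Ratio f4(A, O; X, C) / f4(A, O; B, C); num and den must have the same length
df = f4_ratio(
    (afs, counts, pops),
    num=[("A", "O", "X", "C")],
    den=[("A", "O", "B", "C")],
    block_size=500,
)
print(df[["num", "den", "est", "se", "z"]])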
def
fst( data: Union[Any, Tuple[numpy.ndarray, numpy.ndarray, List[str]]], pop1: Optional[Sequence[str]] = None, pop2: Optional[Sequence[str]] = None, *, method: str = 'hudson', sample_labels: Optional[Sequence[str]] = None, block_size: int = 5000, blocks: Optional[numpy.ndarray] = None, ancestry: Union[int, str, NoneType] = None, laiobj: Optional[Any] = None, include_self: bool = False) -> pandas.core.frame.DataFrame:
def fst(
    data: Union[Any, Tuple[np.ndarray, np.ndarray, List[str]]],
    pop1: Optional[Sequence[str]] = None,
    pop2: Optional[Sequence[str]] = None,
    *,
    method: str = "hudson",
    sample_labels: Optional[Sequence[str]] = None,
    block_size: int = 5000,
    blocks: Optional[np.ndarray] = None,
    ancestry: Optional[Union[str, int]] = None,
    laiobj: Optional[Any] = None,
    include_self: bool = False,
) -> pd.DataFrame:
    """
    Pairwise F_ST with delete-one block jackknife SE.

    Methods:
        - 'hudson' (a.k.a. "ratio of averages" following Hudson 1992 / Bhatia 2013):
            per-SNP num = d_xy - 0.5*(pi_x + pi_y)
            per-SNP den = d_xy
            where d_xy = p_x*(1-p_y) + p_y*(1-p_x) and
            pi_x = 2*p_x*(1-p_x) * n_x/(n_x - 1) (unbiased within-pop diversity on haplotypes).
        - 'weir_cockerham' (Weir & Cockerham 1984; θ):
            compute per-SNP variance components a,b,c (for r=2) from allele freqs and haplotype counts:
                n1, n2 = haplotype counts; p1, p2 = allele freqs
                n = n1 + n2
                n_bar = n / 2
                p_bar = (n1*p1 + n2*p2) / n
                s2 = (n1*(p1 - p_bar)**2 + n2*(p2 - p_bar)**2) / n_bar
                h1, h2 = 2*p1*(1-p1), 2*p2*(1-p2)
                h_bar = 0.5*(h1 + h2)
                n_c = (n - (n1**2 + n2**2)/n)  # equals 2*n1*n2/n for r=2
                a = (n_bar / n_c) * (s2 - (p_bar*(1 - p_bar) - 0.5*s2 - 0.25*h_bar) / (n_bar - 1))
                b = (n_bar / (n_bar - 1)) * (p_bar*(1 - p_bar) - 0.5*s2 - ((2*n_bar - 1)/(4*n_bar))*h_bar)
                c = 0.5 * h_bar
            and use num=a, den=(a+b+c), then ratio-of-sums jackknife.

    Notes:
        * Inputs are the same as f2/f3/f4: either SNPObject or (afs, counts, pops).
        * For WC we use expected heterozygosity h_i = 2 p_i (1 - p_i) from allele freqs.
        * SNPs with n<=1 in either pop or with invalid denominators are ignored.
    """
    method = str(method).strip().lower().replace(" ", "_").replace("-", "_")
    if method in ("wc", "weir", "weir_cockerham", "weir-cockerham"):
        method = "weir_cockerham"
    elif method in ("h", "hudson", "bhatia", "ratio", "ratio_of_averages", "ratio-of-averages"):
        method = "hudson"
    elif method not in ("hudson", "weir_cockerham"):
        # only raise if it's neither a known alias nor a canonical name
        raise ValueError(f"Unknown method for fst: {method!r}")

    afs, counts, pops = _prepare_inputs(data, sample_labels, ancestry=ancestry, laiobj=laiobj)
    n_snps, n_pops = afs.shape
    block_ids, block_lengths = _build_blocks(n_snps, blocks, block_size)
    n_blocks = block_lengths.size

    if pop1 is None and pop2 is None:
        if include_self:
            pairs = [(pops[i], pops[j]) for i in range(n_pops) for j in range(i, n_pops)]
        else:
            pairs = [(pops[i], pops[j]) for i in range(n_pops) for j in range(i + 1, n_pops)]
    else:
        if pop1 is None or pop2 is None or len(pop1) != len(pop2):
            raise ValueError("pop1 and pop2 must both be provided and of equal length")
        pairs = list(zip(pop1, pop2))

    name_to_idx = {p: i for i, p in enumerate(pops)}
    block_bins = [np.where(block_ids == b)[0] for b in range(n_blocks)]

    out_rows: List[Dict[str, Union[str, float, int]]] = []

    for pA, pB in pairs:
        i = name_to_idx[pA]
        j = name_to_idx[pB]
        p1 = afs[:, i].astype(float)
        p2 = afs[:, j].astype(float)
        n1 = counts[:, i].astype(float)
        n2 = counts[:, j].astype(float)

        valid = np.isfinite(p1) & np.isfinite(p2)

        if method == "hudson":
            # d_xy and within-pop diversities (unbiased, haplotype-based)
            d_xy = p1 * (1.0 - p2) + p2 * (1.0 - p1)
            with np.errstate(divide="ignore", invalid="ignore"):
                pi1 = 2.0 * p1 * (1.0 - p1) * (n1 / (n1 - 1.0))
                pi2 = 2.0 * p2 * (1.0 - p2) * (n2 / (n2 - 1.0))
            num_snp = d_xy - 0.5 * (pi1 + pi2)
            den_snp = d_xy
            snp_mask = valid & (n1 > 1) & (n2 > 1) & np.isfinite(num_snp) & np.isfinite(den_snp)
        else:
            # Weir-Cockerham θ components (r=2)
            n = n1 + n2
            with np.errstate(divide="ignore", invalid="ignore"):
                n_bar = n / 2.0
                p_bar = np.where(n > 0, (n1 * p1 + n2 * p2) / n, np.nan)
                s2 = (n1 * (p1 - p_bar) ** 2 + n2 * (p2 - p_bar) ** 2) / n_bar
                h1 = 2.0 * p1 * (1.0 - p1)
                h2 = 2.0 * p2 * (1.0 - p2)
                h_bar = 0.5 * (h1 + h2)
                n_c = n - (n1 * n1 + n2 * n2) / np.where(n > 0, n, np.nan)  # == 2*n1*n2/n
                # components
                a = (n_bar / n_c) * (s2 - (p_bar * (1.0 - p_bar) - 0.5 * s2 - 0.25 * h_bar) / (n_bar - 1.0))
                b = (n_bar / (n_bar - 1.0)) * (p_bar * (1.0 - p_bar) - 0.5 * s2 - ((2.0 * n_bar - 1.0) / (4.0 * n_bar)) * h_bar)
                c = 0.5 * h_bar
            num_snp = a
            den_snp = a + b + c
            # Need at least 2 haplotypes per pop and well-defined denominators
            snp_mask = valid & (n1 > 1) & (n2 > 1) & np.isfinite(num_snp) & np.isfinite(den_snp)

        # Aggregate by blocks
        num_block_sums = np.full(n_blocks, np.nan, dtype=float)
        den_block_sums = np.full(n_blocks, np.nan, dtype=float)
        ct_block_sums = np.zeros(n_blocks, dtype=int)

        for b_idx, idx in enumerate(block_bins):
            if idx.size == 0:
                continue
            idx2 = idx[snp_mask[idx]]
            if idx2.size == 0:
                continue
            ns = float(np.nansum(num_snp[idx2]))
            ds = float(np.nansum(den_snp[idx2]))
            # drop numerically-zero denominators for stability
            den_block_sums[b_idx] = ds if abs(ds) > 1e-12 else np.nan
            num_block_sums[b_idx] = ns
            ct_block_sums[b_idx] = int(idx2.size)

        res = _jackknife_ratio_from_block_sums(num_block_sums, den_block_sums)
        out_rows.append(
            {
                "pop1": pA,
                "pop2": pB,
                "method": method,
                "est": res.est,
                "se": res.se,
                "z": res.z,
                "p": res.p,
                "n_blocks": res.n_blocks,
                "n_snps": int(ct_block_sums.sum()),
            }
        )

    return pd.DataFrame(out_rows)
Pairwise F_ST with delete-one block jackknife SE.
Methods:
- 'hudson' (a.k.a. "ratio of averages" following Hudson 1992 / Bhatia 2013):
    per-SNP num = d_xy - 0.5*(pi_x + pi_y)
    per-SNP den = d_xy
  where d_xy = p_x*(1-p_y) + p_y*(1-p_x) and
  pi_x = 2*p_x*(1-p_x) * n_x/(n_x - 1) (unbiased within-pop diversity on haplotypes).
- 'weir_cockerham' (Weir & Cockerham 1984; θ): compute per-SNP variance components a, b, c (for r=2) from allele freqs and haplotype counts:
    n1, n2 = haplotype counts; p1, p2 = allele freqs
    n = n1 + n2
    n_bar = n / 2
    p_bar = (n1*p1 + n2*p2) / n
    s2 = (n1*(p1 - p_bar)**2 + n2*(p2 - p_bar)**2) / n_bar
    h1, h2 = 2*p1*(1-p1), 2*p2*(1-p2)
    h_bar = 0.5*(h1 + h2)
    n_c = n - (n1**2 + n2**2)/n    (equals 2*n1*n2/n for r=2)
    a = (n_bar / n_c) * (s2 - (p_bar*(1 - p_bar) - 0.5*s2 - 0.25*h_bar) / (n_bar - 1))
    b = (n_bar / (n_bar - 1)) * (p_bar*(1 - p_bar) - 0.5*s2 - ((2*n_bar - 1)/(4*n_bar))*h_bar)
    c = 0.5 * h_bar
  and use num=a, den=(a+b+c), then ratio-of-sums jackknife.
Notes:
- Inputs are the same as f2/f3/f4: either SNPObject or (afs, counts, pops).
- For WC we use expected heterozygosity h_i = 2 p_i (1 - p_i) from allele freqs.
- SNPs with n<=1 in either pop or with invalid denominators are ignored.
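A minimal sketch comparing the two methods, under the same assumptions as the earlier examples (tuple input, placeholder labels and data); note that method is keyword-only and that 'wc' is accepted as an alias for 'weir_cockerham':

import numpy as np
from snputils.stats import fst

rng = np.random.default_rng(4)
pops = ["POP_A", "POP_B", "POP_C"]                          # placeholder labels
afs = rng.uniform(0.05, 0.95, size=(20_000, len(pops)))
counts = np.full((20_000, len(pops)), 50)

# All off-diagonal pairs with the Hudson/Bhatia estimator, 1000-SNP blocks
hudson = fst((afs, counts, pops), method="hudson", block_size=1000)

# A single explicit pair with the Weir-Cockerham estimator (via the 'wc' alias)
wc = fst((afs, counts, pops), pop1=["POP_A"], pop2=["POP_B"], method="wc")

print(hudson[["pop1", "pop2", "method", "est", "se"]])
print(wc[["pop1", "pop2", "method", "est", "se"]])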