snputils.stats
def
f2( data: Union[Any, Tuple[numpy.ndarray, numpy.ndarray, List[str]]], pop1: Optional[Sequence[str]] = None, pop2: Optional[Sequence[str]] = None, sample_labels: Optional[Sequence[str]] = None, apply_correction: bool = True, block_size: int = 5000, blocks: Optional[numpy.ndarray] = None, ancestry: Union[int, str, NoneType] = None, laiobj: Optional[Any] = None, include_self: bool = False) -> pandas.core.frame.DataFrame:
def f2(
    data: Union[Any, Tuple[np.ndarray, np.ndarray, List[str]]],
    pop1: Optional[Sequence[str]] = None,
    pop2: Optional[Sequence[str]] = None,
    sample_labels: Optional[Sequence[str]] = None,
    apply_correction: bool = True,
    block_size: int = 5000,
    blocks: Optional[np.ndarray] = None,
    ancestry: Optional[Union[str, int]] = None,
    laiobj: Optional[Any] = None,
    include_self: bool = False,
) -> pd.DataFrame:
    """
    Compute f2-statistics with block-jackknife standard errors.

    Args:
        data: Either a SNPObject or a tuple (afs, counts, pops), where `afs` and `counts` are arrays of
            shape (n_snps, n_pops). If a SNPObject is provided, `sample_labels` are used to aggregate samples to populations.
        pop1, pop2: Populations to compute f2 for.
            If None:
            - with include_self=False (default), compute only off-diagonal pairs i<j.
            - with include_self=True, compute all pairs including diagonals (i<=j).
        sample_labels: Population label per sample (aligned with SNPObject.samples) when `data` is a SNPObject.
        apply_correction: Apply small-sample correction p*(1-p)/(n-1) per population.
            When True, SNPs with n<=1 in either population are excluded at that SNP.
        block_size: Number of SNPs per jackknife block (default 5000 SNPs). Ignored if `blocks` is provided.
        blocks: Optional explicit block id per SNP. If provided, overrides `block_size`.
        ancestry: Optional ancestry code to mask genotypes to a specific ancestry before aggregation. Requires LAI.
        laiobj: Optional `LocalAncestryObject` used to derive SNP-level LAI if it is not already present on the SNPObject.

    Returns:
        Pandas DataFrame with columns: pop1, pop2, est, se, z, p, n_blocks, n_snps
    """
    afs, counts, pops = _prepare_inputs(data, sample_labels, ancestry=ancestry, laiobj=laiobj)
    n_snps, n_pops = afs.shape
    block_ids, block_lengths = _build_blocks(n_snps, blocks, block_size)
    n_blocks = block_lengths.size

    if pop1 is None and pop2 is None:
        if include_self:
            pop_pairs = [(pops[i], pops[j]) for i in range(n_pops) for j in range(i, n_pops)]
        else:
            pop_pairs = [(pops[i], pops[j]) for i in range(n_pops) for j in range(i + 1, n_pops)]
    else:
        if pop1 is None or pop2 is None:
            raise ValueError("Both pop1 and pop2 must be provided when one is provided")
        if len(pop1) != len(pop2):
            raise ValueError("pop1 and pop2 must have the same length")
        pop_pairs = list(zip(pop1, pop2))

    name_to_idx = {p: i for i, p in enumerate(pops)}

    rows: List[Dict[str, Union[str, float, int]]] = []
    # Precompute per-block index lists for efficiency
    block_bins = [np.where(block_ids == b)[0] for b in range(n_blocks)]

    for p1, p2 in pop_pairs:
        i = name_to_idx[p1]
        j = name_to_idx[p2]
        p_i = afs[:, i]
        p_j = afs[:, j]
        n_i = counts[:, i].astype(float)
        n_j = counts[:, j].astype(float)

        # per-SNP f2 with optional small-sample correction.
        # If apply_correction is True, SNPs with n<=1 for either population are excluded.
        with np.errstate(divide="ignore", invalid="ignore"):
            num = (p_i - p_j) ** 2
            if apply_correction:
                corr_i = np.where(n_i > 1, (p_i * (1.0 - p_i)) / (n_i - 1.0), np.nan)
                corr_j = np.where(n_j > 1, (p_j * (1.0 - p_j)) / (n_j - 1.0), np.nan)
                num = num - corr_i - corr_j
        # SNPs contribute only if the correction was defined (finite) and AFs present
        if apply_correction:
            snp_mask = np.isfinite(num)
        else:
            snp_mask = np.isfinite(num) & (n_i > 0) & (n_j > 0)

        # Aggregate per block: sums of num and counts. Use NaN to mark empty blocks.
        num_block_sums = np.full(n_blocks, np.nan, dtype=float)
        den_block_sums = np.full(n_blocks, np.nan, dtype=float)
        for b, idx in enumerate(block_bins):
            if idx.size == 0:
                continue
            idx2 = idx[snp_mask[idx]]
            if idx2.size == 0:
                continue
            num_block_sums[b] = float(np.nansum(num[idx2]))
            den_block_sums[b] = float(idx2.size)

        res = _jackknife_ratio_from_block_sums(num_block_sums, den_block_sums)
        rows.append(
            {
                "pop1": p1,
                "pop2": p2,
                "est": res.est,
                "se": res.se,
                "z": res.z,
                "p": res.p,
                "n_blocks": res.n_blocks,
                "n_snps": res.n_snps,
            }
        )

    return pd.DataFrame(rows)
Compute f2-statistics with block-jackknife standard errors.
Arguments:
- data: Either a SNPObject or a tuple (afs, counts, pops), where afs and counts are arrays of shape (n_snps, n_pops). If a SNPObject is provided, sample_labels are used to aggregate samples to populations.
- pop1, pop2: Populations to compute f2 for. If None:
  - with include_self=False (default), compute only off-diagonal pairs i<j.
  - with include_self=True, compute all pairs including diagonals (i<=j).
- sample_labels: Population label per sample (aligned with SNPObject.samples) when data is a SNPObject.
- apply_correction: Apply small-sample correction p*(1-p)/(n-1) per population. When True, SNPs with n<=1 in either population are excluded at that SNP.
- block_size: Number of SNPs per jackknife block (default 5000 SNPs). Ignored if blocks is provided.
- blocks: Optional explicit block id per SNP. If provided, overrides block_size.
- ancestry: Optional ancestry code to mask genotypes to a specific ancestry before aggregation. Requires LAI.
- laiobj: Optional LocalAncestryObject used to derive SNP-level LAI if it is not already present on the SNPObject.
Returns:
Pandas DataFrame with columns: pop1, pop2, est, se, z, p, n_blocks, n_snps
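A minimal usage sketch (not part of the library docs): it assumes f2 is importable from snputils.stats as shown on this page and that the tuple form (afs, counts, pops) described above is passed directly. Population labels, frequencies, and counts are placeholders.

import numpy as np
from snputils.stats import f2

rng = np.random.default_rng(42)
pops = ["POP_A", "POP_B", "POP_C"]                          # placeholder population labels
n_snps = 20_000
afs = rng.uniform(0.05, 0.95, size=(n_snps, len(pops)))     # allele frequencies, shape (n_snps, n_pops)
counts = np.full((n_snps, len(pops)), 50)                   # haplotype counts per SNP and population

# All off-diagonal pairs (include_self=False by default), 1000-SNP jackknife blocks
df = f2((afs, counts, pops), block_size=1000)
print(df[["pop1", "pop2", "est", "se", "z"]])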
def
f3( data: Union[Any, Tuple[numpy.ndarray, numpy.ndarray, List[str]]], target: Optional[Sequence[str]] = None, ref1: Optional[Sequence[str]] = None, ref2: Optional[Sequence[str]] = None, sample_labels: Optional[Sequence[str]] = None, apply_correction: bool = True, block_size: int = 5000, blocks: Optional[numpy.ndarray] = None, ancestry: Union[int, str, NoneType] = None, laiobj: Optional[Any] = None) -> pandas.core.frame.DataFrame:
def f3(
    data: Union[Any, Tuple[np.ndarray, np.ndarray, List[str]]],
    target: Optional[Sequence[str]] = None,
    ref1: Optional[Sequence[str]] = None,
    ref2: Optional[Sequence[str]] = None,
    sample_labels: Optional[Sequence[str]] = None,
    apply_correction: bool = True,
    block_size: int = 5000,
    blocks: Optional[np.ndarray] = None,
    ancestry: Optional[Union[str, int]] = None,
    laiobj: Optional[Any] = None,
) -> pd.DataFrame:
    """
    Compute f3-statistics f3(target; ref1, ref2) with block-jackknife SE.

    - `block_size` is the number of SNPs per jackknife block (default 5000 SNPs). Ignored if `blocks` is provided.
    - If `target`, `ref1`, and `ref2` are all None, compute f3 for all combinations where each role can be any population.
    - If `ancestry` is provided, genotypes will be masked to the specified ancestry using LAI before aggregation.
    - If `apply_correction` is True, subtract the finite sample term p_t*(1-p_t)/(n_t-1) from the per-SNP product.
      When True, SNPs with n_t<=1 are excluded.
    """
    afs, counts, pops = _prepare_inputs(data, sample_labels, ancestry=ancestry, laiobj=laiobj)
    n_snps, _ = afs.shape
    block_ids, block_lengths = _build_blocks(n_snps, blocks, block_size)
    n_blocks = block_lengths.size

    if target is None and ref1 is None and ref2 is None:
        triples = [(a, b, c) for a in pops for b in pops for c in pops]
    else:
        if target is None or ref1 is None or ref2 is None:
            raise ValueError("target, ref1, and ref2 must all be provided if any is provided")
        if not (len(target) == len(ref1) == len(ref2)):
            raise ValueError("target, ref1, ref2 must have the same length")
        triples = list(zip(target, ref1, ref2))

    name_to_idx = {p: i for i, p in enumerate(pops)}
    block_bins = [np.where(block_ids == b)[0] for b in range(n_blocks)]
    rows: List[Dict[str, Union[str, float, int]]] = []

    for t, r1, r2 in triples:
        it = name_to_idx[t]
        i1 = name_to_idx[r1]
        i2 = name_to_idx[r2]
        pt = afs[:, it]
        p1 = afs[:, i1]
        p2 = afs[:, i2]
        nt = counts[:, it].astype(float)
        n1 = counts[:, i1].astype(float)
        n2 = counts[:, i2].astype(float)

        with np.errstate(invalid="ignore", divide="ignore"):
            num = (pt - p1) * (pt - p2)
            if apply_correction:
                corr_t = np.where(nt > 1, (pt * (1.0 - pt)) / (nt - 1.0), np.nan)
                num = num - corr_t
        if apply_correction:
            # Require a valid correction on the target, references can be n>0
            snp_mask = np.isfinite(num) & (n1 > 0) & (n2 > 0)
        else:
            snp_mask = np.isfinite(num) & (nt > 0) & (n1 > 0) & (n2 > 0)

        num_block_sums = np.full(n_blocks, np.nan, dtype=float)
        den_block_sums = np.full(n_blocks, np.nan, dtype=float)
        for b, idx in enumerate(block_bins):
            if idx.size == 0:
                continue
            idx2 = idx[snp_mask[idx]]
            if idx2.size == 0:
                continue
            num_block_sums[b] = float(np.nansum(num[idx2]))
            den_block_sums[b] = float(idx2.size)

        res = _jackknife_ratio_from_block_sums(num_block_sums, den_block_sums)
        rows.append(
            {
                "target": t,
                "ref1": r1,
                "ref2": r2,
                "est": res.est,
                "se": res.se,
                "z": res.z,
                "p": res.p,
                "n_blocks": res.n_blocks,
                "n_snps": res.n_snps,
            }
        )

    return pd.DataFrame(rows)
Compute f3-statistics f3(target; ref1, ref2) with block-jackknife SE.
- block_size is the number of SNPs per jackknife block (default 5000 SNPs). Ignored if blocks is provided.
- If target, ref1, and ref2 are all None, compute f3 for all combinations where each role can be any population.
- If ancestry is provided, genotypes will be masked to the specified ancestry using LAI before aggregation.
- If apply_correction is True, subtract the finite sample term p_t*(1-p_t)/(n_t-1) from the per-SNP product. When True, SNPs with n_t<=1 are excluded.
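A minimal sketch of a single explicit f3 call, under the same assumptions as the f2 example above (tuple input, placeholder labels and random frequencies):

import numpy as np
from snputils.stats import f3

rng = np.random.default_rng(0)
pops = ["TARGET", "REF1", "REF2"]                           # placeholder labels
afs = rng.uniform(0.05, 0.95, size=(10_000, len(pops)))
counts = np.full((10_000, len(pops)), 40)

# One explicit triple f3(TARGET; REF1, REF2); target/ref1/ref2 must have equal length
df = f3((afs, counts, pops), target=["TARGET"], ref1=["REF1"], ref2=["REF2"], block_size=500)
print(df.loc[0, ["target", "ref1", "ref2", "est", "z"]])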
def
f4( data: Union[Any, Tuple[numpy.ndarray, numpy.ndarray, List[str]]], a: Optional[Sequence[str]] = None, b: Optional[Sequence[str]] = None, c: Optional[Sequence[str]] = None, d: Optional[Sequence[str]] = None, sample_labels: Optional[Sequence[str]] = None, block_size: int = 5000, blocks: Optional[numpy.ndarray] = None, ancestry: Union[int, str, NoneType] = None, laiobj: Optional[Any] = None) -> pandas.core.frame.DataFrame:
def f4(
    data: Union[Any, Tuple[np.ndarray, np.ndarray, List[str]]],
    a: Optional[Sequence[str]] = None,
    b: Optional[Sequence[str]] = None,
    c: Optional[Sequence[str]] = None,
    d: Optional[Sequence[str]] = None,
    sample_labels: Optional[Sequence[str]] = None,
    block_size: int = 5000,
    blocks: Optional[np.ndarray] = None,
    ancestry: Optional[Union[str, int]] = None,
    laiobj: Optional[Any] = None,
) -> pd.DataFrame:
    """
    Compute f4-statistics f4(a, b; c, d) with block-jackknife SE.

    - `block_size` is the number of SNPs per jackknife block (default 5000 SNPs). Ignored if `blocks` is provided.
    - If `ancestry` is provided, genotypes will be masked to the specified ancestry using LAI before aggregation.
    """
    afs, counts, pops = _prepare_inputs(data, sample_labels, ancestry=ancestry, laiobj=laiobj)
    n_snps, _ = afs.shape
    block_ids, block_lengths = _build_blocks(n_snps, blocks, block_size)
    n_blocks = block_lengths.size

    if a is None and b is None and c is None and d is None:
        quads = [(w, x, y, z) for w in pops for x in pops for y in pops for z in pops]
    else:
        if a is None or b is None or c is None or d is None:
            raise ValueError("a, b, c, d must all be provided if any is provided")
        if not (len(a) == len(b) == len(c) == len(d)):
            raise ValueError("a, b, c, d must have the same length")
        quads = list(zip(a, b, c, d))

    name_to_idx = {p: i for i, p in enumerate(pops)}
    block_bins = [np.where(block_ids == b)[0] for b in range(n_blocks)]
    rows: List[Dict[str, Union[str, float, int]]] = []

    for pa, pb, pc, dpop in quads:
        ia = name_to_idx[pa]
        ib = name_to_idx[pb]
        ic = name_to_idx[pc]
        id_ = name_to_idx[dpop]

        A = afs[:, ia]
        B = afs[:, ib]
        C = afs[:, ic]
        D = afs[:, id_]
        na = counts[:, ia].astype(float)
        nb = counts[:, ib].astype(float)
        nc = counts[:, ic].astype(float)
        nd = counts[:, id_].astype(float)

        with np.errstate(invalid="ignore"):
            num = (A - B) * (C - D)
        snp_mask = np.isfinite(num) & (na > 0) & (nb > 0) & (nc > 0) & (nd > 0)

        num_block_sums = np.full(n_blocks, np.nan, dtype=float)
        den_block_sums = np.full(n_blocks, np.nan, dtype=float)
        for b_idx, idx in enumerate(block_bins):
            if idx.size == 0:
                continue
            idx2 = idx[snp_mask[idx]]
            if idx2.size == 0:
                continue
            num_block_sums[b_idx] = float(np.nansum(num[idx2]))
            den_block_sums[b_idx] = float(idx2.size)

        res = _jackknife_ratio_from_block_sums(num_block_sums, den_block_sums)
        rows.append(
            {
                "a": pa,
                "b": pb,
                "c": pc,
                "d": dpop,
                "est": res.est,
                "se": res.se,
                "z": res.z,
                "p": res.p,
                "n_blocks": res.n_blocks,
                "n_snps": res.n_snps,
            }
        )

    return pd.DataFrame(rows)
Compute f4-statistics f4(a, b; c, d) with block-jackknife SE.
- block_size is the number of SNPs per jackknife block (default 5000 SNPs). Ignored if blocks is provided.
- If ancestry is provided, genotypes will be masked to the specified ancestry using LAI before aggregation.
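A minimal sketch of one explicit f4 configuration, under the same assumptions as the examples above (tuple input, placeholder labels):

import numpy as np
from snputils.stats import f4

rng = np.random.default_rng(1)
pops = ["A", "B", "C", "O"]                                 # placeholder labels; "O" stands in for an outgroup
afs = rng.uniform(0.05, 0.95, size=(10_000, len(pops)))
counts = np.full((10_000, len(pops)), 60)

# Single configuration f4(A, B; C, O); a/b/c/d must be equal-length sequences
df = f4((afs, counts, pops), a=["A"], b=["B"], c=["C"], d=["O"], block_size=500)
print(df[["a", "b", "c", "d", "est", "se", "z", "p"]])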
def
d_stat( data: Union[Any, Tuple[numpy.ndarray, numpy.ndarray, List[str]]], a: Optional[Sequence[str]] = None, b: Optional[Sequence[str]] = None, c: Optional[Sequence[str]] = None, d: Optional[Sequence[str]] = None, sample_labels: Optional[Sequence[str]] = None, block_size: int = 5000, blocks: Optional[numpy.ndarray] = None, ancestry: Union[int, str, NoneType] = None, laiobj: Optional[Any] = None) -> pandas.core.frame.DataFrame:
def d_stat(
    data: Union[Any, Tuple[np.ndarray, np.ndarray, List[str]]],
    a: Optional[Sequence[str]] = None,
    b: Optional[Sequence[str]] = None,
    c: Optional[Sequence[str]] = None,
    d: Optional[Sequence[str]] = None,
    sample_labels: Optional[Sequence[str]] = None,
    block_size: int = 5000,
    blocks: Optional[np.ndarray] = None,
    ancestry: Optional[Union[str, int]] = None,
    laiobj: Optional[Any] = None,
) -> pd.DataFrame:
    """
    Compute D-statistics D(a, b; c, d) as ratio of sums:
        D = sum_l (A-B)(C-D) / sum_l (A+B-2AB)(C+D-2CD)
    with delete-one block jackknife SE.

    - `block_size` is the number of SNPs per jackknife block (default 5000 SNPs). Ignored if `blocks` is provided.
    - If `ancestry` is provided, genotypes will be masked to the specified ancestry using LAI before aggregation.
    """
    afs, counts, pops = _prepare_inputs(data, sample_labels, ancestry=ancestry, laiobj=laiobj)
    n_snps, _ = afs.shape
    block_ids, block_lengths = _build_blocks(n_snps, blocks, block_size)
    n_blocks = block_lengths.size

    if a is None and b is None and c is None and d is None:
        quads = [(w, x, y, z) for w in pops for x in pops for y in pops for z in pops]
    else:
        if a is None or b is None or c is None or d is None:
            raise ValueError("a, b, c, d must all be provided if any is provided")
        if not (len(a) == len(b) == len(c) == len(d)):
            raise ValueError("a, b, c, d must have the same length")
        quads = list(zip(a, b, c, d))

    name_to_idx = {p: i for i, p in enumerate(pops)}
    block_bins = [np.where(block_ids == b)[0] for b in range(n_blocks)]
    rows: List[Dict[str, Union[str, float, int]]] = []

    for pa, pb, pc, dpop in quads:
        ia = name_to_idx[pa]
        ib = name_to_idx[pb]
        ic = name_to_idx[pc]
        id_ = name_to_idx[dpop]

        A = afs[:, ia]
        B = afs[:, ib]
        C = afs[:, ic]
        D = afs[:, id_]
        na = counts[:, ia].astype(float)
        nb = counts[:, ib].astype(float)
        nc = counts[:, ic].astype(float)
        nd = counts[:, id_].astype(float)

        with np.errstate(invalid="ignore"):
            num = (A - B) * (C - D)
            den = (A + B - 2 * A * B) * (C + D - 2 * C * D)

        snp_mask = np.isfinite(num) & np.isfinite(den) & (na > 0) & (nb > 0) & (nc > 0) & (nd > 0)

        num_block_sums = np.full(n_blocks, np.nan, dtype=float)
        den_block_sums = np.full(n_blocks, np.nan, dtype=float)
        ct_block_sums = np.zeros(n_blocks, dtype=int)
        for b_idx, idx in enumerate(block_bins):
            if idx.size == 0:
                continue
            idx2 = idx[snp_mask[idx]]
            if idx2.size == 0:
                continue
            num_block_sums[b_idx] = float(np.nansum(num[idx2]))
            # drop near-zero denominators within a block for stability
            block_den = float(np.nansum(den[idx2]))
            den_block_sums[b_idx] = block_den if abs(block_den) > 1e-12 else np.nan
            ct_block_sums[b_idx] = int(idx2.size)

        res = _jackknife_ratio_from_block_sums(num_block_sums, den_block_sums)
        rows.append(
            {
                "a": pa,
                "b": pb,
                "c": pc,
                "d": dpop,
                "est": res.est,
                "se": res.se,
                "z": res.z,
                "p": res.p,
                "n_blocks": res.n_blocks,
                "n_snps": int(ct_block_sums.sum()),
            }
        )

    return pd.DataFrame(rows)
Compute D-statistics D(a, b; c, d) as ratio of sums:
    D = sum_l (A-B)(C-D) / sum_l (A+B-2AB)(C+D-2CD)
with delete-one block jackknife SE.
- block_size is the number of SNPs per jackknife block (default 5000 SNPs). Ignored if blocks is provided.
- If ancestry is provided, genotypes will be masked to the specified ancestry using LAI before aggregation.
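A minimal sketch of a single D-statistic call, again assuming the tuple input form with placeholder labels and data:

import numpy as np
from snputils.stats import d_stat

rng = np.random.default_rng(2)
pops = ["P1", "P2", "P3", "OUT"]                            # placeholder labels
afs = rng.uniform(0.05, 0.95, size=(10_000, len(pops)))
counts = np.full((10_000, len(pops)), 60)

# D(P1, P2; P3, OUT); a |z| around 3 or more is commonly read as evidence against treeness
df = d_stat((afs, counts, pops), a=["P1"], b=["P2"], c=["P3"], d=["OUT"], block_size=500)
print(df[["a", "b", "c", "d", "est", "z", "p", "n_snps"]])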
def
f4_ratio( data: Union[Any, Tuple[numpy.ndarray, numpy.ndarray, List[str]]], num: Sequence[Tuple[str, str, str, str]], den: Sequence[Tuple[str, str, str, str]], sample_labels: Optional[Sequence[str]] = None, block_size: int = 5000, blocks: Optional[numpy.ndarray] = None, ancestry: Union[int, str, NoneType] = None, laiobj: Optional[Any] = None) -> pandas.core.frame.DataFrame:
def f4_ratio(
    data: Union[Any, Tuple[np.ndarray, np.ndarray, List[str]]],
    num: Sequence[Tuple[str, str, str, str]],
    den: Sequence[Tuple[str, str, str, str]],
    sample_labels: Optional[Sequence[str]] = None,
    block_size: int = 5000,
    blocks: Optional[np.ndarray] = None,
    ancestry: Optional[Union[str, int]] = None,
    laiobj: Optional[Any] = None,
) -> pd.DataFrame:
    """
    Compute f4-ratio statistics as ratio of two f4-statistics with block-jackknife SE.

    Args:
        num: Sequence of quadruples (a, b, c, d) for numerator f4(a, b; c, d)
        den: Sequence of quadruples (a, b, c, d) for denominator f4(a, b; c, d)

    Notes:
        - `block_size` is the number of SNPs per jackknife block (default 5000 SNPs). Ignored if `blocks` is provided.
        - If `ancestry` is provided, genotypes will be masked to the specified ancestry using LAI before aggregation.
    """
    if len(num) != len(den):
        raise ValueError("'num' and 'den' must have the same length")

    afs, counts, pops = _prepare_inputs(data, sample_labels, ancestry=ancestry, laiobj=laiobj)
    n_snps, _ = afs.shape
    block_ids, block_lengths = _build_blocks(n_snps, blocks, block_size)
    n_blocks = block_lengths.size
    block_bins = [np.where(block_ids == b)[0] for b in range(n_blocks)]
    name_to_idx = {p: i for i, p in enumerate(pops)}

    rows: List[Dict[str, Union[str, float, int]]] = []
    for (na, nb, nc, nd), (da, db, dc, dd) in zip(num, den):
        ia, ib, ic, id_ = name_to_idx[na], name_to_idx[nb], name_to_idx[nc], name_to_idx[nd]
        ja, jb, jc, jd = name_to_idx[da], name_to_idx[db], name_to_idx[dc], name_to_idx[dd]

        A, B, C, D = afs[:, ia], afs[:, ib], afs[:, ic], afs[:, id_]
        E, F, G, H = afs[:, ja], afs[:, jb], afs[:, jc], afs[:, jd]
        nA, nB, nC, nD = counts[:, ia], counts[:, ib], counts[:, ic], counts[:, id_]
        nE, nF, nG, nH = counts[:, ja], counts[:, jb], counts[:, jc], counts[:, jd]

        with np.errstate(invalid="ignore"):
            num_snp = (A - B) * (C - D)
            den_snp = (E - F) * (G - H)
        mask_num = np.isfinite(num_snp) & (nA > 0) & (nB > 0) & (nC > 0) & (nD > 0)
        mask_den = np.isfinite(den_snp) & (nE > 0) & (nF > 0) & (nG > 0) & (nH > 0)
        mask_both = mask_num & mask_den

        num_block_sums = np.full(n_blocks, np.nan, dtype=float)
        den_block_sums = np.full(n_blocks, np.nan, dtype=float)
        ct_block_sums = np.zeros(n_blocks, dtype=int)
        for b_idx, idx in enumerate(block_bins):
            if idx.size == 0:
                continue
            idx2 = idx[mask_both[idx]]
            if idx2.size == 0:
                continue
            num_block_sums[b_idx] = float(np.nansum(num_snp[idx2]))
            block_den = float(np.nansum(den_snp[idx2]))
            den_block_sums[b_idx] = block_den if abs(block_den) > 1e-12 else np.nan
            ct_block_sums[b_idx] = int(idx2.size)

        res = _jackknife_ratio_from_block_sums(num_block_sums, den_block_sums)
        rows.append(
            {
                "num": f"({na},{nb};{nc},{nd})",
                "den": f"({da},{db};{dc},{dd})",
                "est": res.est,
                "se": res.se,
                "z": res.z,
                "p": res.p,
                "n_blocks": res.n_blocks,
                "n_snps": int(ct_block_sums.sum()),
            }
        )

    return pd.DataFrame(rows)
Compute f4-ratio statistics as ratio of two f4-statistics with block-jackknife SE.
Arguments:
- num: Sequence of quadruples (a, b, c, d) for numerator f4(a, b; c, d)
- den: Sequence of quadruples (a, b, c, d) for denominator f4(a, b; c, d)
Notes:
- block_size is the number of SNPs per jackknife block (default 5000 SNPs). Ignored if blocks is provided.
- If ancestry is provided, genotypes will be masked to the specified ancestry using LAI before aggregation.
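A minimal sketch of an f4-ratio call, under the same tuple-input assumptions; the quadruple layout below is one common admixture-proportion arrangement and is only illustrative, since the function accepts any quadruples the caller supplies:

import numpy as np
from snputils.stats import f4_ratio

rng = np.random.default_rng(3)
pops = ["A", "B", "X", "C", "O"]                            # placeholder labels
afs = rng.uniform(0.05, 0.95, size=(10_000, len(pops)))
counts = np.full((10_000, len(pops)), 60)

# Ratio f4(A, O; X, C) / f4(A, O; B, C); num and den must have the same length
df = f4_ratio(
    (afs, counts, pops),
    num=[("A", "O", "X", "C")],
    den=[("A", "O", "B", "C")],
    block_size=500,
)
print(df[["num", "den", "est", "se", "z"]])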
def
fst( data: Union[Any, Tuple[numpy.ndarray, numpy.ndarray, List[str]]], pop1: Optional[Sequence[str]] = None, pop2: Optional[Sequence[str]] = None, *, method: str = 'hudson', sample_labels: Optional[Sequence[str]] = None, block_size: int = 5000, blocks: Optional[numpy.ndarray] = None, ancestry: Union[int, str, NoneType] = None, laiobj: Optional[Any] = None, include_self: bool = False) -> pandas.core.frame.DataFrame:
def fst(
    data: Union[Any, Tuple[np.ndarray, np.ndarray, List[str]]],
    pop1: Optional[Sequence[str]] = None,
    pop2: Optional[Sequence[str]] = None,
    *,
    method: str = "hudson",
    sample_labels: Optional[Sequence[str]] = None,
    block_size: int = 5000,
    blocks: Optional[np.ndarray] = None,
    ancestry: Optional[Union[str, int]] = None,
    laiobj: Optional[Any] = None,
    include_self: bool = False,
) -> pd.DataFrame:
    """
    Pairwise F_ST with delete-one block jackknife SE.

    Methods:
        - 'hudson' (a.k.a. "ratio of averages" following Hudson 1992 / Bhatia 2013):
            per-SNP num = d_xy - 0.5*(pi_x + pi_y)
            per-SNP den = d_xy
            where d_xy = p_x*(1-p_y) + p_y*(1-p_x) and
            pi_x = 2*p_x*(1-p_x) * n_x/(n_x - 1) (unbiased within-pop diversity on haplotypes).
        - 'weir_cockerham' (Weir & Cockerham 1984; θ):
            compute per-SNP variance components a,b,c (for r=2) from allele freqs and haplotype counts:
                n1, n2 = haplotype counts; p1, p2 = allele freqs
                n = n1 + n2
                n_bar = n / 2
                p_bar = (n1*p1 + n2*p2) / n
                s2 = (n1*(p1 - p_bar)**2 + n2*(p2 - p_bar)**2) / n_bar
                h1, h2 = 2*p1*(1-p1), 2*p2*(1-p2)
                h_bar = 0.5*(h1 + h2)
                n_c = (n - (n1**2 + n2**2)/n)  # equals 2*n1*n2/n for r=2
                a = (n_bar / n_c) * (s2 - (p_bar*(1 - p_bar) - 0.5*s2 - 0.25*h_bar) / (n_bar - 1))
                b = (n_bar / (n_bar - 1)) * (p_bar*(1 - p_bar) - 0.5*s2 - ((2*n_bar - 1)/(4*n_bar))*h_bar)
                c = 0.5 * h_bar
            and use num=a, den=(a+b+c), then ratio-of-sums jackknife.

    Notes:
        * Inputs are the same as f2/f3/f4: either SNPObject or (afs, counts, pops).
        * For WC we use expected heterozygosity h_i = 2 p_i (1 - p_i) from allele freqs.
        * SNPs with n<=1 in either pop or with invalid denominators are ignored.
    """
    method = str(method).strip().lower().replace(" ", "_").replace("-", "_")
    if method in ("wc", "weir", "weir_cockerham", "weir-cockerham"):
        method = "weir_cockerham"
    elif method in ("h", "hudson", "bhatia", "ratio", "ratio_of_averages", "ratio-of-averages"):
        method = "hudson"
    elif method not in ("hudson", "weir_cockerham"):
        # only raise if it's neither a known alias nor a canonical name
        raise ValueError(f"Unknown method for fst: {method!r}")

    afs, counts, pops = _prepare_inputs(data, sample_labels, ancestry=ancestry, laiobj=laiobj)
    n_snps, n_pops = afs.shape
    block_ids, block_lengths = _build_blocks(n_snps, blocks, block_size)
    n_blocks = block_lengths.size

    if pop1 is None and pop2 is None:
        if include_self:
            pairs = [(pops[i], pops[j]) for i in range(n_pops) for j in range(i, n_pops)]
        else:
            pairs = [(pops[i], pops[j]) for i in range(n_pops) for j in range(i + 1, n_pops)]
    else:
        if pop1 is None or pop2 is None or len(pop1) != len(pop2):
            raise ValueError("pop1 and pop2 must both be provided and of equal length")
        pairs = list(zip(pop1, pop2))

    name_to_idx = {p: i for i, p in enumerate(pops)}
    block_bins = [np.where(block_ids == b)[0] for b in range(n_blocks)]

    out_rows: List[Dict[str, Union[str, float, int]]] = []

    for pA, pB in pairs:
        i = name_to_idx[pA]
        j = name_to_idx[pB]
        p1 = afs[:, i].astype(float)
        p2 = afs[:, j].astype(float)
        n1 = counts[:, i].astype(float)
        n2 = counts[:, j].astype(float)

        valid = np.isfinite(p1) & np.isfinite(p2)

        if method == "hudson":
            # d_xy and within-pop diversities (unbiased, haplotype-based)
            d_xy = p1 * (1.0 - p2) + p2 * (1.0 - p1)
            with np.errstate(divide="ignore", invalid="ignore"):
                pi1 = 2.0 * p1 * (1.0 - p1) * (n1 / (n1 - 1.0))
                pi2 = 2.0 * p2 * (1.0 - p2) * (n2 / (n2 - 1.0))
            num_snp = d_xy - 0.5 * (pi1 + pi2)
            den_snp = d_xy
            snp_mask = valid & (n1 > 1) & (n2 > 1) & np.isfinite(num_snp) & np.isfinite(den_snp)
        else:
            # Weir-Cockerham θ components (r=2)
            n = n1 + n2
            with np.errstate(divide="ignore", invalid="ignore"):
                n_bar = n / 2.0
                p_bar = np.where(n > 0, (n1 * p1 + n2 * p2) / n, np.nan)
                s2 = (n1 * (p1 - p_bar) ** 2 + n2 * (p2 - p_bar) ** 2) / n_bar
                h1 = 2.0 * p1 * (1.0 - p1)
                h2 = 2.0 * p2 * (1.0 - p2)
                h_bar = 0.5 * (h1 + h2)
                n_c = n - (n1 * n1 + n2 * n2) / np.where(n > 0, n, np.nan)  # == 2*n1*n2/n
                # components
                a = (n_bar / n_c) * (s2 - (p_bar * (1.0 - p_bar) - 0.5 * s2 - 0.25 * h_bar) / (n_bar - 1.0))
                b = (n_bar / (n_bar - 1.0)) * (p_bar * (1.0 - p_bar) - 0.5 * s2 - ((2.0 * n_bar - 1.0) / (4.0 * n_bar)) * h_bar)
                c = 0.5 * h_bar
            num_snp = a
            den_snp = a + b + c
            # Need at least 2 haplotypes per pop and well-defined denominators
            snp_mask = valid & (n1 > 1) & (n2 > 1) & np.isfinite(num_snp) & np.isfinite(den_snp)

        # Aggregate by blocks
        num_block_sums = np.full(n_blocks, np.nan, dtype=float)
        den_block_sums = np.full(n_blocks, np.nan, dtype=float)
        ct_block_sums = np.zeros(n_blocks, dtype=int)

        for b_idx, idx in enumerate(block_bins):
            if idx.size == 0:
                continue
            idx2 = idx[snp_mask[idx]]
            if idx2.size == 0:
                continue
            ns = float(np.nansum(num_snp[idx2]))
            ds = float(np.nansum(den_snp[idx2]))
            # drop numerically-zero denominators for stability
            den_block_sums[b_idx] = ds if abs(ds) > 1e-12 else np.nan
            num_block_sums[b_idx] = ns
            ct_block_sums[b_idx] = int(idx2.size)

        res = _jackknife_ratio_from_block_sums(num_block_sums, den_block_sums)
        out_rows.append(
            {
                "pop1": pA,
                "pop2": pB,
                "method": method,
                "est": res.est,
                "se": res.se,
                "z": res.z,
                "p": res.p,
                "n_blocks": res.n_blocks,
                "n_snps": int(ct_block_sums.sum()),
            }
        )

    return pd.DataFrame(out_rows)
Pairwise F_ST with delete-one block jackknife SE.
Methods:
- 'hudson' (a.k.a. "ratio of averages" following Hudson 1992 / Bhatia 2013):
    per-SNP num = d_xy - 0.5*(pi_x + pi_y)
    per-SNP den = d_xy
  where d_xy = p_x*(1-p_y) + p_y*(1-p_x) and
  pi_x = 2*p_x*(1-p_x) * n_x/(n_x - 1) (unbiased within-pop diversity on haplotypes).
- 'weir_cockerham' (Weir & Cockerham 1984; θ): compute per-SNP variance components a, b, c (for r=2) from allele freqs and haplotype counts:
    n1, n2 = haplotype counts; p1, p2 = allele freqs
    n = n1 + n2
    n_bar = n / 2
    p_bar = (n1*p1 + n2*p2) / n
    s2 = (n1*(p1 - p_bar)**2 + n2*(p2 - p_bar)**2) / n_bar
    h1, h2 = 2*p1*(1-p1), 2*p2*(1-p2)
    h_bar = 0.5*(h1 + h2)
    n_c = n - (n1**2 + n2**2)/n    (equals 2*n1*n2/n for r=2)
    a = (n_bar / n_c) * (s2 - (p_bar*(1 - p_bar) - 0.5*s2 - 0.25*h_bar) / (n_bar - 1))
    b = (n_bar / (n_bar - 1)) * (p_bar*(1 - p_bar) - 0.5*s2 - ((2*n_bar - 1)/(4*n_bar))*h_bar)
    c = 0.5 * h_bar
  and use num=a, den=(a+b+c), then ratio-of-sums jackknife.
Notes:
- Inputs are the same as f2/f3/f4: either SNPObject or (afs, counts, pops).
- For WC we use expected heterozygosity h_i = 2 p_i (1 - p_i) from allele freqs.
- SNPs with n<=1 in either pop or with invalid denominators are ignored.
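A minimal sketch comparing the two methods, under the same assumptions as the earlier examples (tuple input, placeholder labels and data); note that method is keyword-only and that 'wc' is accepted as an alias for 'weir_cockerham':

import numpy as np
from snputils.stats import fst

rng = np.random.default_rng(4)
pops = ["POP_A", "POP_B", "POP_C"]                          # placeholder labels
afs = rng.uniform(0.05, 0.95, size=(20_000, len(pops)))
counts = np.full((20_000, len(pops)), 50)

# All off-diagonal pairs with the Hudson/Bhatia estimator, 1000-SNP blocks
hudson = fst((afs, counts, pops), method="hudson", block_size=1000)

# A single explicit pair with the Weir-Cockerham estimator (via the 'wc' alias)
wc = fst((afs, counts, pops), pop1=["POP_A"], pop2=["POP_B"], method="wc")

print(hudson[["pop1", "pop2", "method", "est", "se"]])
print(wc[["pop1", "pop2", "method", "est", "se"]])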