How to Build Memory-Efficient Transformers with xFormers Using Packed Sequences, GQA, ALiBi, SwiGLU, and Causal Attention

by CryptoExpert
fiverr


# ---------------------------------------------------------------------------
# Section 4: variable-length sequences packed into a single batch row.
#
# Relies on names set up earlier in the tutorial and not visible in this
# chunk: `torch`, `device`, `xops` (xformers.ops), `ab` (presumably
# xformers.ops.fmha.attn_bias -- verify against the import block) and the
# `vanilla_attention` reference helper.
# ---------------------------------------------------------------------------
print("\n" + "=" * 70 + "\n4. Variable-length packed batch — no padding waste\n" + "=" * 70)

# Four sequences of different lengths are concatenated along the token axis,
# so the tensors have batch size 1 and `total` tokens instead of being padded
# to the longest sequence.
seqlens = [37, 120, 8, 200]
total = sum(seqlens)
H, K = 8, 64
q = torch.randn(1, total, H, K, device=device, dtype=torch.float16)
k = torch.randn(1, total, H, K, device=device, dtype=torch.float16)
v = torch.randn(1, total, H, K, device=device, dtype=torch.float16)

# The whole demo is best-effort: any failure (unsupported backend, older
# xFormers API, ...) is reported and the script moves on to the next section.
try:
    # Block-diagonal bias built from the segment lengths, so each packed
    # sequence only attends to its own tokens.
    bias = ab.BlockDiagonalMask.from_seqlens(seqlens)
    out_packed = xops.memory_efficient_attention(q, k, v, attn_bias=bias)

    # Sanity check: run the first segment on its own through the reference
    # implementation and compare against the matching slice of the packed
    # output. `.half()` brings the reference to the packed output's dtype.
    s0 = seqlens[0]
    ref0 = vanilla_attention(q[:, :s0], k[:, :s0], v[:, :s0]).half()
    print("packed shape :", tuple(out_packed.shape), "(all", total, "tokens, no pad)")
    print("segment-0 max diff : {:.2e}".format((out_packed[:, :s0] - ref0).abs().max().item()))

    # Same packing with the causal variant of the mask; the result is
    # discarded, this only demonstrates that the call works.
    cbias = ab.BlockDiagonalCausalMask.from_seqlens(seqlens)
    _ = xops.memory_efficient_attention(q, k, v, attn_bias=cbias)
    print("-> also did a packed CAUSAL pass. This is how vLLM-style engines")
    print(" batch requests of different lengths with zero padding overhead.")

    # Split the packed output back into one tensor per original sequence.
    splits = bias.split(out_packed)
    print("recovered segments :", [tuple(t.shape) for t in splits])
except Exception as e:
    print("BlockDiagonalMask path skipped on this version/backend:", repr(e))
# ---------------------------------------------------------------------------
# Section 5: grouped-query attention using the 5-D [B, M, G, Hq, K] layout.
#
# Relies on `torch`, `device` and `xops` (xformers.ops) being defined earlier
# in the tutorial; they are not visible in this chunk.
# ---------------------------------------------------------------------------
print("\n" + "=" * 70 + "\n5. Grouped-query attention (5-D BMGHK layout)\n" + "=" * 70)

B, M, K = 2, 256, 64
n_q_heads, n_kv_heads = 8, 2
# G = number of KV heads (groups); Hq = query heads that share each KV head.
G, Hq = n_kv_heads, n_q_heads // n_kv_heads

# Best-effort demo: a failure on this version/backend is reported, not fatal.
try:
    qg = torch.randn(B, M, G, Hq, K, device=device, dtype=torch.float16)
    # K/V are stored with a single head per group -- this is the small
    # KV-cache footprint that GQA is about.
    kg = torch.randn(B, M, G, 1, K, device=device, dtype=torch.float16)
    vg = torch.randn(B, M, G, 1, K, device=device, dtype=torch.float16)
    # The 5-D path expects K/V to have the same heads-per-group dimension as
    # Q. expand() returns a broadcast view over the size-1 axis, so no KV
    # data is copied.
    # NOTE(review): shape requirement taken from the xFormers docs -- confirm
    # against the installed version.
    out_gqa = xops.memory_efficient_attention(
        qg,
        kg.expand(B, M, G, Hq, K),
        vg.expand(B, M, G, Hq, K),
    )
    print("GQA output shape :", tuple(out_gqa.shape), "= [B, M, G, Hq, K]")
    print(f"-> {n_q_heads} query heads, only {n_kv_heads} KV heads: smaller KV-cache,")
    print(" which is exactly what Llama-/Mistral-class models use at inference.")
except Exception as e:
    print("GQA 5-D path skipped on this version/backend:", repr(e))



Source link

You may also like