From 0fbf88101f06d982ad2a7d6e6401a9c23a72e8b8 Mon Sep 17 00:00:00 2001 From: Matt Jadud Date: Sat, 29 Nov 2025 17:05:21 -0500 Subject: [PATCH] Core structure --- README.md | 5 ++ cmd/api/main.go | 11 +++- internal/domain64/README.md | 114 ++++++++++++++++++++++++++++++++++ internal/domain64/domain64.go | 31 ++++++++- 4 files changed, 158 insertions(+), 3 deletions(-) create mode 100644 internal/domain64/README.md diff --git a/README.md b/README.md index 8616142..5aa89c8 100644 --- a/README.md +++ b/README.md @@ -17,3 +17,8 @@ Two reasons. * SQlite or Postgres The use-case is (essentially) single-user. + +## API + +### /fetch/ + diff --git a/cmd/api/main.go b/cmd/api/main.go index 502eb13..00f1e43 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -7,5 +7,14 @@ import ( ) func main() { - fmt.Printf("True: %s", domain64.ReturnsTrue()) + d64 := domain64.Domain64{ + TLD: 1, + Domain: 1, + Subdomain: 1, + Path: 1, + } + result := d64.AsInt64() + fmt.Printf("%064b\n", result) + fmt.Printf("%016x\n", result) + fmt.Println(result) } diff --git a/internal/domain64/README.md b/internal/domain64/README.md new file mode 100644 index 0000000..b00dcd3 --- /dev/null +++ b/internal/domain64/README.md @@ -0,0 +1,114 @@ +# domain64 + +`domain64` is a BIGINT (or 64-bit) type that can be used to encode all domains we are likely to encounter. It represents well as Jsonnet/JSON, and can be used in partitioning database tables easily. + +## what is it + +To encode all of the TLDs, domains, and subdomains we will encounter, we'll use a `domain64` encoding. It maps the entire URL space into a single, 64-bit number (or, `BIGINT` in Postgres). + + +```mermaid +packet-beta +0-7: "FF | TLD" +8-31: "FFFFFF | Domain" +32-39: "FF | Subdomain" +40-63: "FFFFFF | Path" +``` + +``` +FF:FFFFFF:FF:FFFFFF +``` + +or + +``` +tld:domain:subdomain:path +``` + +or + +``` +com:jadud:www:teaching:berea +``` + +can be indexed/partitioned uniquely.
+ +This lets us track + +* 255 (#FF) TLDs +* 16,777,216 (#FFFFFF) domains under each TLD +* 255 (#FF) subdomains under each domain +* 16,777,216 (#FFFFFF) paths under each subdomain + +## what that means + +There are only around 10 TLDs that make up the majority of all sites on the internet. The search engine maxes out at tracking 256 unique TLDs (#00-#FF). + +Each TLD can hold up to 16M unique sites. There are 302M `.com` domains, 36M `.cn`, and 20M `.org`. Again, this is for a "personal" search engine, and it is not intended to scale to handling all of the internet. Handling ~5% of `.com` (or 75% of `.org`) is *just fine*. + +Under a domain, it is possible to uniquely partition off 255 subdomains (where `00` is "no subdomain"). + +Paths can be indexed uniquely, up to 16M per subdomain. + +## example + +``` +01:000001:00:000000 com.jadud +01:000001:01:000000 com.jadud.research +01:000001:02:000000 com.jadud.teaching +01:000001:02:000001 com.jadud.teaching/olin +01:000001:02:000002 com.jadud.teaching/berea +``` + + +| tld | domain | sub | path | hex | dec | +| --- | --- | --- | --- | --- | --- | +| com | jadud | _ | _ | #x0100000100000000 | 72057598332895232 | +| com | jadud | research | _ | #x0100000101000000 | 72057598349672448 | +| com | jadud | teaching | _ | #x0100000102000000 | 72057598366449664 | +| com | jadud | teaching | olin | #x0100000102000001 | 72057598366449665 | +| com | jadud | teaching | berea | #x0100000102000002 | 72057598366449666 | + +## for partitioning + +On a table that contains a `domain64` value, we can partition based on numeric ranges very efficiently. + + +```sql +CREATE TABLE comjadud PARTITION OF com + FOR VALUES FROM (0x0100000100000000) TO (0x01000001FFFFFFFF); +``` + +Or + +```sql +CREATE TABLE comjadudresearch PARTITION OF com + FOR VALUES FROM (0x0100000101000000) TO (0x0100000101FFFFFF); +``` + +## As Jsonnet/JSON + +Jsonnet will naturally sort by the hex key values.
// Domain64 identifies a location as four numeric indexes that AsInt64 packs
// into one 64-bit value. From the most significant bit down, the packed
// layout is:
//
//	bits 63-56  TLD        ( 8 bits, FF)
//	bits 55-32  Domain     (24 bits, FFFFFF)
//	bits 31-24  Subdomain  ( 8 bits, FF)
//	bits 23-0   Path       (24 bits, FFFFFF)
type Domain64 struct {
	// TLD is the top-level-domain index; it fills its 8-bit slot exactly.
	TLD uint8
	// Domain is the domain index. Its slot is 24 bits wide; Go has no 24-bit
	// integer, so it is held in a uint32 (a uint16 would cap it at 65,535)
	// and AsInt64 keeps only the low 24 bits.
	Domain uint32
	// Subdomain is the subdomain index; it fills its 8-bit slot exactly.
	Subdomain uint8
	// Path is the path index. Like Domain it has a 24-bit slot, is held in a
	// uint32, and is truncated to 24 bits by AsInt64.
	Path uint32
}

// AsInt64 packs the receiver's four fields into a single int64 using the
// layout documented on Domain64.
//
// Domain and Path are masked to 24 bits before shifting, so a value that is
// too large is truncated rather than spilling into the neighbouring field.
// Because the TLD occupies the top byte, a TLD of 0x80 or above sets the sign
// bit and the returned int64 is negative.
func (d *Domain64) AsInt64() int64 {
	const (
		tldShift       = 64 - 8
		domainShift    = 64 - (8 + 24)
		subdomainShift = 64 - (8 + 24 + 8)

		// mask24 selects the low 24 bits of a Domain or Path value.
		mask24 = 0xFFFFFF
	)

	return int64(d.TLD)<<tldShift |
		int64(d.Domain&mask24)<<domainShift |
		int64(d.Subdomain)<<subdomainShift |
		int64(d.Path&mask24)
}

// https://gobyexample.com/testing-and-benchmarking