Compare commits
2 Commits
7beeacab1a
...
0fbf88101f
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0fbf88101f
|
||
|
|
b80e2421f1
|
@@ -17,3 +17,8 @@ Two reasons.
|
|||||||
* SQLite or Postgres
|
* SQLite or Postgres
|
||||||
|
|
||||||
The use-case is (essentially) single-user.
|
The use-case is (essentially) single-user.
|
||||||
|
|
||||||
|
## API
|
||||||
|
|
||||||
|
### /fetch/<b64:URL>
|
||||||
|
|
||||||
|
|||||||
@@ -2,8 +2,19 @@ package main
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
|
||||||
|
"git.jadud.com/grosbeak/internal/domain64"
|
||||||
)
|
)
|
||||||
|
|
||||||
// main builds a sample domain64.Domain64 value and prints its int64 encoding.
func main() {
|
func main() {
|
||||||
fmt.Printf("True: %s", domain64.ReturnsTrue())
|
// Sample value with every field set to 1.
d64 := domain64.Domain64{
|
||||||
|
TLD: 1,
|
||||||
|
Domain: 1,
|
||||||
|
Subdomain: 1,
|
||||||
|
Path: 1,
|
||||||
|
}
|
||||||
|
// Pack the four fields into a single int64.
result := d64.AsInt64()
|
||||||
|
// Print the packed value as 64 binary digits, as 16 hex digits, and in decimal.
fmt.Printf("%064b\n", result)
|
||||||
|
fmt.Printf("%016x\n", result)
|
||||||
|
fmt.Println(result)
|
||||||
}
|
}
|
||||||
|
|||||||
114
internal/domain64/README.md
Normal file
114
internal/domain64/README.md
Normal file
@@ -0,0 +1,114 @@
|
|||||||
|
# domain64
|
||||||
|
|
||||||
|
`domain64` is a BIGINT (64-bit integer) type that can be used to encode all domains we are likely to encounter. It is easy to represent in Jsonnet/JSON, and it makes partitioning database tables straightforward.
|
||||||
|
|
||||||
|
## what is it
|
||||||
|
|
||||||
|
To encode all of the TLDs, domains, and subdomains we will encounter, we'll use a `domain64` encoding. It maps the entire URL space into a single 64-bit number (a `BIGINT` in Postgres).
|
||||||
|
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
packet-beta
|
||||||
|
0-7: "FF | TLD"
|
||||||
|
8-31: "FFFFFF | Domain"
|
||||||
|
32-39: "FF | Subdomain"
|
||||||
|
40-63: "FFFFFF | Path"
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
FF:FFFFFF:FF:FFFFFF
|
||||||
|
```
|
||||||
|
|
||||||
|
or
|
||||||
|
|
||||||
|
```
|
||||||
|
tld:domain:subdomain:path
|
||||||
|
```
|
||||||
|
|
||||||
|
or
|
||||||
|
|
||||||
|
```
|
||||||
|
com:jadud:www:teaching:berea
|
||||||
|
```
|
||||||
|
|
||||||
|
can be indexed/partitioned uniquely.
|
||||||
|
|
||||||
|
This lets us track
|
||||||
|
|
||||||
|
* 255 (#FF) TLDs
|
||||||
|
* 16,777,216 (#FFFFFF) domains under each TLD
|
||||||
|
* 255 (#FF) subdomains under each domain
|
||||||
|
* 16,777,216 (#FFFFFF) paths on a given domain
|
||||||
|
|
||||||
|
## what that means
|
||||||
|
|
||||||
|
There are only around 10 TLDs that make up the majority of all sites on the internet. The search engine maxes out at tracking 256 unique TLDs (#00-#FF).
|
||||||
|
|
||||||
|
Each TLD can hold up to 16M unique sites. There are 302M `.com` domains, 36M `.cn`, and 20M `.org`. Again, this is for a "personal" search engine, and it is not intended to scale to handling all of the internet. Handling ~5% of `.com` (or 75% of `.org`) is *just fine*.
|
||||||
|
|
||||||
|
Under a domain, it is possible to uniquely partition off 255 subdomains (where `00` is "no subdomain").
|
||||||
|
|
||||||
|
Paths can be indexed uniquely, up to 16M per subdomain.
|
||||||
|
|
||||||
|
## example
|
||||||
|
|
||||||
|
```
|
||||||
|
01:000001:00:000000 com.jadud
|
||||||
|
01:000001:01:000000 com.jadud.research
|
||||||
|
01:000001:02:000000 com.jadud.teaching
|
||||||
|
01:000001:02:000001 com.jadud.teaching/olin
|
||||||
|
01:000001:02:000002 com.jadud.teaching/berea
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
| tld | domain | sub | path | hex | dec |
|
||||||
|
| --- | --- | --- | --- | --- | --- |
|
||||||
|
| com | jadud | _ | _ | #x0100000100000000 | 72057598332895232 |
|
||||||
|
| com | jadud | research | _ | #x0100000101000000 | 72057598349672448 |
|
||||||
|
| com | jadud | teaching | _ | #x0100000102000000 | 72057598366449664 |
|
||||||
|
| com | jadud | teaching | olin | #x0100000102000001 | 72057598366449665 |
|
||||||
|
| com | jadud | teaching | berea | #x0100000102000002 | 72057598366449666 |
|
||||||
|
|
||||||
|
## for partitioning
|
||||||
|
|
||||||
|
On a table that contains a `domain64` value, we can partition based on numeric ranges very efficiently.
|
||||||
|
|
||||||
|
|
||||||
|
```sql
|
||||||
|
CREATE TABLE comjadud PARTITION OF com
|
||||||
|
FOR VALUES FROM (0x0100000100000000) TO (0x01000001FFFFFFFF);
|
||||||
|
```
|
||||||
|
|
||||||
|
Or
|
||||||
|
|
||||||
|
```sql
|
||||||
|
CREATE TABLE comjadudresearch PARTITION OF com
|
||||||
|
FOR VALUES FROM (0x0100000101000000) TO (0x0100000101FFFFFF);
|
||||||
|
```
|
||||||
|
|
||||||
|
## As Jsonnet/JSON
|
||||||
|
|
||||||
|
Jsonnet will naturally sort by the hex key values.
|
||||||
|
|
||||||
|
```
|
||||||
|
{
|
||||||
|
"01": {
|
||||||
|
"name": "com",
|
||||||
|
"children": {
|
||||||
|
"000001": {
|
||||||
|
"name": "jadud",
|
||||||
|
"children": {
|
||||||
|
"01": "research",
|
||||||
|
"02": "teaching",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"02": {
|
||||||
|
"name": "org",
|
||||||
|
"children": {
|
||||||
|
...
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
@@ -1,5 +1,32 @@
|
|||||||
package domain64
|
package domain64
|
||||||
|
|
||||||
func ReturnsTrue() bool {
|
// Domain64 is the decomposed form of a domain64 value: four fields that
// AsInt64 packs, most-significant first, into one 64-bit integer laid out
// as FF:FFFFFF:FF:FFFFFF (tld:domain:subdomain:path).
type Domain64 struct {
	// TLD is 8 bits wide (FF).
	TLD uint8
	// Domain is 24 bits wide (FFFFFF). Go has no 24-bit integer type, so it
	// is held in a uint32; only the low 24 bits are encoded by AsInt64.
	Domain uint32
	// Subdomain is 8 bits wide (FF).
	Subdomain uint8
	// Path is 24 bits wide (FFFFFF). As with Domain, it is held in a uint32
	// and only the low 24 bits are encoded by AsInt64.
	Path uint32
}

/*
```mermaid
packet-beta
0-7: "FF | TLD"
8-31: "FFFFFF | Domain"
32-39: "FF | Subdomain"
40-63: "FFFFFF | Path"
```
*/

// AsInt64 packs the four fields into a single int64 using the layout shown
// in the diagram above: TLD in the top byte, then 24 bits of Domain, 8 bits
// of Subdomain, and 24 bits of Path in the lowest bits.
//
// Domain and Path are masked to 24 bits before shifting so that an oversized
// value cannot spill into the neighbouring field. Because the return type is
// signed, a TLD of 0x80 or above sets the sign bit and yields a negative
// number.
func (d64 *Domain64) AsInt64() int64 {
	const (
		// Left-shift distances, derived from the field widths 8/24/8/24.
		tldShift       = 64 - 8
		domainShift    = 64 - (8 + 24)
		subdomainShift = 64 - (8 + 24 + 8)
		pathShift      = 64 - (8 + 24 + 8 + 24)

		// mask24 keeps the low 24 bits of a uint32-backed field.
		mask24 = 0xFFFFFF
	)

	var result int64
	result |= int64(d64.TLD) << tldShift
	result |= int64(d64.Domain&mask24) << domainShift
	result |= int64(d64.Subdomain) << subdomainShift
	result |= int64(d64.Path&mask24) << pathShift
	return result
}
|
||||||
|
|
||||||
|
// https://gobyexample.com/testing-and-benchmarking
|
||||||
|
|||||||
Reference in New Issue
Block a user