Wrote initial batch import.

This commit is contained in:
Jay
2025-02-16 16:56:23 -05:00
commit 1bb73d7712
8 changed files with 959 additions and 0 deletions

381
lib/graph.go Normal file
View File

@@ -0,0 +1,381 @@
// This module defines types and functions for working with Neo4j graph
// entities.
package lib
import (
"fmt"
"sort"
"strings"
)
// ========================================
// Types
// ========================================
// Properties represents a map of node or relationship props.
type Properties map[string]any
// ========================================
// Match Key Provider
// ========================================
// MatchKeysProvider defines methods for querying a mapping of node labels and
// the property keys used to match nodes with them.
type MatchKeysProvider interface {
// GetLabels returns the array of node labels in the mapping.
GetLabels() []string
// GetKeys returns the node property keys used to match nodes with the
// given label and a boolean indicating the success of the lookup.
GetKeys(label string) ([]string, bool)
}
// MatchKeys is a simple implementation of the MatchKeysProvider interface.
type MatchKeys struct {
keys map[string][]string
}
func (p *MatchKeys) GetLabels() []string {
labels := []string{}
for l := range p.keys {
labels = append(labels, l)
}
return labels
}
func (p *MatchKeys) GetKeys(label string) ([]string, bool) {
if keys, exists := p.keys[label]; exists {
return keys, exists
} else {
return nil, exists
}
}
// ========================================
// Nodes
// ========================================
// Node represents a Neo4j node entity, encapsulating its labels and
// properties.
type Node struct {
// Set of labels on the node.
Labels Set[string]
// Mapping of properties on the node.
Props Properties
}
// NewNode creates a new node with the given label and properties.
func NewNode(label string, props Properties) *Node {
if props == nil {
props = make(Properties)
}
return &Node{
Labels: NewSet(label),
Props: props,
}
}
// MatchProps returns the node label and the property values to match it in the
// database.
func (n *Node) MatchProps(
matchProvider MatchKeysProvider) (string, Properties, error) {
// Iterate over each label on the node, checking whether each has match
// keys associated with it.
labels := n.Labels.ToArray()
sort.Strings(labels)
for _, label := range labels {
if keys, exists := matchProvider.GetKeys(label); exists {
props := make(Properties)
// Get the property values associated with each match key.
for _, key := range keys {
if value, exists := n.Props[key]; exists {
props[key] = value
} else {
// If any match property values are missing, return an
// error.
return label, nil,
fmt.Errorf(
"missing property %s for label %s", key, label)
}
}
// Return the label and match properties
return label, props, nil
}
}
// If none of the node labels have defined match keys, return an error.
return "", nil, fmt.Errorf("no recognized label found in %v", n.Labels)
}
type SerializedNode = Properties
func (n *Node) Serialize() *SerializedNode {
return &n.Props
}
// ========================================
// Relationships
// ========================================
// Relationship represents a Neo4j relationship between two nodes, including
// its type and properties.
type Relationship struct {
// The relationship type.
Type string
// The start node for the relationship.
Start *Node
// The end node for the relationship.
End *Node
// Mapping of properties on the relationship
Props Properties
}
// NewRelationship creates a new relationship with the given type, start node,
// end node, and properties
func NewRelationship(
rtype string, start *Node, end *Node, props Properties) *Relationship {
if props == nil {
props = make(Properties)
}
return &Relationship{
Type: rtype,
Start: start,
End: end,
Props: props,
}
}
type SerializedRel = map[string]Properties
func (r *Relationship) Serialize() *SerializedRel {
srel := make(map[string]Properties)
srel["props"] = r.Props
srel["start"] = r.Start.Props
srel["end"] = r.End.Props
return &srel
}
// ========================================
// Simple Subgraph
// ========================================
// Subgraph represents a simple collection of nodes and relationships.
type Subgraph struct {
// The nodes in the subgraph.
nodes []*Node
// The relationships in the subgraph.
rels []*Relationship
}
// NewSubgraph creates an empty subgraph.
func NewSubgraph() *Subgraph {
return &Subgraph{
nodes: []*Node{},
rels: []*Relationship{},
}
}
// AddNode adds a node to the subgraph
func (s *Subgraph) AddNode(node *Node) {
s.nodes = append(s.nodes, node)
}
// AddRel adds a relationship to the subgraph.
func (s *Subgraph) AddRel(rel *Relationship) {
s.rels = append(s.rels, rel)
}
// ========================================
// Structured Subgraph
// ========================================
// StructuredSubgraph is a structured collection of nodes and relationships for
// the purpose of conducting batch operations.
type StructuredSubgraph struct {
// A map of grouped nodes, sorted by their label combinations.
nodes map[string][]*Node
// A map of grouped relationships, sorted by their type and related node
// labels.
rels map[string][]*Relationship
// Provides node property keys used to match nodes with given labels in the
// database.
matchProvider MatchKeysProvider
}
// NewStructuredSubgraph creates an empty structured subgraph with the given
// match keys provider.
func NewStructuredSubgraph(matchProvider MatchKeysProvider) *StructuredSubgraph {
return &StructuredSubgraph{
nodes: make(map[string][]*Node),
rels: make(map[string][]*Relationship),
matchProvider: matchProvider,
}
}
// AddNode sorts a node into the subgraph.
func (s *StructuredSubgraph) AddNode(node *Node) {
// Verify that the node has defined match property values.
matchLabel, _, err := node.MatchProps(s.matchProvider)
if err != nil {
panic(fmt.Errorf("invalid node: %s", err))
}
// Determine the node's sort key.
sortKey := createNodeSortKey(matchLabel, node.Labels.ToArray())
if _, exists := s.nodes[sortKey]; !exists {
s.nodes[sortKey] = []*Node{}
}
// Add the node to the subgraph.
s.nodes[sortKey] = append(s.nodes[sortKey], node)
}
// AddRel sorts a relationship into the subgraph.
func (s *StructuredSubgraph) AddRel(rel *Relationship) {
// Verify that the start node has defined match property values.
startLabel, _, err := rel.Start.MatchProps(s.matchProvider)
if err != nil {
panic(fmt.Errorf("invalid start node: %s", err))
}
// Verify that the end node has defined match property values.
endLabel, _, err := rel.End.MatchProps(s.matchProvider)
if err != nil {
panic(fmt.Errorf("invalid end node: %s", err))
}
// Determine the relationship's sort key.
sortKey := createRelSortKey(rel.Type, startLabel, endLabel)
if _, exists := s.rels[sortKey]; !exists {
s.rels[sortKey] = []*Relationship{}
}
// Add the relationship to the subgraph.
s.rels[sortKey] = append(s.rels[sortKey], rel)
}
// GetNodes returns the nodes grouped under the given sort key.
func (s *StructuredSubgraph) GetNodes(nodeKey string) []*Node {
return s.nodes[nodeKey]
}
// GetRels returns the rels grouped under the given sort key.
func (s *StructuredSubgraph) GetRels(relKey string) []*Relationship {
return s.rels[relKey]
}
// NodeCount returns the number of nodes in the subgraph.
func (s *StructuredSubgraph) NodeCount() int {
count := 0
for l := range s.nodes {
count += len(s.nodes[l])
}
return count
}
// RelCount returns the number of relationships in the subgraph.
func (s *StructuredSubgraph) RelCount() int {
count := 0
for t := range s.rels {
count += len(s.rels[t])
}
return count
}
// NodeKeys returns the list of node sort keys in the subgraph.
func (s *StructuredSubgraph) NodeKeys() []string {
keys := []string{}
for l := range s.nodes {
keys = append(keys, l)
}
return keys
}
// RelKeys returns the list of relationship sort keys in the subgraph.
func (s *StructuredSubgraph) RelKeys() []string {
keys := []string{}
for t := range s.rels {
keys = append(keys, t)
}
return keys
}
// createNodeSortKey returns the serialized node labels for sorting.
func createNodeSortKey(matchLabel string, labels []string) string {
sort.Strings(labels)
serializedLabels := strings.Join(labels, ",")
return fmt.Sprintf("%s:%s", matchLabel, serializedLabels)
}
// createRelSortKey returns the serialized relationship type and start/end node
// labels for sorting.
func createRelSortKey(
rtype string, startLabel string, endLabel string) string {
return strings.Join([]string{rtype, startLabel, endLabel}, ",")
}
// DeserializeNodeKey returns the list of node labels from the serialized sort
// key.
func DeserializeNodeKey(sortKey string) (string, []string) {
parts := strings.Split(sortKey, ":")
if len(parts) != 2 {
panic(fmt.Sprintf("invalid node sort key: %s", sortKey))
}
matchLabel, serializedLabels := parts[0], parts[1]
labels := strings.Split(serializedLabels, ",")
return matchLabel, labels
}
// DeserializeRelKey returns the relationship type, start node label, and end
// node label from the serialized sort key. Panics if the sort key is invalid.
func DeserializeRelKey(sortKey string) (string, string, string) {
parts := strings.Split(sortKey, ",")
if len(parts) != 3 {
panic(fmt.Sprintf("invalid relationship sort key: %s", sortKey))
}
rtype, startLabel, endLabel := parts[0], parts[1], parts[2]
return rtype, startLabel, endLabel
}
// ========================================
// Cypher Formatting Functions
// ========================================
// ToCypherLabel converts a node label or relationship type into its Cypher
// format.
func ToCypherLabel(label string) string {
return fmt.Sprintf(":`%s`", label)
}
// ToCypherLabels converts a list of node labels into its Cypher format.
func ToCypherLabels(labels []string) string {
var cypherLabels []string
for _, label := range labels {
cypherLabels = append(cypherLabels, ToCypherLabel(label))
}
return strings.Join(cypherLabels, "")
}
func ToCypherProps(keys []string, prefix string) string {
if prefix == "" {
prefix = "$"
}
cypherPropsParts := []string{}
for _, key := range keys {
cypherPropsParts = append(
cypherPropsParts, fmt.Sprintf("%s: %s%s", key, prefix, key))
}
return strings.Join(cypherPropsParts, ", ")
}

336
lib/import.go Normal file
View File

@@ -0,0 +1,336 @@
package lib
import (
"context"
"encoding/json"
"fmt"
"log"
"os"
"strings"
"sync"
"github.com/nbd-wtf/go-nostr"
"github.com/neo4j/neo4j-go-driver/v5/neo4j"
)
// Workers
func ImportEvents() {
data, err := os.ReadFile("./export.json")
if err != nil {
panic(err)
}
events := make(chan nostr.Event)
var wg sync.WaitGroup
wg.Add(1)
go func() {
defer wg.Done()
ParseEvents(events)
}()
var event nostr.Event
for i, line := range strings.Split(string(data), "\n") {
if i > 10000 {
break
}
line = strings.TrimSpace(line)
if line == "" {
continue
}
event = nostr.Event{}
err = json.Unmarshal([]byte(line), &event)
if err != nil {
log.Println("Invalid event:", event)
}
events <- event
}
close(events)
wg.Wait()
}
func ParseEvents(events chan nostr.Event) {
subgraphChannel := make(chan Subgraph)
var wg sync.WaitGroup
wg.Add(1)
go func() {
defer wg.Done()
MergeEntities(subgraphChannel)
}()
for event := range events {
// fmt.Println(event.ID)
subgraph := *NewSubgraph()
// Create User and Event nodes
userNode := NewUserNode(event.PubKey)
eventNode := NewEventNode(event.ID)
eventNode.Props["created_at"] = event.CreatedAt.Time().Unix()
eventNode.Props["kind"] = event.Kind
eventNode.Props["content"] = event.Content
authorRel := NewSignedRel(userNode, eventNode, nil)
subgraph.AddNode(userNode)
subgraph.AddNode(eventNode)
subgraph.AddRel(authorRel)
// Create Tag nodes
for _, tag := range event.Tags {
if len(tag) >= 2 {
name := tag[0]
value := tag[1]
// Special cases
tagNode := NewTagNode(name, value)
tagRel := NewTaggedRel(eventNode, tagNode, nil)
subgraph.AddNode(tagNode)
subgraph.AddRel(tagRel)
}
}
subgraphChannel <- subgraph
}
close(subgraphChannel)
wg.Wait()
}
func MergeEntities(subgraphChannel chan Subgraph) {
ctx := context.Background()
driver, err := connectNeo4j(ctx)
if err != nil {
panic(err)
}
defer driver.Close(ctx)
batchSize := 25000
matchProvider := NewMatchKeys()
subgraph := NewStructuredSubgraph(matchProvider)
for sg := range subgraphChannel {
for _, node := range sg.nodes {
subgraph.AddNode(node)
}
for _, rel := range sg.rels {
subgraph.AddRel(rel)
}
if subgraph.NodeCount() > batchSize {
mergeSubgraph(ctx, driver, subgraph)
subgraph = NewStructuredSubgraph(matchProvider)
}
}
mergeSubgraph(ctx, driver, subgraph)
}
// Helper Functions
func connectNeo4j(ctx context.Context) (neo4j.DriverWithContext, error) {
dbUri := "neo4j://localhost:7687"
dbUser := "neo4j"
dbPassword := "neo4jnostr"
driver, err := neo4j.NewDriverWithContext(
dbUri,
neo4j.BasicAuth(dbUser, dbPassword, ""))
err = driver.VerifyConnectivity(ctx)
if err != nil {
return driver, err
}
indexQueries := []string{
`CREATE CONSTRAINT user_pubkey IF NOT EXISTS
FOR (n:User) REQUIRE n.pubkey IS UNIQUE`,
`CREATE INDEX user_pubkey IF NOT EXISTS
FOR (n:User) ON (n.pubkey)`,
`CREATE INDEX event_id IF NOT EXISTS
FOR (n:Event) ON (n.id)`,
`CREATE INDEX event_kind IF NOT EXISTS
FOR (n:Event) ON (n.kind)`,
`CREATE INDEX tag_name_value IF NOT EXISTS
FOR (n:Tag) ON (n.name, n.value)`,
}
// Create indexes/constraints
for _, query := range indexQueries {
_, err = neo4j.ExecuteQuery(ctx, driver,
query,
nil,
neo4j.EagerResultTransformer,
neo4j.ExecuteQueryWithDatabase("neo4j"))
if err != nil {
panic(err)
}
}
return driver, nil
}
func mergeSubgraph(
ctx context.Context,
driver neo4j.DriverWithContext,
subgraph *StructuredSubgraph,
) {
// fmt.Println("Got node keys:", subgraph.NodeKeys())
// fmt.Println("Got rel keys:", subgraph.RelKeys())
// fmt.Println("Node count:", subgraph.NodeCount())
// fmt.Println("Rel count:", subgraph.RelCount())
for _, nodeKey := range subgraph.NodeKeys() {
matchLabel, labels := DeserializeNodeKey(nodeKey)
mergeNodes(
ctx, driver,
matchLabel,
labels,
subgraph.matchProvider,
subgraph.GetNodes(nodeKey),
)
}
for _, relKey := range subgraph.RelKeys() {
rtype, startLabel, endLabel := DeserializeRelKey(relKey)
mergeRels(
ctx, driver,
rtype,
startLabel,
endLabel,
subgraph.matchProvider,
subgraph.GetRels(relKey),
)
}
}
func mergeNodes(
ctx context.Context,
driver neo4j.DriverWithContext,
matchLabel string,
nodeLabels []string,
matchProvider MatchKeysProvider,
nodes []*Node,
) {
cypherLabels := ToCypherLabels(nodeLabels)
matchKeys, exists := matchProvider.GetKeys(matchLabel)
if !exists {
panic(fmt.Errorf("unknown match label: %s", matchLabel))
}
cypherProps := ToCypherProps(matchKeys, "node.")
serializedNodes := []*SerializedNode{}
for _, node := range nodes {
serializedNodes = append(serializedNodes, node.Serialize())
}
query := fmt.Sprintf(`
UNWIND $nodes as node
MERGE (n%s { %s })
SET n += node
`,
cypherLabels, cypherProps,
)
// fmt.Println("First node:", *serializedNodes[0])
// fmt.Printf("Generated query:\n```\n%s\n```\n", query)
result, err := neo4j.ExecuteQuery(ctx, driver,
query,
map[string]any{
"nodes": serializedNodes,
}, neo4j.EagerResultTransformer,
neo4j.ExecuteQueryWithDatabase("neo4j"))
if err != nil {
panic(err)
}
summary := result.Summary
fmt.Printf("Created %v nodes in %+v.\n",
summary.Counters().NodesCreated(),
summary.ResultAvailableAfter())
}
func mergeRels(
ctx context.Context,
driver neo4j.DriverWithContext,
rtype string,
startLabel string,
endLabel string,
matchProvider MatchKeysProvider,
rels []*Relationship,
) {
cypherType := ToCypherLabel(rtype)
startCypherLabel := ToCypherLabel(startLabel)
endCypherLabel := ToCypherLabel(endLabel)
matchKeys, exists := matchProvider.GetKeys(startLabel)
if !exists {
panic(fmt.Errorf("unknown start node label: %s", startLabel))
}
startCypherProps := ToCypherProps(matchKeys, "rel.start.")
matchKeys, exists = matchProvider.GetKeys(endLabel)
if !exists {
panic(fmt.Errorf("unknown end node label: %s", endLabel))
}
endCypherProps := ToCypherProps(matchKeys, "rel.end.")
serializedRels := []*SerializedRel{}
for _, rel := range rels {
serializedRels = append(serializedRels, rel.Serialize())
}
query := fmt.Sprintf(`
UNWIND $rels as rel
MATCH (start%s { %s })
MATCH (end%s { %s })
CREATE (start)-[r%s]->(end)
SET r += rel.props
`,
startCypherLabel, startCypherProps,
endCypherLabel, endCypherProps,
cypherType,
)
// fmt.Println("First rel:", *serializedRels[0])
// fmt.Printf("Generated query:\n```\n%s\n```\n", query)
result, err := neo4j.ExecuteQuery(ctx, driver,
query,
map[string]any{
"rels": serializedRels,
}, neo4j.EagerResultTransformer,
neo4j.ExecuteQueryWithDatabase("neo4j"))
if err != nil {
panic(err)
}
summary := result.Summary
fmt.Printf("Created %v relationships in %+v.\n",
summary.Counters().RelationshipsCreated(),
summary.ResultAvailableAfter())
}

88
lib/schema.go Normal file
View File

@@ -0,0 +1,88 @@
// This module provides methods for creating nodes and relationships according
// to a defined schema.
package lib
import (
"fmt"
)
// ========================================
// Schema Match Keys
// ========================================
func NewMatchKeys() *MatchKeys {
return &MatchKeys{
keys: map[string][]string{
"User": {"pubkey"},
"Relay": {"url"},
"Event": {"id"},
"Tag": {"name", "value"},
},
}
}
// ========================================
// Node Constructors
// ========================================
func NewUserNode(pubkey string) *Node {
return NewNode("User", Properties{"pubkey": pubkey})
}
func NewRelayNode(url string) *Node {
return NewNode("Relay", Properties{"url": url})
}
func NewEventNode(id string) *Node {
return NewNode("Event", Properties{"id": id})
}
func NewTagNode(name string, value string) *Node {
return NewNode("Tag", Properties{"name": name, "value": value})
}
// ========================================
// Relationship Constructors
// ========================================
func NewSignedRel(
start *Node, end *Node, props Properties) *Relationship {
return NewRelationshipWithValidation(
"SIGNED", "User", "Event", start, end, props)
}
func NewTaggedRel(
start *Node, end *Node, props Properties) *Relationship {
return NewRelationshipWithValidation(
"TAGGED", "Event", "Tag", start, end, props)
}
// ========================================
// Relationship Constructor Helpers
// ========================================
func validateNodeLabel(node *Node, role string, expectedLabel string) {
if !node.Labels.Contains(expectedLabel) {
panic(fmt.Errorf(
"expected %s node to have label '%s'. got %v",
role, expectedLabel, node.Labels.ToArray(),
),
)
}
}
func NewRelationshipWithValidation(
rtype string,
startLabel string,
endLabel string,
start *Node,
end *Node,
props Properties) *Relationship {
validateNodeLabel(start, "start", startLabel)
validateNodeLabel(end, "end", endLabel)
return NewRelationship(rtype, start, end, props)
}

50
lib/util.go Normal file
View File

@@ -0,0 +1,50 @@
package lib
// Sets
type Set[T comparable] struct {
inner map[T]struct{}
}
func NewSet[T comparable](items ...T) Set[T] {
set := Set[T]{
inner: make(map[T]struct{}),
}
for _, i := range items {
set.Add(i)
}
return set
}
func (s Set[T]) Add(item T) {
s.inner[item] = struct{}{}
}
func (s Set[T]) Remove(item T) {
delete(s.inner, item)
}
func (s Set[T]) Contains(item T) bool {
_, exists := s.inner[item]
return exists
}
func (s Set[T]) ToArray() []T {
array := []T{}
for i := range s.inner {
array = append(array, i)
}
return array
}
// Operations
func Flatten[K comparable, V comparable](mapping map[K][]V) []V {
var values []V
for _, array := range mapping {
for _, v := range array {
values = append(values, v)
}
}
return values
}