Integrations / Platforms / WordPress / Splitting Large Records

Algolia has a limit of 10 KB per record for performance and relevance reasons. Instead of indexing large pieces of text into a single record, you can split them into multiple records and use the distinct feature to deduplicate results at query time.

Building an HTML splitter

The following HTML splitter creates one record per <h2> heading and includes as many paragraphs as possible. You create a new record every time there’s a new <h2>, or when you’ve reached the content limit.

It also stores every <h3> heading into an array. This is helpful for relevance, as you can set headings before content in searchableAttributes to give them more weight.

You can add all splitters into a splitters.php file, and require it from your main plugin file.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
namespace Algolia;

use DOMDocument;

class HtmlSplitter
{
    /** Tag that starts a new piece. */
    protected $level1 = 'h2';

    /** Tag whose text is collected into the `subtitle-2` list of the current piece. */
    protected $level2 = 'h3';

    /** Maximum number of characters of content accumulated before a piece is closed. */
    protected $contentLimit = 1000;

    /**
     * Splits the rendered content of a post into smaller pieces.
     *
     * A new piece starts at every `$level1` heading, or as soon as the content
     * accumulated in the current piece exceeds `$contentLimit` characters.
     * `$level2` headings are collected in the `subtitle-2` list of the piece.
     *
     * @param  \WP_Post $post Post whose `post_content` is split.
     *
     * @return array List of pieces, each with the keys `subtitle` (string),
     *               `subtitle-2` (string[]) and `content` (string). Empty when
     *               the post has no content.
     */
    public function split(\WP_Post $post) {
        $html = $this->get_sanitized_content($post);

        if (trim($html) === '') {
            return [];
        }

        $dom = new DOMDocument();

        // Collect parser warnings (e.g. unknown HTML5 tags) instead of emitting them,
        // then restore whatever error mode the caller had.
        $previousErrorMode = libxml_use_internal_errors(true);
        // Declare the charset explicitly: without it libxml does not read the
        // markup as UTF-8 and multi-byte characters get mangled.
        $dom->loadHTML(
            '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"></head><body>'
            .$html
            .'</body></html>'
        );
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);

        $body = $dom->getElementsByTagName('body')->item(0);

        if ($body === null) {
            return [];
        }

        $split = [];
        $current = $this->new_piece();

        foreach ($body->childNodes as $node) {
            if ($node instanceof \DOMElement) {
                $tag = $node->tagName;
                $value = $this->get_node_content($node);
            } elseif ($node instanceof \DOMText) {
                // Bare text directly under <body> is treated as regular content.
                $tag = '';
                $value = trim((string) $node->nodeValue);
            } else {
                // Comments, processing instructions, etc. carry no searchable text.
                continue;
            }

            if ($tag === $this->level1) {
                // Close the running piece, unless nothing was collected yet.
                if (!$this->is_blank_piece($current)) {
                    $split[] = $current;
                }
                $current = $this->new_piece($value);
                continue;
            }

            if ($tag === $this->level2) {
                $current['subtitle-2'][] = $value;
                continue;
            }

            if (trim($value) === '') {
                continue;
            }

            $current['content'][] = $value;

            if ($this->isContentLargeEnough($current['content'])) {
                $split[] = $current;
                $current = $this->new_piece();
            }
        }

        // Keep whatever was collected after the last heading / size cut.
        if (!$this->is_blank_piece($current)) {
            $split[] = $current;
        }

        foreach ($split as $key => $piece) {
            $split[$key]['content'] = implode("\n\n", $piece['content']);
        }

        return $split;
    }

    /**
     * Builds an empty piece, optionally titled.
     *
     * @param  string $subtitle
     *
     * @return array
     */
    private function new_piece($subtitle = '') {
        return [
            'subtitle' => $subtitle,
            'subtitle-2' => [],
            'content' => [],
        ];
    }

    /**
     * Tells whether a piece holds no title, no sub-headings and no content.
     *
     * @param  array $piece
     *
     * @return bool
     */
    private function is_blank_piece(array $piece) {
        return $piece['subtitle'] === ''
            && empty($piece['subtitle-2'])
            && empty($piece['content']);
    }

    /**
     * Runs the `the_content` filter on the post content, then strips
     * <script> blocks and line breaks.
     *
     * @param  \WP_Post $post
     *
     * @return string
     */
    private function get_sanitized_content(\WP_Post $post) {
        $the_content = (string) apply_filters('the_content', $post->post_content);

        // Remove <script> tags. preg_replace() returns null on failure,
        // in which case the unmodified content is kept.
        $without_scripts = preg_replace('#<script(.*?)>(.*?)</script>#is', '', $the_content);
        if ($without_scripts !== null) {
            $the_content = $without_scripts;
        }

        // Collapse line breaks into a single space so that words on either
        // side of a line break are not glued together.
        $single_line = preg_replace('/\s*[\r\n]+\s*/', ' ', $the_content);
        if ($single_line !== null) {
            $the_content = $single_line;
        }

        return $the_content;
    }

    /**
     * Extracts the text of a node. Lists are rendered as one " - item" line per child.
     *
     * @param  \DOMElement $node
     *
     * @return string
     */
    private function get_node_content(\DOMElement $node) {
        if (in_array($node->tagName, ['ul', 'ol'], true)) {
            $items = [];
            foreach ($node->childNodes as $child) {
                // Skip whitespace-only text nodes sitting between list items.
                $text = trim((string) $child->nodeValue);
                if ($text !== '') {
                    $items[] = $text;
                }
            }

            if (empty($items)) {
                return '';
            }

            return ' - '.implode("\n - ", $items);
        }

        return $node->textContent;
    }

    /**
     * Tells whether the given content exceeds the configured character limit.
     *
     * @param  string|string[] $content
     *
     * @return bool
     */
    private function isContentLargeEnough($content) {
        if (is_array($content)) {
            $content = implode(' ', $content);
        }

        return mb_strlen($content, 'UTF-8') > $this->contentLimit;
    }
}

Splitting records

The following example shows how to edit the end of the algolia_post_to_record function in the theme’s functions.php file.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
/**
 * Converts a post into a list of Algolia records, one per content split.
 *
 * Every record shares the same `distinct_key` and common attributes, and
 * gets its own `objectID` built from the post type, post ID and split index.
 *
 * @param  WP_Post $post
 *
 * @return array List of records.
 */
function algolia_post_to_record(WP_Post $post) {
    // wp_get_post_terms() may return a WP_Error instead of a list of terms.
    $terms = wp_get_post_terms($post->ID, 'post_tag');
    $tags = is_array($terms)
        ? array_map(function (WP_Term $term) {
            return $term->name;
        }, $terms)
        : [];

    // get_user_by() returns false when the author no longer exists.
    $author = get_user_by( 'ID', $post->post_author );

    // Prepare all common attributes and add a new `distinct_key` property.
    // `content` is intentionally left out: each split provides its own.
    $common = [
        'distinct_key' => implode('#', [$post->post_type, $post->ID]),
        'title' => $post->post_title,
        'author' => [
            'id' => $post->post_author,
            'name' => $author ? $author->display_name : '',
        ],
        'excerpt' => $post->post_excerpt,
        'tags' => $tags,
        'url' => get_post_permalink($post->ID),
    ];

    // Split the records on the `post_content` attribute
    $splitter = new \Algolia\HtmlSplitter;
    $records = $splitter->split($post);

    // A post that produced no split is still indexed, as a single record.
    if (empty($records)) {
        $records = [
            ['content' => strip_tags($post->post_content)],
        ];
    }

    // Merge the common attributes into each split and add a unique `objectID`
    foreach ($records as $key => $split) {
        $records[$key] = array_merge($common, $split, [
            'objectID' => implode('-', [$post->post_type, $post->ID, $key]),
        ]);
    }

    return $records;
}
add_filter('post_to_record', 'algolia_post_to_record');

Make sure to set distinct_key as the attributeForDistinct in your Algolia settings so you can use it with distinct, and add it to attributesForFaceting so you can filter on it (for example, with deleteBy).

Note that instead of returning a single record, the function now returns a list of records. You need to update the reindex_post command to take this into account.

1
2
3
4
5
6
7
8
9
10
foreach ($posts->posts as $post) {
    // Guard the lookup: reading a missing `verbose` key would raise an
    // undefined index notice when the flag was not passed.
    if (!empty($assoc_args['verbose'])) {
        WP_CLI::line('Serializing ['.$post->post_title.']');
    }

    // The filter returns a list of records for each post.
    $split = apply_filters('post_to_record', $post);

    // Append this post's records to the flat list built across all posts.
    $records = array_merge($records, $split);
    $count++;
}

Automatic update for split records

If you’re indexing long posts and splitting them into multiple records, you need to tweak some things for the automatic update to work.

When updating a post, it could become shorter and take fewer records. This means you need to delete old records for a given post before indexing the new ones. You can delete all records for a given post by using the deleteBy method on the distinct_key attribute.

1
// Delete every record whose `distinct_key` matches the given value.
$algolia
    ->initIndex('index_name')
    ->deleteBy([
        'filters' => 'distinct_key:distinct_value',
    ]);

Your final algolia_update_post function, with record splitting support, should look like this:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
/**
 * Keeps the Algolia index in sync when a post is saved.
 *
 * Split posts (a list of records) are always deleted by `distinct_key`
 * first, then re-indexed unless the post was trashed. Unsplit posts are
 * saved or deleted by `objectID`.
 *
 * @param  int     $id     Post ID.
 * @param  WP_Post $post   Post being saved.
 * @param  bool    $update Unused here.
 *
 * @return WP_Post
 */
function algolia_update_post($id, WP_Post $post, $update) {
    if (wp_is_post_revision($id) || wp_is_post_autosave($id)) {
        return $post;
    }

    global $algolia;

    $record = (array) apply_filters($post->post_type.'_to_record', $post);

    $index = $algolia->initIndex(
        apply_filters('algolia_index_name', $post->post_type)
    );

    // Detect split records BEFORE touching `$record`: adding an `objectID`
    // key to a list of records would stop it from being an indexed array.
    $splitRecord = isSplitRecord($record);

    // WP_Post exposes the status as `post_status`.
    $isTrashed = 'trash' === $post->post_status;

    if ($splitRecord) {
        // `distinct_key` lives on each record, not on the list itself.
        $first = reset($record);
        $distinctKey = (is_array($first) && isset($first['distinct_key']))
            ? $first['distinct_key']
            : implode('#', [$post->post_type, $post->ID]);

        // If the post is split, we always delete its old records first:
        // the new version may produce fewer records than the previous one.
        $index->deleteBy(['filters' => 'distinct_key:'.$distinctKey]);

        // A trashed post stays deleted; otherwise index the fresh records.
        if (!$isTrashed) {
            $index->saveObjects($record);
        }

        return $post;
    }

    if (!isset($record['objectID'])) {
        $record['objectID'] = implode('#', [$post->post_type, $post->ID]);
    }

    if ($isTrashed) {
        $index->deleteObject($record['objectID']);
    } else {
        // A single record is saved with saveObject(); saveObjects() expects a list.
        $index->saveObject($record);
    }

    return $post;
}

add_action('save_post', 'algolia_update_post', 10, 3);

/**
 * Tells whether the given value is a list of split records.
 *
 * Split records must be a non-empty indexed array, i.e. an array whose keys
 * are exactly 0..n-1 in order. A single record (associative array) and an
 * empty array both return false.
 *
 * @param  mixed $arr
 *
 * @return bool
 */
function isSplitRecord($arr = []) {
    if (!is_array($arr) || empty($arr)) {
        return false;
    }

    return array_keys($arr) === range(0, count($arr) - 1);
}
Did you find this page helpful?