1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
use std::{
    collections::HashMap,
    fmt::{Debug, Display},
};

use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};

use super::{btree_entry::BTreeEntry, read_chunk, BinarySerialization, PagedWriter};
use crate::{
    chunk_cache::CacheEntry,
    error::Error,
    io::File,
    tree::{btree_entry::NodeInclusion, key_entry::ValueIndex},
    vault::AnyVault,
    AbortError, ArcBytes, ChunkCache, ErrorKind,
};

/// An interior B-Tree node. Does not contain values directly, and instead
/// points to a node located on-disk elsewhere.
#[derive(Clone, Debug)]
pub struct Interior<Index, ReducedIndex> {
    /// The key with the highest sort value within the pointed-at node.
    pub key: ArcBytes<'static>,
    /// The location of the node: either an on-disk position or an
    /// already-loaded in-memory entry.
    pub position: Pointer<Index, ReducedIndex>,
    /// The reduced statistics for the pointed-at node, produced by a
    /// `Reducer` when the entry is created (see `Interior::new`).
    pub stats: ReducedIndex,
}

/// A pointer to a location on-disk. May also contain the node already loaded.
#[derive(Clone, Debug)]
pub enum Pointer<Index, ReducedIndex> {
    /// The position on-disk of the node. The node is not currently resident
    /// in memory.
    OnDisk(u64),
    /// An in-memory node that may have previously been saved on-disk.
    Loaded {
        /// The position on-disk of the node, if it was previously saved.
        /// `None` means this node has never been written to disk.
        previous_location: Option<u64>,
        /// The loaded B-Tree entry.
        entry: Box<BTreeEntry<Index, ReducedIndex>>,
    },
}

impl<
        Index: BinarySerialization + Debug + Clone + 'static,
        ReducedIndex: BinarySerialization + Debug + Clone + 'static,
    > Pointer<Index, ReducedIndex>
{
    /// Attempts to load the node from disk. If the node is already loaded, this
    /// function does nothing.
    ///
    /// `validate_crc` is forwarded to [`read_chunk`], and `current_order` is
    /// passed through to the entry's deserialization.
    #[allow(clippy::missing_panics_doc)] // The only panic is a cache type mismatch, which shouldn't happen due to these nodes always being accessed through a root of the matching type.
    pub fn load(
        &mut self,
        file: &mut dyn File,
        validate_crc: bool,
        vault: Option<&dyn AnyVault>,
        cache: Option<&ChunkCache>,
        current_order: Option<usize>,
    ) -> Result<(), Error> {
        match self {
            Pointer::OnDisk(position) => {
                let entry = match read_chunk(*position, validate_crc, file, vault, cache)? {
                    CacheEntry::ArcBytes(mut buffer) => {
                        // It's worthless to store this node in the cache
                        // because if we mutate, we'll be rewritten.
                        Box::new(BTreeEntry::deserialize_from(&mut buffer, current_order)?)
                    }
                    CacheEntry::Decoded(node) => node
                        .as_ref()
                        .as_any()
                        .downcast_ref::<Box<BTreeEntry<Index, ReducedIndex>>>()
                        .expect("cached node was not the expected BTreeEntry type")
                        .clone(),
                };
                *self = Self::Loaded {
                    entry,
                    previous_location: Some(*position),
                };
            }
            Pointer::Loaded { .. } => {}
        }
        Ok(())
    }

    /// Returns the previously-[`load()`ed](Self::load) entry, or `None` if the
    /// node is not currently resident in memory.
    // Takes `&self`: this accessor only hands out a shared reference, so there
    // is no reason to require exclusive access (callers holding `&mut self`
    // can still call it).
    #[must_use]
    pub fn get(&self) -> Option<&BTreeEntry<Index, ReducedIndex>> {
        match self {
            Pointer::OnDisk(_) => None,
            Pointer::Loaded { entry, .. } => Some(entry),
        }
    }

    /// Returns the previously-[`load()`ed](Self::load) entry as a mutable reference.
    pub fn get_mut(&mut self) -> Option<&mut BTreeEntry<Index, ReducedIndex>> {
        match self {
            Pointer::OnDisk(_) => None,
            Pointer::Loaded { entry, .. } => Some(entry.as_mut()),
        }
    }

    /// Returns the position on-disk of the node being pointed at, if the node
    /// has been saved before.
    #[must_use]
    pub fn position(&self) -> Option<u64> {
        match self {
            Pointer::OnDisk(location) => Some(*location),
            Pointer::Loaded {
                previous_location, ..
            } => *previous_location,
        }
    }

    /// Loads the pointed-at node, if necessary, and invokes `callback` with the
    /// loaded node. This is useful in situations where the node isn't needed to
    /// be accessed mutably.
    ///
    /// When the chunk had to be decoded from raw bytes, the decoded node is
    /// stored into `cache` after the callback runs so future reads skip
    /// deserialization.
    #[allow(clippy::missing_panics_doc)] // Same cache type-mismatch invariant as `load`.
    pub fn map_loaded_entry<
        Output,
        CallerError: Display + Debug,
        Cb: FnOnce(
            &BTreeEntry<Index, ReducedIndex>,
            &mut dyn File,
        ) -> Result<Output, AbortError<CallerError>>,
    >(
        &self,
        file: &mut dyn File,
        vault: Option<&dyn AnyVault>,
        cache: Option<&ChunkCache>,
        current_order: Option<usize>,
        callback: Cb,
    ) -> Result<Output, AbortError<CallerError>> {
        match self {
            Pointer::OnDisk(position) => match read_chunk(*position, false, file, vault, cache)? {
                CacheEntry::ArcBytes(mut buffer) => {
                    let decoded = BTreeEntry::deserialize_from(&mut buffer, current_order)?;

                    let result = callback(&decoded, file);
                    // Keep the decoded form around for the next reader.
                    if let (Some(cache), Some(file_id)) = (cache, file.id()) {
                        cache.replace_with_decoded(file_id, *position, Box::new(decoded));
                    }
                    result
                }
                CacheEntry::Decoded(value) => {
                    let entry = value
                        .as_ref()
                        .as_any()
                        .downcast_ref::<Box<BTreeEntry<Index, ReducedIndex>>>()
                        .expect("cached node was not the expected BTreeEntry type");
                    callback(entry, file)
                }
            },
            Pointer::Loaded { entry, .. } => callback(entry, file),
        }
    }
}

impl<
        Index: Clone + ValueIndex + BinarySerialization + Debug + 'static,
        ReducedIndex: Clone + BinarySerialization + Debug + 'static,
    > Interior<Index, ReducedIndex>
{
    /// Creates an interior entry wrapping `entry`, keyed by the entry's
    /// maximum key and carrying statistics reduced via `reducer`.
    pub fn new<Reducer: super::Reducer<Index, ReducedIndex>>(
        entry: BTreeEntry<Index, ReducedIndex>,
        reducer: &Reducer,
    ) -> Self {
        let stats = entry.stats(reducer);
        let key = entry.max_key().clone();

        Self {
            key,
            stats,
            position: Pointer::Loaded {
                previous_location: None,
                entry: Box::new(entry),
            },
        }
    }

    /// Recursively copies this node's data into `writer` during compaction,
    /// returning `true` if any data was copied.
    #[allow(clippy::too_many_arguments)]
    pub(crate) fn copy_data_to<Callback>(
        &mut self,
        include_nodes: NodeInclusion,
        file: &mut dyn File,
        copied_chunks: &mut HashMap<u64, u64>,
        writer: &mut PagedWriter<'_>,
        vault: Option<&dyn AnyVault>,
        scratch: &mut Vec<u8>,
        index_callback: &mut Callback,
    ) -> Result<bool, Error>
    where
        Callback: FnMut(
            &ArcBytes<'static>,
            &mut Index,
            &mut dyn File,
            &mut HashMap<u64, u64>,
            &mut PagedWriter<'_>,
            Option<&dyn AnyVault>,
        ) -> Result<bool, Error>,
    {
        // Bring the child into memory so its data can be walked.
        self.position.load(file, true, vault, None, None)?;
        let child = self.position.get_mut().unwrap();
        let mut copied_any = child.copy_data_to(
            include_nodes,
            file,
            copied_chunks,
            writer,
            vault,
            scratch,
            index_callback,
        )?;

        // Re-serialize the child when this level should be included in the
        // copy; otherwise keep whatever location it already had.
        let new_location = if include_nodes.should_include() {
            copied_any = true;
            scratch.clear();
            child.serialize_to(scratch, writer)?;
            Some(writer.write_chunk(scratch)?)
        } else {
            self.position.position()
        };

        // Swap the loaded child back out for its on-disk pointer to keep RAM
        // usage low while compaction continues.
        if let Some(location) = new_location {
            self.position = Pointer::OnDisk(location);
        }

        Ok(copied_any)
    }
}

impl<
        Index: Clone + BinarySerialization + Debug + 'static,
        ReducedIndex: Clone + BinarySerialization + Debug + 'static,
    > BinarySerialization for Interior<Index, ReducedIndex>
{
    /// Serializes this entry as: 2-byte big-endian key length, the key bytes,
    /// the 8-byte big-endian on-disk position of the child node, then the
    /// serialized `stats`. A loaded, dirty (or never-saved) child is written
    /// out as its own chunk via `paged_writer` first.
    fn serialize_to(
        &mut self,
        writer: &mut Vec<u8>,
        paged_writer: &mut PagedWriter<'_>,
    ) -> Result<usize, Error> {
        // Take ownership of the pointer so a loaded entry can be consumed
        // below. NOTE(review): if a `?` inside this match propagates an error,
        // `self.position` is left as the placeholder `OnDisk(0)` — presumably
        // callers abandon the tree on serialization failure; confirm before
        // relying on the pointer's state after an error.
        let mut pointer = Pointer::OnDisk(0);
        std::mem::swap(&mut pointer, &mut self.position);
        let location_on_disk = match pointer {
            Pointer::OnDisk(position) => position,
            Pointer::Loaded {
                mut entry,
                previous_location,
            } => match (entry.dirty, previous_location) {
                // Serialize if dirty, or if this node hasn't been on-disk before.
                (true, _) | (_, None) => {
                    entry.dirty = false;
                    // Serialize the child into the tail of `writer`, write it
                    // out as a chunk, then truncate that scratch region so
                    // `writer` only accumulates this entry's own bytes.
                    let old_writer_length = writer.len();
                    entry.serialize_to(writer, paged_writer)?;
                    let position =
                        paged_writer.write_chunk(&writer[old_writer_length..writer.len()])?;
                    writer.truncate(old_writer_length);
                    // Hand the freshly-written node to the cache as a decoded
                    // entry so future reads skip deserialization.
                    if let (Some(cache), Some(file_id)) = (paged_writer.cache, paged_writer.id()) {
                        cache.replace_with_decoded(file_id, position, entry);
                    }
                    position
                }
                // Clean and previously saved: reuse the existing location.
                (false, Some(position)) => position,
            },
        };
        // From here on this entry points at the child's on-disk location only.
        self.position = Pointer::OnDisk(location_on_disk);
        let mut bytes_written = 0;
        // Write the key
        let key_len = u16::try_from(self.key.len()).map_err(|_| ErrorKind::KeyTooLarge)?;
        writer.write_u16::<BigEndian>(key_len)?;
        writer.extend_from_slice(&self.key);
        bytes_written += 2 + key_len as usize;

        writer.write_u64::<BigEndian>(location_on_disk)?;
        bytes_written += 8;

        bytes_written += self.stats.serialize_to(writer, paged_writer)?;

        Ok(bytes_written)
    }

    /// Inverse of `serialize_to`: reads key length, key, child position, and
    /// stats. The child is returned as `Pointer::OnDisk` and is not loaded.
    fn deserialize_from(
        reader: &mut ArcBytes<'_>,
        current_order: Option<usize>,
    ) -> Result<Self, Error> {
        let key_len = reader.read_u16::<BigEndian>()? as usize;
        // Guard against a corrupt length prefix claiming more bytes than remain.
        if key_len > reader.len() {
            return Err(Error::data_integrity(format!(
                "key length {} found but only {} bytes remaining",
                key_len,
                reader.len()
            )));
        }
        let key = reader.read_bytes(key_len)?.into_owned();

        let position = reader.read_u64::<BigEndian>()?;
        let stats = ReducedIndex::deserialize_from(reader, current_order)?;

        Ok(Self {
            key,
            position: Pointer::OnDisk(position),
            stats,
        })
    }
}