1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
use super::{Token, TokenFilter, TokenStream};

/// `RemoveLongFilter` removes tokens whose text is too long, the length
/// being measured in bytes (in UTF-8 representation).
///
/// Only tokens strictly shorter than the configured limit are kept:
/// a token whose length is exactly equal to the limit is removed as well.
///
/// It is especially useful when indexing unconstrained content,
/// e.g. mail containing base64-encoded pictures.
#[derive(Clone)]
pub struct RemoveLongFilter {
    /// Exclusive upper bound, in bytes, on the length of the tokens that are kept.
    length_limit: usize,
}

impl RemoveLongFilter {
    /// Creates a `RemoveLongFilter` given a limit in bytes of the UTF-8 representation.
    pub fn limit(length_limit: usize) -> RemoveLongFilter {
        RemoveLongFilter { length_limit }
    }
}

impl<TailTokenStream> RemoveLongFilterStream<TailTokenStream>
where
    TailTokenStream: TokenStream,
{
    /// Tells whether `token` is short enough to be kept.
    ///
    /// Returns `true` only when the length of the token text is strictly
    /// below `token_length_limit`; a token of exactly that length is rejected.
    fn predicate(&self, token: &Token) -> bool {
        let text_len = token.text.len();
        self.token_length_limit > text_len
    }

    /// Builds the filtering stream around `tail`, storing `token_length_limit`
    /// as the bound checked by `predicate`.
    fn wrap(token_length_limit: usize, tail: TailTokenStream) -> Self {
        Self {
            tail,
            token_length_limit,
        }
    }
}

impl<TailTokenStream> TokenFilter<TailTokenStream> for RemoveLongFilter
where
    TailTokenStream: TokenStream,
{
    /// Stream type obtained once the filter is applied to a `TailTokenStream`.
    type ResultTokenStream = RemoveLongFilterStream<TailTokenStream>;

    /// Wraps `token_stream` into a `RemoveLongFilterStream` configured with
    /// this filter's length limit.
    fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
        let token_length_limit = self.length_limit;
        RemoveLongFilterStream::wrap(token_length_limit, token_stream)
    }
}

/// Token stream wrapping another token stream (`tail`) and skipping the
/// tokens whose text length is not strictly below `token_length_limit`.
pub struct RemoveLongFilterStream<TailTokenStream>
where
    TailTokenStream: TokenStream,
{
    /// Exclusive upper bound on the text length of the tokens that are kept.
    token_length_limit: usize,
    /// Underlying stream the tokens are read from.
    tail: TailTokenStream,
}

impl<TailTokenStream> TokenStream for RemoveLongFilterStream<TailTokenStream>
where
    TailTokenStream: TokenStream,
{
    /// Moves the wrapped stream forward until it sits on a token accepted by
    /// `predicate`.
    ///
    /// Returns `true` when such a token was reached, and `false` as soon as
    /// the wrapped stream's own `advance` returns `false`.
    fn advance(&mut self) -> bool {
        while self.tail.advance() {
            if self.predicate(self.tail.token()) {
                return true;
            }
            // Token rejected by the predicate: skip it and keep advancing.
        }
        false
    }

    /// Gives read access to the current token of the wrapped stream.
    fn token(&self) -> &Token {
        self.tail.token()
    }

    /// Gives mutable access to the current token of the wrapped stream.
    fn token_mut(&mut self) -> &mut Token {
        self.tail.token_mut()
    }
}