html_strip 过滤的配置发现无效,问题是 ES 在使用 ik 分词器时并不会通过 html_strip 过滤掉 HTML 标签:
测试代码:
PUT l2qq
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_custom_analyzer": {
          "type": "custom",
          "tokenizer": "ik_smart",
          "char_filter": [ "html_strip" ],
          "filter": [ "lowercase", "asciifolding" ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "@timestamp": { "type": "date" },
      "@version": { "type": "keyword" },
      "allcontent": {
        "type": "text",
        "analyzer": "my_custom_analyzer",
        "fields": { "keyword": { "type": "keyword" } }
      },
      "content": {
        "type": "text",
        "analyzer": "my_custom_analyzer",
        "fields": { "keyword": { "type": "keyword" } }
      },
      "id": { "type": "long" },
      "last_index_time": { "type": "date" },
      "lasttimeat": { "type": "date" },
      "title": { "type": "keyword" }
    }
  }
}
代码中通过 settings 设置自定义 analyzer 来做 html_strip 处理并不生效,所以其实没必要这样配置,直接:
PUT /l2qq
{
  "mappings": {
    "properties": {
      "@timestamp": { "type": "date" },
      "@version": { "type": "keyword" },
      "allcontent": {
        "type": "text",
        "analyzer": "ik_smart",
        "fields": { "keyword": { "type": "keyword" } }
      },
      "content": {
        "type": "text",
        "analyzer": "ik_smart",
        "fields": { "keyword": { "type": "keyword" } }
      },
      "id": { "type": "long" },
      "last_index_time": { "type": "date" },
      "lasttimeat": { "type": "date" },
      "title": {
        "type": "text",
        "analyzer": "ik_smart",
        "fields": { "keyword": { "type": "keyword" } }
      }
    }
  }
}
正确的处理办法:
应该在通过 Logstash 导入数据时进行处理:Logstash 通过 filter 过滤掉字段中的 HTML 代码片段
filter {
  # Strip embedded HTML from the "content" field before indexing.
  #
  # gsub patterns are Ruby regexes, where "." does NOT match a newline by
  # default — the original one-line patterns therefore missed any
  # <script>/<iframe>/<style> block that spans multiple lines, which is the
  # common case in real HTML. The inline (?m) flag makes "." match newlines
  # so multi-line blocks are removed too.
  mutate { gsub => [ "content", "(?m)<script(.*?)</script>", "" ] }
  mutate { gsub => [ "content", "(?m)<iframe(.*?)</iframe>", "" ] }
  mutate { gsub => [ "content", "(?m)<style(.*?)</style>", "" ] }
  # Remove any remaining tags (openers/closers left after the blocks above).
  mutate { gsub => [ "content", "(?m)<(.*?)>", "" ] }
  # Removes literal space characters from the whole field — presumably this
  # targets &nbsp; entities rendered as spaces; NOTE(review): this also
  # deletes normal word-separating spaces, confirm that is intended.
  mutate { gsub => [ "content", " ", "" ] }
}
最新评论 0