elasticsearch-note #

官方文档 cn（基于 Elasticsearch 2.x 版本）

添加、更新记录 #

text
PUT /megacorp/employee/1
{
    "first_name" : "John",
    "last_name" :  "Smith",
    "age" :        25,
    "about" :      "I love to go rock climbing",
    "interests": [ "sports", "music" ]
}

megacorp 索引名称
employee 类型名称
1 特定雇员的ID

删除记录 #

text
DELETE /megacorp/employee/1

获取记录 #

text
GET /megacorp/employee/1

text
{
  "_index" :   "megacorp",
  "_type" :    "employee",
  "_id" :      "1",
  "_version" : 1,
  "found" :    true,
  "_source" :  {
      "first_name" :  "John",
      "last_name" :   "Smith",
      "age" :         25,
      "about" :       "I love to go rock climbing",
      "interests":  [ "sports", "music" ]
  }
}

检查数据 #

text
HEAD /megacorp/employee/1

搜索 #

用例1：获取全部 #

text
GET /megacorp/employee/_search

json
{
   "took":      6,
   "timed_out": false,
   "_shards": { ... },
   "hits": {
      "total":      3,
      "max_score":  1,
      "hits": [
         {
            "_index":         "megacorp",
            "_type":          "employee",
            "_id":            "3",
            "_score":         1,
            "_source": {
               "first_name":  "Douglas",
               "last_name":   "Fir",
               "age":         35,
               "about":       "I like to build cabinets",
               "interests": [ "forestry" ]
            }
         },
         {
            "_index":         "megacorp",
            "_type":          "employee",
            "_id":            "1",
            "_score":         1,
            "_source": {
               "first_name":  "John",
               "last_name":   "Smith",
               "age":         25,
               "about":       "I love to go rock climbing",
               "interests": [ "sports", "music" ]
            }
         },
         {
            "_index":         "megacorp",
            "_type":          "employee",
            "_id":            "2",
            "_score":         1,
            "_source": {
               "first_name":  "Jane",
               "last_name":   "Smith",
               "age":         32,
               "about":       "I like to collect rock albums",
               "interests": [ "music" ]
            }
         }
      ]
   }
}

用例2：匹配字段 query string #

text
GET /megacorp/employee/_search?q=last_name:Smith

json
{
   ...
   "hits": {
      "total":      2,
      "max_score":  0.30685282,
      "hits": [
         {
            ...
            "_source": {
               "first_name":  "John",
               "last_name":   "Smith",
               "age":         25,
               "about":       "I love to go rock climbing",
               "interests": [ "sports", "music" ]
            }
         },
         {
            ...
            "_source": {
               "first_name":  "Jane",
               "last_name":   "Smith",
               "age":         32,
               "about":       "I like to collect rock albums",
               "interests": [ "music" ]
            }
         }
      ]
   }
}

用例3：使用查询表达式 json #

text
GET /megacorp/employee/_search
{
    "query" : {
        "match" : {
            "last_name" : "Smith"
        }
    }
}

用例4：使用过滤器 filter #

text
GET /megacorp/employee/_search
{
    "query" : {
        "bool": {
            "must": {
                "match" : {
                    "last_name" : "smith"
                }
            },
            "filter": {
                "range" : {
                    "age" : { "gt" : 30 }
                }
            }
        }
    }
}

用例4：全文搜索 match.about #

text
GET /megacorp/employee/_search
{
    "query" : {
        "match" : {
            "about" : "rock climbing"
        }
    }
}

json
{
   ...
   "hits": {
      "total":      2,
      "max_score":  0.16273327,
      "hits": [
         {
            ...
            "_score":         0.16273327,
            "_source": {
               "first_name":  "John",
               "last_name":   "Smith",
               "age":         25,
               "about":       "I love to go rock climbing",
               "interests": [ "sports", "music" ]
            }
         },
         {
            ...
            "_score":         0.016878016,
            "_source": {
               "first_name":  "Jane",
               "last_name":   "Smith",
               "age":         32,
               "about":       "I like to collect rock albums",
               "interests": [ "music" ]
            }
         }
      ]
   }
}

_score 相关性得分

用例5：短语搜索 match_phrase.about #

text
GET /megacorp/employee/_search
{
    "query" : {
        "match_phrase" : {
            "about" : "rock climbing"
        }
    }
}

用例6：高亮搜索 highlight.fields.about #

text
GET /megacorp/employee/_search
{
    "query" : {
        "match_phrase" : {
            "about" : "rock climbing"
        }
    },
    "highlight": {
        "fields" : {
            "about" : {}
        }
    }
}

json
{
   ...
   "hits": {
      "total":      1,
      "max_score":  0.23013961,
      "hits": [
         {
            ...
            "_score":         0.23013961,
            "_source": {
               "first_name":  "John",
               "last_name":   "Smith",
               "age":         25,
               "about":       "I love to go rock climbing",
               "interests": [ "sports", "music" ]
            },
            "highlight": {
               "about": [
                  "I love to go <em>rock</em> <em>climbing</em>"
               ]
            }
         }
      ]
   }
}

高亮片段用 HTML 标签 <em></em> 封装。

用例7：聚合分析 #

查询叫 Smith 的员工中最受欢迎的兴趣爱好：（匹配+聚合）

text
GET /megacorp/employee/_search
{
  "query": {
    "match": {
      "last_name": "smith"
    }
  },
  "aggs": {
    "all_interests": {
      "terms": {
        "field": "interests"
      }
    }
  }
}

text
...
  "all_interests": {
     "buckets": [
        {
           "key": "music",
           "doc_count": 2
        },
        {
           "key": "sports",
           "doc_count": 1
        }
     ]
  }

查询特定兴趣爱好员工的平均年龄：（分级汇总）

text
GET /megacorp/employee/_search
{
    "aggs" : {
        "all_interests" : {
            "terms" : { "field" : "interests" },
            "aggs" : {
                "avg_age" : {
                    "avg" : { "field" : "age" }
                }
            }
        }
    }
}

json
...
  "all_interests": {
     "buckets": [
        {
           "key": "music",
           "doc_count": 2,
           "avg_age": {
              "value": 28.5
           }
        },
        {
           "key": "forestry",
           "doc_count": 1,
           "avg_age": {
              "value": 35
           }
        },
        {
           "key": "sports",
           "doc_count": 1,
           "avg_age": {
              "value": 25
           }
        }
     ]
  }

集群 #

集群健康 #

text
GET /_cluster/health

在一个不包含任何索引的空集群中，它将会有一个类似于如下所示的返回内容：

json
{
   "cluster_name":          "elasticsearch",
   "status":                "green",
   "timed_out":             false,
   "number_of_nodes":       1,
   "number_of_data_nodes":  1,
   "active_primary_shards": 0,
   "active_shards":         0,
   "relocating_shards":     0,
   "initializing_shards":   0,
   "unassigned_shards":     0
}

status 字段指示着当前集群在总体上是否工作正常：
- green 所有的主分片和副本分片都正常运行。
- yellow 所有的主分片都正常运行，但不是所有的副本分片都正常运行。
- red 有主分片没能正常运行。

什么是文档？ #

在大多数应用中，多数实体或对象可以被序列化为包含键值对的 JSON 对象。一个键可以是一个字段或字段的名称，一个值可以是一个字符串，一个数字，一个布尔值，另一个对象，一些数组值，或一些其它特殊类型诸如表示日期的字符串，或代表一个地理位置的对象：

json
{
    "name":         "John Smith",
    "age":          42,
    "confirmed":    true,
    "join_date":    "2014-06-01",
    "home": {
        "lat":      51.5,
        "lon":      0.1
    },
    "accounts": [
        {
            "type": "facebook",
            "id":   "johnsmith"
        },
        {
            "type": "twitter",
            "id":   "johnsmith"
        }
    ]
}

通常情况下，我们使用的术语对象和文档是可以互相替换的。不过，有一个区别：一个对象仅仅是类似于 hash 、 hashmap 、字典或者关联数组的 JSON 对象，对象中也可以嵌套其他的对象。对象可能包含了另外一些对象。在 Elasticsearch 中，术语文档有着特定的含义。它是指最顶层或者根对象, 这个根对象被序列化成 JSON 并存储到 Elasticsearch 中，指定了唯一 ID。

字段的名字可以是任何合法的字符串，但 不可以 包含英文句号(.)。

文档元数据 #

一个文档不仅仅包含它的数据，也包含 元数据 —— 有关文档的信息。三个必须的元数据元素如下：

_index 文档在哪存放
_type 文档表示的对象类别
_id 文档唯一标识

_index #

一个索引应该是因共同的特性被分组到一起的文档集合。

实际上，在 Elasticsearch 中，我们的数据是被存储和索引在分片中，而一个索引仅仅是逻辑上的命名空间，这个命名空间由一个或者多个分片组合在一起。然而，这是一个内部细节，我们的应用程序根本不应该关心分片，对于应用程序而言，只需知道文档位于一个索引内。 Elasticsearch 会处理所有的细节。

_type #

Elasticsearch 公开了一个称为 types （类型）的特性，它允许您在索引中对数据进行逻辑分区。不同 types 的文档可能有不同的字段，但最好能够非常相似。

一个 _type 命名可以是大写或者小写，但是不能以下划线或者句号开头，不应该包含逗号，并且长度限制为256个字符. 我们使用 blog 作为类型名举例。

_id #

ID 是一个字符串，当它和 _index 以及 _type 组合就可以唯一确定 Elasticsearch 中的一个文档。当你创建一个新的文档，要么提供自己的 _id ，要么让 Elasticsearch 帮你生成。

索引文档 #

使用自定义ID #

text
PUT /{index}/{type}/{id}
{
  "field": "value",
  ...
}

用例：

text
PUT /website/blog/123
{
  "title": "My first blog entry",
  "text":  "Just trying this out...",
  "date":  "2014/01/01"
}

json
{
   "_index":    "website",
   "_type":     "blog",
   "_id":       "123",
   "_version":  1,
   "created":   true
}

在 Elasticsearch 中每个文档都有一个版本号。当每次对文档进行修改时（包括删除）， _version 的值会递增。

自动生成ID #

请求的结构调整为：不再使用 PUT 谓词(“使用这个 URL 存储这个文档”)，而是使用 POST 谓词(“存储文档在这个 URL 命名空间下”)。

用例：

text
POST /website/blog/
{
  "title": "My second blog entry",
  "text":  "Still trying this out...",
  "date":  "2014/01/01"
}

json
{
   "_index":    "website",
   "_type":     "blog",
   "_id":       "AVFgSgVHUP18jI2wRx0w",
   "_version":  1,
   "created":   true
}

自动生成的 ID 是 URL-safe、基于 Base64 编码且长度为20个字符的 GUID 字符串。这些 GUID 字符串由可修改的 FlakeID 模式生成，这种模式允许多个节点并行生成唯一 ID ，且互相之间的冲突概率几乎为零。

更新整个文档 #

如果想要更新现有的文档，需要 重建索引 或者进行替换：

text
PUT /website/blog/123
{
  "title": "My first blog entry",
  "text":  "I am starting to get the hang of this...",
  "date":  "2014/01/02"
}

json
{
  "_index" :   "website",
  "_type" :    "blog",
  "_id" :      "123",
  "_version" : 2,
  "created":   false 
}

_version 字段值增加
created 标志设置成 false ，是因为相同的索引、类型和 ID 的文档已经存在。
在内部，Elasticsearch 已将旧文档标记为已删除，并增加一个全新的文档。尽管不能再对旧版本的文档进行访问，但它并不会立即消失。当继续索引更多的数据，Elasticsearch 会在后台清理这些已删除文档。

创建新文档 #

只有在相同的 _index 、 _type 和 _id 不存在时才接受索引请求，使用哪种，取决于哪种使用起来更方便：

第一种方法使用 op_type 查询-字符串参数：

text
PUT /website/blog/123?op_type=create
{ ... }

第二种方法是在 URL 末端使用 /_create :

text
PUT /website/blog/123/_create
{ ... }

如果创建新文档的请求成功执行，Elasticsearch 会返回元数据和一个 201 Created 的 HTTP 响应码。

如果具有相同的 _index 、 _type 和 _id 的文档已经存在，Elasticsearch 将会返回 409 Conflict 响应码，以及如下的错误信息：

json
{
   "error": {
      "root_cause": [
         {
            "type": "document_already_exists_exception",
            "reason": "[blog][123]: document already exists",
            "shard": "0",
            "index": "website"
         }
      ],
      "type": "document_already_exists_exception",
      "reason": "[blog][123]: document already exists",
      "shard": "0",
      "index": "website"
   },
   "status": 409
}

取回一个文档 #

为了从 Elasticsearch 中检索出文档，我们仍然使用相同的 _index , _type , 和 _id ，但是 HTTP 谓词更改为 GET :

text
GET /website/blog/123?pretty

json
{
  "_index" :   "website",
  "_type" :    "blog",
  "_id" :      "123",
  "_version" : 1,
  "found" :    true,
  "_source" :  {
      "title": "My first blog entry",
      "text":  "Just trying this out...",
      "date":  "2014/01/01"
  }
}

在请求的查询串参数中加上 pretty 参数，这将会调用 Elasticsearch 的 pretty-print 功能，该功能使得 JSON 响应体更加可读。但是， _source 字段不能被格式化打印出来。相反，我们得到的 _source 字段中的 JSON 串，刚好是和我们传给它的一样。

GET 请求的响应体包括 {"found": true} ，这证实了文档已经被找到。如果我们请求一个不存在的文档，我们仍旧会得到一个 JSON 响应体，但是 found 将会是 false 。此外， HTTP 响应码将会是 404 Not Found ，而不是 200 OK 。

返回文档的一部分 #

text
GET /website/blog/123?_source=title,text

json
{
  "_index" :   "website",
  "_type" :    "blog",
  "_id" :      "123",
  "_version" : 1,
  "found" :   true,
  "_source" : {
      "title": "My first blog entry" ,
      "text":  "Just trying this out..."
  }
}

只返回 _source 字段 #

text
GET /website/blog/123/_source

json
{
   "title": "My first blog entry",
   "text":  "Just trying this out...",
   "date":  "2014/01/01"
}

检查文档是否存在 #

如果只想检查一个文档是否存在–根本不想关心内容—那么用 HEAD 方法来代替 GET 方法。 HEAD 请求没有返回体，只返回一个 HTTP 请求报头：

text
HEAD /website/blog/123

cURL命令：

bash
curl -i -XHEAD http://localhost:9200/website/blog/123

结果：

text
HTTP/1.1 200 OK
Content-Type: text/plain; charset=UTF-8
Content-Length: 0

text
HTTP/1.1 404 Not Found
Content-Type: text/plain; charset=UTF-8
Content-Length: 0

删除文档 #

text
DELETE /website/blog/123

如果找到该文档，Elasticsearch 将要返回一个 200 ok 的 HTTP 响应码，和一个类似以下结构的响应体。注意，字段 _version 值已经增加：

json
{
  "found" :    true,
  "_index" :   "website",
  "_type" :    "blog",
  "_id" :      "123",
  "_version" : 3
}

如果文档没有找到，我们将得到 404 Not Found 的响应码和类似这样的响应体：

json
{
  "found" :    false,
  "_index" :   "website",
  "_type" :    "blog",
  "_id" :      "123",
  "_version" : 4
}

即使文档不存在（ Found 是 false ）， _version 值仍然会增加。这是 Elasticsearch 内部记录本的一部分，用来确保这些改变在跨多节点时以正确的顺序执行。

乐观并发控制 #

Elasticsearch 是分布式的。当文档创建、更新或删除时，新版本的文档必须复制到集群中的其他节点。Elasticsearch 也是异步和并发的，这意味着这些复制请求被并行发送，并且到达目的地时也许 顺序是乱的 。 Elasticsearch 需要一种方法确保文档的旧版本不会覆盖新的版本。

指定 version 参数来重建索引：

text
PUT /website/blog/1?version=1 
{
  "title": "My first blog entry",
  "text":  "Starting to get the hang of this..."
}

只有当索引中的文档 _version 为 1 时，本次更新才能成功。

此请求成功，并且 _version 已经递增到 2 :

json
{
  "_index":   "website",
  "_type":    "blog",
  "_id":      "1",
  "_version": 2
  "created":  false
}

若重新运行相同的请求，将返回 409 Conflict HTTP 响应码：

json
{
   "error": {
      "root_cause": [
         {
            "type": "version_conflict_engine_exception",
            "reason": "[blog][1]: version conflict, current [2], provided [1]",
            "index": "website",
            "shard": "3"
         }
      ],
      "type": "version_conflict_engine_exception",
      "reason": "[blog][1]: version conflict, current [2], provided [1]",
      "index": "website",
      "shard": "3"
   },
   "status": 409
}

所有文档的更新或删除 API，都可以接受 version 参数，这允许你在代码中使用乐观的并发控制，这是一种明智的做法。

通过外部系统使用版本控制 #

一个常见的设置是使用其它数据库作为主要的数据存储，使用 Elasticsearch 做数据检索，这意味着主数据库的所有更改发生时都需要被复制到 Elasticsearch ，如果多个进程负责这一数据同步，可能遇到类似于之前描述的并发问题。

如果主数据库已经有了版本号 — 或一个能作为版本号的字段值比如 timestamp — 那么就可以在 Elasticsearch 中通过增加 version_type=external 到查询字符串的方式重用这些相同的版本号，版本号必须是大于零的整数，且小于 9.2E+18 — 一个 Java 中 long 类型的正值。Elasticsearch 检查当前 _version 是否小于指定的版本号。如果请求成功，外部的版本号作为文档的新 _version 进行存储。

例如，要创建一个新的具有外部版本号 5 的博客文章：

text
PUT /website/blog/2?version=5&version_type=external
{
  "title": "My first external blog entry",
  "text":  "Starting to get the hang of this..."
}

现在更新这个文档，指定一个新的 version 号是 10 ：

text
PUT /website/blog/2?version=10&version_type=external
{
  "title": "My first external blog entry",
  "text":  "This is a piece of cake..."
}

如果重新运行此请求时，它将会失败，并返回像之前看到的同样的冲突错误，因为指定的外部版本号不大于 Elasticsearch 的当前版本号。

文档部分更新 #

在内部， update API 简单使用与之前描述相同的 检索-修改-重建索引 的处理过程。区别在于这个过程发生在分片内部，这样就避免了多次请求的网络开销。通过减少检索和重建索引步骤之间的时间，我们也减少了其他进程的变更带来冲突的可能性。

text
POST /website/blog/1/_update
{
   "doc" : {
      "tags" : [ "testing" ],
      "views": 0
   }
}

json
{
   "_index" :   "website",
   "_id" :      "1",
   "_type" :    "blog",
   "_version" : 3
}

使用脚本部分更新文档 #

脚本可以在 update API中用来改变 _source 的字段内容，它在更新脚本中称为 ctx._source 。例如，我们可以使用脚本来增加博客文章中 views 的数量：

text
POST /website/blog/1/_update
{
   "script" : "ctx._source.views+=1"
}

使用脚本给 tags 数组添加一个新的标签，同时指定新的标签作为参数，而不是硬编码到脚本内部。这使得 Elasticsearch 可以重用这个脚本，而不是每次添加标签时都要对新脚本重新编译：

text
POST /website/blog/1/_update
{
   "script" : "ctx._source.tags+=new_tag",
   "params" : {
      "new_tag" : "search"
   }
}

用 Groovy 脚本编程
对于那些 API 不能满足需求的情况，Elasticsearch 允许你使用脚本编写自定义的逻辑。许多API都支持脚本的使用，包括搜索、排序、聚合和文档更新。脚本可以作为请求的一部分被传递，从特殊的 .scripts 索引中检索，或者从磁盘加载脚本。
默认的脚本语言是 Groovy，一种快速表达的脚本语言，在语法上与 JavaScript 类似。它在 Elasticsearch V1.3.0 版本首次引入并运行在沙盒中，然而 Groovy 脚本引擎存在漏洞，允许攻击者通过构建 Groovy 脚本，在 Elasticsearch Java VM 运行时脱离沙盒并执行 shell 命令。
因此，在版本 v1.3.8 、 1.4.3 和 V1.5.0 及更高的版本中，它已经被默认禁用。此外，您可以通过设置集群中的所有节点的 config/elasticsearch.yml 文件来禁用动态 Groovy 脚本：
yaml
script.groovy.sandbox.enabled: false
这将关闭 Groovy 沙盒，从而防止动态 Groovy 脚本作为请求的一部分被接受，或者从特殊的 .scripts 索引中被检索。当然，你仍然可以使用存储在每个节点的 config/scripts/ 目录下的 Groovy 脚本。
如果你的架构和安全性不需要担心漏洞攻击，例如你的 Elasticsearch 终端仅暴露和提供给可信赖的应用，当它是你的应用需要的特性时，你可以选择重新启用动态脚本。
你可以在 scripting reference documentation 获取更多关于脚本的资料。

更新尚不存在的文档 #

可以使用 upsert 参数，指定如果文档不存在就应该先创建它：

text
POST /website/pageviews/1/_update
{
   "script" : "ctx._source.views+=1",
   "upsert": {
       "views": 1
   }
}

第一次运行这个请求时， upsert 值作为新文档被索引，初始化 views 字段为 1 。在后续的运行中，由于文档已经存在， script 更新操作将替代 upsert 进行应用，对 views 计数器进行累加。

更新和冲突 #

检索和 重建索引 步骤的间隔越小，变更冲突的机会越小。但是它并不能完全消除冲突的可能性。还是有可能在 update 设法重新索引之前，来自另一进程的请求修改了文档。

为了避免数据丢失， update API 在检索步骤时检索得到文档当前的 _version 号，并传递版本号到 重建索引 步骤的 index 请求。如果另一个进程修改了处于检索和重新索引步骤之间的文档，那么 _version 号将不匹配，更新请求将会失败。

对于部分更新的很多使用场景，文档已经被改变也没有关系。例如，如果两个进程都对页面访问量计数器进行递增操作，它们发生的先后顺序其实不太重要；如果冲突发生了，我们唯一需要做的就是尝试再次更新。

这可以通过设置参数 retry_on_conflict 来自动完成，这个参数规定了失败之前 update 应该重试的次数，它的默认值为 0 。

text
POST /website/pageviews/1/_update?retry_on_conflict=5 
{
   "script" : "ctx._source.views+=1",
   "upsert": {
       "views": 0
   }
}

取回多个文档 #

mget API 要求有一个 docs 数组作为参数，每个元素包含需要检索文档的元数据，包括 _index 、 _type 和 _id 。如果你想检索一个或者多个特定的字段，那么你可以通过 _source 参数来指定这些字段的名字：

text
GET /_mget
{
   "docs" : [
      {
         "_index" : "website",
         "_type" :  "blog",
         "_id" :    2
      },
      {
         "_index" : "website",
         "_type" :  "pageviews",
         "_id" :    1,
         "_source": "views"
      }
   ]
}

该响应体也包含一个 docs 数组，对于每一个在请求中指定的文档，这个数组中都包含有一个对应的响应，且顺序与请求中的顺序相同：

json
{
   "docs" : [
      {
         "_index" :   "website",
         "_id" :      "2",
         "_type" :    "blog",
         "found" :    true,
         "_source" : {
            "text" :  "This is a piece of cake...",
            "title" : "My first external blog entry"
         },
         "_version" : 10
      },
      {
         "_index" :   "website",
         "_id" :      "1",
         "_type" :    "pageviews",
         "found" :    true,
         "_version" : 2,
         "_source" : {
            "views" : 2
         }
      }
   ]
}

如果想检索的数据都在相同的 _index 中（甚至相同的 _type 中），则可以在 URL 中指定默认的 /_index 或者默认的 /_index/_type ：

text
GET /website/blog/_mget
{
   "docs" : [
      { "_id" : 2 },
      { "_type" : "pageviews", "_id" :   1 }
   ]
}

事实上，如果所有文档的 _index 和 _type 都是相同的，可以只传一个 ids 数组，而不是整个 docs 数组：

text
GET /website/blog/_mget
{
   "ids" : [ "2", "1" ]
}

文档 ID 1 的类型是 pageviews ，这个不存在的情况将在响应体中被报告：

json
{
  "docs" : [
    {
      "_index" :   "website",
      "_type" :    "blog",
      "_id" :      "2",
      "_version" : 10,
      "found" :    true,
      "_source" : {
        "title":   "My first external blog entry",
        "text":    "This is a piece of cake..."
      }
    },
    {
      "_index" :   "website",
      "_type" :    "blog",
      "_id" :      "1",
      "found" :    false  
    }
  ]
}

即使有某个文档没有找到，上述请求的 HTTP 状态码仍然是 200 。事实上，即使请求没有找到任何文档，它的状态码依然是 200 –因为 mget 请求本身已经成功执行。为了确定某个文档查找是成功或者失败，你需要检查 found 标记。

代价较小的批量操作 #

格式 #

bulk API允许在单个步骤中进行多次 create 、 index 、 update 、 delete 请求。如下所示：

sh
{ action: { metadata }}\n
{ request body        }\n
{ action: { metadata }}\n
{ request body        }\n
...

bulk 与其他的请求体格式不同，类似与一个有效的单行 JSON 文档流，它通过换行符(\n)连接到一起。

每行一定要以换行符(\n)结尾， 包括最后一行 。
不能包含未转义的换行符，因为会对解析造成干扰。这意味着 JSON 不能使用 pretty 参数打印。

action/metadata 行指定 哪一个文档 做 什么操作 。

action 必须是以下选项之一：

create ：如果文档不存在，则创建。
index ：创建一个新文档或者替换一个现有的文档。
update ：部分更新一个文档。
delete ：删除一个文档。

metadata 应该指定被索引、创建、更新或删除的文档的 _index 、 _type 和 _id 。

批量请求的大小有一个最佳值，大于这个值，性能将不再提升，甚至会下降，一个好的批量大小在开始处理后所占的物理大小约 5-15 MB。

示例1 #

text
POST /_bulk
{ "delete": { "_index": "website", "_type": "blog", "_id": "123" }} 
{ "create": { "_index": "website", "_type": "blog", "_id": "123" }}
{ "title":    "My first blog post" }
{ "index":  { "_index": "website", "_type": "blog" }}
{ "title":    "My second blog post" }
{ "update": { "_index": "website", "_type": "blog", "_id": "123", "_retry_on_conflict" : 3} }
{ "doc" : {"title" : "My updated blog post"} }

delete 动作不能有请求体。
最后一个换行符不能落下。

json
{
   "took": 4,
   "errors": false, 
   "items": [
      {  "delete": {
            "_index":   "website",
            "_type":    "blog",
            "_id":      "123",
            "_version": 2,
            "status":   200,
            "found":    true
      }},
      {  "create": {
            "_index":   "website",
            "_type":    "blog",
            "_id":      "123",
            "_version": 3,
            "status":   201
      }},
      {  "create": {
            "_index":   "website",
            "_type":    "blog",
            "_id":      "EiwfApScQiiy7TIKFxRCTw",
            "_version": 1,
            "status":   201
      }},
      {  "update": {
            "_index":   "website",
            "_type":    "blog",
            "_id":      "123",
            "_version": 4,
            "status":   200
      }}
   ]
}

所有子请求都是独立完成的，因此某个子请求的失败不会对其他子请求造成影响。

示例2 #

text
POST /_bulk
{ "create": { "_index": "website", "_type": "blog", "_id": "123" }}
{ "title":    "Cannot create - it already exists" }
{ "index":  { "_index": "website", "_type": "blog", "_id": "123" }}
{ "title":    "But we can update it" }

如果其中任何子请求失败，最顶层 error 标志被设置为 true ，并且在相应的请求报告出错明细：

json
{
   "took": 3,
   "errors": true, 
   "items": [
      {  "create": {
            "_index":   "website",
            "_type":    "blog",
            "_id":      "123",
            "status":   409, 
            "error":    "DocumentAlreadyExistsException 
                        [[website][4] [blog][123]:
                        document already exists]"
      }},
      {  "index": {
            "_index":   "website",
            "_type":    "blog",
            "_id":      "123",
            "_version": 5,
            "status":   200 
      }}
   ]
}

bulk 请求不是原子的：不能用来实现事务控制。每个请求是单独处理的，因此一个请求的成功或失败不会影响其他的请求。

示例3 #

text
POST /website/_bulk
{ "index": { "_type": "log" }}
{ "event": "User logged in" }

可以像 mget API一样，在 bulk 请求的URL中接受默认的 /_index 或者 /_index/_type 。

text
POST /website/log/_bulk
{ "index": {}}
{ "event": "User logged in" }
{ "index": { "_type": "blog" }}
{ "title": "Overriding the default type" }

也可以覆盖元数据行中的 _index 和 _type 。