diff --git a/qa/suites/rados/singleton/all/mon-config-key-caps.yaml b/qa/suites/rados/singleton/all/mon-config-key-caps.yaml
new file mode 100644
index 0000000000000..0b0b95c52e080
--- /dev/null
+++ b/qa/suites/rados/singleton/all/mon-config-key-caps.yaml
@@ -0,0 +1,17 @@
+roles:
+- - mon.a
+  - mgr.x
+  - osd.0
+  - osd.1
+  - osd.2
+  - client.0
+tasks:
+- install:
+- ceph:
+    log-whitelist:
+      - overall HEALTH_
+      - \(AUTH_BAD_CAPS\)
+- workunit:
+    clients:
+      all:
+        - mon/test_config_key_caps.sh
diff --git a/qa/workunits/mon/test_config_key_caps.sh b/qa/workunits/mon/test_config_key_caps.sh
new file mode 100755
index 0000000000000..77b4b53b701d1
--- /dev/null
+++ b/qa/workunits/mon/test_config_key_caps.sh
@@ -0,0 +1,201 @@
+#!/usr/bin/env bash
+
+set -x
+set -e
+
+tmp=$(mktemp -d -p /tmp test_mon_config_key_caps.XXXXX)
+entities=()
+
+function cleanup()
+{
+	set +e
+	set +x
+	if [[ -e $tmp/keyring ]] && [[ -e $tmp/keyring.orig ]]; then
+		grep '\[.*\..*\]' $tmp/keyring.orig > $tmp/entities.orig
+		for e in $(grep '\[.*\..*\]' $tmp/keyring | \
+			diff $tmp/entities.orig - | \
+			sed -n 's/^.*\[\(.*\..*\)\]/\1/p');
+		do
+			ceph auth rm $e >& /dev/null
+		done
+	fi
+	rm -fr $tmp
+}
+
+trap cleanup 0 # cleanup on exit
+
+function expect_false()
+{
+	set -x
+	if "$@"; then return 1; else return 0; fi
+}
+
+# for cleanup purposes
+ceph auth export -o $tmp/keyring.orig
+
+k=$tmp/keyring
+
+# setup a few keys
+ceph config-key ls
+ceph config-key set daemon-private/osd.123/test-foo
+ceph config-key set mgr/test-foo
+ceph config-key set device/test-foo
+ceph config-key set test/foo
+
+allow_aa=client.allow_aa
+allow_bb=client.allow_bb
+allow_cc=client.allow_cc
+
+mgr_a=mgr.a
+mgr_b=mgr.b
+osd_a=osd.100
+osd_b=osd.200
+
+prefix_aa=client.prefix_aa
+prefix_bb=client.prefix_bb
+prefix_cc=client.prefix_cc
+match_aa=client.match_aa
+match_bb=client.match_bb
+
+fail_aa=client.fail_aa
+fail_bb=client.fail_bb
+fail_cc=client.fail_cc
+fail_dd=client.fail_dd
+fail_ee=client.fail_ee
+fail_ff=client.fail_ff
+fail_gg=client.fail_gg
+fail_writes=client.fail_writes
+
+ceph auth get-or-create $allow_aa mon 'allow *'
+ceph auth get-or-create $allow_bb mon 'allow service config-key rwx'
+ceph auth get-or-create $allow_cc mon 'allow command "config-key get"'
+
+ceph auth get-or-create $mgr_a mon 'allow profile mgr'
+ceph auth get-or-create $mgr_b mon 'allow profile mgr'
+ceph auth get-or-create $osd_a mon 'allow profile osd'
+ceph auth get-or-create $osd_b mon 'allow profile osd'
+
+ceph auth get-or-create $prefix_aa mon \
+	"allow command \"config-key get\" with key prefix client/$prefix_aa"
+
+cap="allow command \"config-key set\" with key prefix client/"
+cap="$cap,allow command \"config-key get\" with key prefix client/$prefix_bb"
+ceph auth get-or-create $prefix_bb mon "$cap"
+
+cap="allow command \"config-key get\" with key prefix client/"
+cap="$cap, allow command \"config-key set\" with key prefix client/"
+cap="$cap, allow command \"config-key ls\""
+ceph auth get-or-create $prefix_cc mon "$cap"
+
+cap="allow command \"config-key get\" with key=client/$match_aa/foo"
+ceph auth get-or-create $match_aa mon "$cap"
+cap="allow command \"config-key get\" with key=client/$match_bb/foo"
+cap="$cap,allow command \"config-key set\" with key=client/$match_bb/foo"
+ceph auth get-or-create $match_bb mon "$cap"
+
+ceph auth get-or-create $fail_aa mon 'allow rx'
+ceph auth get-or-create $fail_bb mon 'allow r,allow w'
+ceph auth get-or-create $fail_cc mon 'allow rw'
+ceph auth get-or-create $fail_dd mon 'allow rwx'
+ceph auth get-or-create $fail_ee mon 'allow profile bootstrap-rgw'
+ceph auth get-or-create $fail_ff mon 'allow profile bootstrap-rbd'
+# write commands require 'rw'; 'wx' is not enough
+ceph auth get-or-create $fail_gg mon 'allow service config-key wx'
+# read commands only require 'r'; 'rx' should be enough
+ceph auth get-or-create $fail_writes mon 'allow service config-key rx'
+
+# grab keyring
+ceph auth export -o $k
+
+# keys with all the needed caps can do whatever
+for c in $allow_aa $allow_bb $allow_cc $mgr_a $mgr_b; do
+	ceph -k $k --name $c config-key get daemon-private/osd.123/test-foo
+	ceph -k $k --name $c config-key get mgr/test-foo
+	ceph -k $k --name $c config-key get device/test-foo
+	ceph -k $k --name $c config-key get test/foo
+done
+
+for c in $osd_a $osd_b; do
+	ceph -k $k --name $c config-key put daemon-private/$c/test-foo
+	ceph -k $k --name $c config-key get daemon-private/$c/test-foo
+	expect_false ceph -k $k --name $c config-key ls
+	expect_false ceph -k $k --name $c config-key get mgr/test-foo
+	expect_false ceph -k $k --name $c config-key get device/test-foo
+	expect_false ceph -k $k --name $c config-key get test/foo
+done
+
+expect_false ceph -k $k --name $osd_a config-key get daemon-private/$osd_b/test-foo
+expect_false ceph -k $k --name $osd_b config-key get daemon-private/$osd_a/test-foo
+
+expect_false ceph -k $k --name $prefix_aa \
+	config-key ls
+expect_false ceph -k $k --name $prefix_aa \
+	config-key get daemon-private/osd.123/test-foo
+expect_false ceph -k $k --name $prefix_aa \
+	config-key set test/bar
+expect_false ceph -k $k --name $prefix_aa \
+	config-key set client/$prefix_aa/foo
+
+# write something so we can read it; use an entity with broader caps
+ceph -k $k --name $allow_bb config-key set client/$prefix_aa/foo
+ceph -k $k --name $prefix_aa config-key get client/$prefix_aa/foo
+# check that one entity can write to the other's prefix, and that the
+# other entity is able to read it
+ceph -k $k --name $prefix_bb config-key set client/$prefix_aa/bar
+ceph -k $k --name $prefix_aa config-key get client/$prefix_aa/bar
+
+ceph -k $k --name $prefix_bb config-key set client/$prefix_bb/foo
+ceph -k $k --name $prefix_bb config-key get client/$prefix_bb/foo
+
+expect_false ceph -k $k --name $prefix_bb config-key get client/$prefix_aa/bar
+expect_false ceph -k $k --name $prefix_bb config-key ls
+expect_false ceph -k $k --name $prefix_bb \
+	config-key get daemon-private/osd.123/test-foo
+expect_false ceph -k $k --name $prefix_bb config-key get mgr/test-foo
+expect_false ceph -k $k --name $prefix_bb config-key get device/test-foo
+expect_false ceph -k $k --name $prefix_bb config-key get test/bar
+expect_false ceph -k $k --name $prefix_bb config-key set test/bar
+
+ceph -k $k --name $prefix_cc config-key set client/$match_aa/foo
+ceph -k $k --name $prefix_cc config-key set client/$match_bb/foo
+ceph -k $k --name $prefix_cc config-key get client/$match_aa/foo
+ceph -k $k --name $prefix_cc config-key get client/$match_bb/foo
+expect_false ceph -k $k --name $prefix_cc config-key set other/prefix
+expect_false ceph -k $k --name $prefix_cc config-key get mgr/test-foo
+ceph -k $k --name $prefix_cc config-key ls >& /dev/null
+
+ceph -k $k --name $match_aa config-key get client/$match_aa/foo
+expect_false ceph -k $k --name $match_aa config-key get client/$match_bb/foo
+expect_false ceph -k $k --name $match_aa config-key set client/$match_aa/foo
+ceph -k $k --name $match_bb config-key get client/$match_bb/foo
+ceph -k $k --name $match_bb config-key set client/$match_bb/foo
+expect_false ceph -k $k --name $match_bb config-key get client/$match_aa/foo
+expect_false ceph -k $k --name $match_bb config-key set client/$match_aa/foo
+
+keys=(daemon-private/osd.123/test-foo
+	mgr/test-foo
+	device/test-foo
+	test/foo
+	client/$prefix_aa/foo
+	client/$prefix_bb/foo
+	client/$match_aa/foo
+	client/$match_bb/foo
+)
+# expect these all to fail accessing config-key
+for c in $fail_aa $fail_bb $fail_cc \
+	 $fail_dd $fail_ee $fail_ff \
+	 $fail_gg; do
+	for m in get set; do
+		for key in ${keys[*]}; do
+			expect_false ceph -k $k --name $c config-key $m $key
+		done
+	done
+done
+
+# fail writes but succeed on reads
+expect_false ceph -k $k --name $fail_writes config-key set client/$match_aa/foo
+expect_false ceph -k $k --name $fail_writes config-key set test/foo
+ceph -k $k --name $fail_writes config-key ls
+ceph -k $k --name $fail_writes config-key get client/$match_aa/foo
+ceph -k $k --name $fail_writes config-key get daemon-private/osd.123/test-foo
+
+echo "OK"
diff --git a/src/mon/MonCap.cc b/src/mon/MonCap.cc
index e8d3f7e8bb3fb..e67fbec0bfc47 100644
--- a/src/mon/MonCap.cc
+++ b/src/mon/MonCap.cc
@@ -214,6 +214,12 @@ mon_rwxa_t MonCapGrant::get_allowed(CephContext *cct,
     }
     return MON_CAP_ALL;
   }
+  // we don't allow the config-key service to be accessed with blanket caps
+  // other than '*' (i.e., 'any'); that case should have been checked by the
+  // caller via 'is_allow_all()'.
+  if (s == "config-key") {
+    return 0;
+  }
   return allow;
 }
 
@@ -346,7 +352,7 @@ struct MonCapParser : qi::grammar<Iterator, MonCap()>
     quoted_string %=
       lexeme['"' >> +(char_ - '"') >> '"'] |
       lexeme['\'' >> +(char_ - '\'') >> '\''];
-    unquoted_word %= +char_("a-zA-Z0-9_.-");
+    unquoted_word %= +char_("a-zA-Z0-9_/.-");
     str %= quoted_string | unquoted_word;
 
     spaces = +(lit(' ') | lit('\n') | lit('\t'));
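
Taken together, the two MonCap.cc hunks above mean that a bare rwx grant no
longer reaches the config-key service (only 'allow *', a service grant, or a
matching command grant does), and that the cap parser now accepts unquoted
words containing '/', so key prefixes no longer need quoting. A minimal
sketch of the resulting behavior, with illustrative entity and key names
(not taken from the patch):

    # blanket caps still parse, but get_allowed() now returns 0 for config-key
    ceph auth get-or-create client.blanket mon 'allow rwx'
    ceph --name client.blanket config-key get test/foo   # expected to be denied

    # a scoped command cap still works; with the unquoted_word change the
    # key prefix can be written without quotes despite the '/'
    ceph auth get-or-create client.scoped mon \
        'allow command "config-key get" with key prefix client/scoped'
    ceph --name client.scoped config-key get client/scoped/foo   # expected ok
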
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index fd3a358db8bfc..a8dbfdc77a2f7 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -2214,7 +2214,19 @@ void Monitor::handle_command(MMonCommand *m)
     return;
   }
 
-  cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+  // check the return value: if no prefix parameter was provided,
+  // cmd_getval() returns false and we reply with an error.
+  if (!cmd_getval(g_ceph_context, cmdmap, "prefix", prefix)) {
+    reply_command(m, -EINVAL, "command prefix not found", 0);
+    return;
+  }
+
+  // reject an empty prefix
+  if (prefix.empty()) {
+    reply_command(m, -EINVAL, "command prefix must not be empty", 0);
+    return;
+  }
+
   if (prefix == "get_command_descriptions") {
     bufferlist rdata;
     Formatter *f = new_formatter("json");
@@ -2235,6 +2247,15 @@ void Monitor::handle_command(MMonCommand *m)
   boost::scoped_ptr<Formatter> f(new_formatter(format));
 
   get_str_vec(prefix, fullcmd);
+
+  // make sure fullcmd is not empty: an invalid prefix made up
+  // entirely of separators, e.g. prefix=";,,;", will result in
+  // an empty fullcmd vector.
+  if (fullcmd.empty()) {
+    reply_command(m, -EINVAL, "command requires a prefix to be valid", 0);
+    return;
+  }
+
   module = fullcmd[0];
 
   // validate command is in leader map
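
With the two checks above, handle_command() bails out with -EINVAL up front
instead of later indexing into an empty fullcmd vector when a request carries
a missing, empty, or separator-only prefix. A rough sketch of the expected
behavior (the error string comes from the code above; a well-behaved CLI
would normally reject such a command client-side, so this assumes a client
that passes the prefix through verbatim):

    # get_str_vec() splits the prefix on separators such as ';' and ',',
    # so a prefix like ";,,;" produces an empty vector
    ceph ";,,;"
    # expected reply: Error EINVAL: command requires a prefix to be valid
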
<< mon->name << "@" << mon->rank + << "(" << mon->get_state_name() << ") e" << mon->monmap->get_epoch() << " "; +} + +const string Monitor::MONITOR_NAME = "monitor"; +const string Monitor::MONITOR_STORE_PREFIX = "monitor_store"; + + +#undef COMMAND +MonCommand mon_commands[] = { +#define COMMAND(parsesig, helptext, modulename, req_perms, avail) \ + {parsesig, helptext, modulename, req_perms, avail}, +#include +}; +MonCommand classic_mon_commands[] = { +#include +}; + + +long parse_pos_long(const char *s, ostream *pss) +{ + if (*s == '-' || *s == '+') { + if (pss) + *pss << "expected numerical value, got: " << s; + return -EINVAL; + } + + string err; + long r = strict_strtol(s, 10, &err); + if ((r == 0) && !err.empty()) { + if (pss) + *pss << err; + return -1; + } + if (r < 0) { + if (pss) + *pss << "unable to parse positive integer '" << s << "'"; + return -1; + } + return r; +} + +Monitor::Monitor(CephContext* cct_, string nm, MonitorDBStore *s, + Messenger *m, MonMap *map) : + Dispatcher(cct_), + name(nm), + rank(-1), + messenger(m), + con_self(m ? m->get_loopback_connection() : NULL), + lock("Monitor::lock"), + timer(cct_, lock), + has_ever_joined(false), + logger(NULL), cluster_logger(NULL), cluster_logger_registered(false), + monmap(map), + clog(cct_, messenger, monmap, LogClient::FLAG_MON), + key_server(cct, &keyring), + auth_cluster_required(cct, + cct->_conf->auth_supported.length() ? + cct->_conf->auth_supported : cct->_conf->auth_cluster_required), + auth_service_required(cct, + cct->_conf->auth_supported.length() ? + cct->_conf->auth_supported : cct->_conf->auth_service_required), + leader_supported_mon_commands(NULL), + leader_supported_mon_commands_size(0), + store(s), + + state(STATE_PROBING), + + elector(this), + required_features(0), + leader(0), + quorum_features(0), + scrub_version(0), + + // sync state + sync_provider_count(0), + sync_cookie(0), + sync_full(false), + sync_start_version(0), + sync_timeout_event(NULL), + sync_last_committed_floor(0), + + timecheck_round(0), + timecheck_acks(0), + timecheck_event(NULL), + + probe_timeout_event(NULL), + + paxos_service(PAXOS_NUM), + admin_hook(NULL), + routed_request_tid(0) +{ + rank = -1; + + paxos = new Paxos(this, "paxos"); + + paxos_service[PAXOS_MDSMAP] = new MDSMonitor(this, paxos, "mdsmap"); + paxos_service[PAXOS_MONMAP] = new MonmapMonitor(this, paxos, "monmap"); + paxos_service[PAXOS_OSDMAP] = new OSDMonitor(this, paxos, "osdmap"); + paxos_service[PAXOS_PGMAP] = new PGMonitor(this, paxos, "pgmap"); + paxos_service[PAXOS_LOG] = new LogMonitor(this, paxos, "logm"); + paxos_service[PAXOS_AUTH] = new AuthMonitor(this, paxos, "auth"); + + health_monitor = new HealthMonitor(this); + config_key_service = new ConfigKeyService(this, paxos); + + mon_caps = new MonCap(); + bool r = mon_caps->parse("allow *", NULL); + assert(r); + + exited_quorum = ceph_clock_now(g_ceph_context); + + // assume our commands until we have an election. this only means + // we won't reply with EINVAL before the election; any command that + // actually matters will wait until we have quorum etc and then + // retry (and revalidate). 
+ const MonCommand *cmds; + int cmdsize; + get_locally_supported_monitor_commands(&cmds, &cmdsize); + set_leader_supported_commands(cmds, cmdsize); +} + +PaxosService *Monitor::get_paxos_service_by_name(const string& name) +{ + if (name == "mdsmap") + return paxos_service[PAXOS_MDSMAP]; + if (name == "monmap") + return paxos_service[PAXOS_MONMAP]; + if (name == "osdmap") + return paxos_service[PAXOS_OSDMAP]; + if (name == "pgmap") + return paxos_service[PAXOS_PGMAP]; + if (name == "logm") + return paxos_service[PAXOS_LOG]; + if (name == "auth") + return paxos_service[PAXOS_AUTH]; + + assert(0 == "given name does not match known paxos service"); + return NULL; +} + +Monitor::~Monitor() +{ + for (vector::iterator p = paxos_service.begin(); p != paxos_service.end(); ++p) + delete *p; + delete health_monitor; + delete config_key_service; + delete paxos; + assert(session_map.sessions.empty()); + delete mon_caps; + if (leader_supported_mon_commands != mon_commands && + leader_supported_mon_commands != classic_mon_commands) + delete[] leader_supported_mon_commands; +} + + +enum { + l_mon_first = 456000, + l_mon_last, +}; + + +class AdminHook : public AdminSocketHook { + Monitor *mon; +public: + AdminHook(Monitor *m) : mon(m) {} + bool call(std::string command, cmdmap_t& cmdmap, std::string format, + bufferlist& out) { + stringstream ss; + mon->do_admin_command(command, cmdmap, format, ss); + out.append(ss); + return true; + } +}; + +void Monitor::do_admin_command(string command, cmdmap_t& cmdmap, string format, + ostream& ss) +{ + Mutex::Locker l(lock); + + boost::scoped_ptr f(new_formatter(format)); + + if (command == "mon_status") { + get_mon_status(f.get(), ss); + if (f) + f->flush(ss); + } else if (command == "quorum_status") + _quorum_status(f.get(), ss); + else if (command == "sync_force") { + string validate; + if ((!cmd_getval(g_ceph_context, cmdmap, "validate", validate)) || + (validate != "--yes-i-really-mean-it")) { + ss << "are you SURE? this will mean the monitor store will be erased " + "the next time the monitor is restarted. 
pass " + "'--yes-i-really-mean-it' if you really do."; + return; + } + sync_force(f.get(), ss); + } else if (command.find("add_bootstrap_peer_hint") == 0) { + _add_bootstrap_peer_hint(command, cmdmap, ss); + } else if (command.find("osdmonitor_prepare_command") == 0) { + _osdmonitor_prepare_command(cmdmap, ss); + } else if (command == "quorum enter") { + elector.start_participating(); + start_election(); + ss << "started responding to quorum, initiated new election"; + } else if (command == "quorum exit") { + start_election(); + elector.stop_participating(); + ss << "stopped responding to quorum, initiated new election"; + } else + assert(0 == "bad AdminSocket command binding"); +} + +void Monitor::handle_signal(int signum) +{ + assert(signum == SIGINT || signum == SIGTERM); + derr << "*** Got Signal " << sys_siglist[signum] << " ***" << dendl; + shutdown(); +} + +CompatSet Monitor::get_supported_features() +{ + CompatSet::FeatureSet ceph_mon_feature_compat; + CompatSet::FeatureSet ceph_mon_feature_ro_compat; + CompatSet::FeatureSet ceph_mon_feature_incompat; + ceph_mon_feature_incompat.insert(CEPH_MON_FEATURE_INCOMPAT_BASE); + ceph_mon_feature_incompat.insert(CEPH_MON_FEATURE_INCOMPAT_SINGLE_PAXOS); + ceph_mon_feature_incompat.insert(CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES); + ceph_mon_feature_incompat.insert(CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC); + return CompatSet(ceph_mon_feature_compat, ceph_mon_feature_ro_compat, + ceph_mon_feature_incompat); +} + +CompatSet Monitor::get_legacy_features() +{ + CompatSet::FeatureSet ceph_mon_feature_compat; + CompatSet::FeatureSet ceph_mon_feature_ro_compat; + CompatSet::FeatureSet ceph_mon_feature_incompat; + ceph_mon_feature_incompat.insert(CEPH_MON_FEATURE_INCOMPAT_BASE); + return CompatSet(ceph_mon_feature_compat, ceph_mon_feature_ro_compat, + ceph_mon_feature_incompat); +} + +int Monitor::check_features(MonitorDBStore *store) +{ + CompatSet required = get_supported_features(); + CompatSet ondisk; + + read_features_off_disk(store, &ondisk); + + if (!required.writeable(ondisk)) { + CompatSet diff = required.unsupported(ondisk); + generic_derr << "ERROR: on disk data includes unsupported features: " << diff << dendl; + return -EPERM; + } + + return 0; +} + +void Monitor::read_features_off_disk(MonitorDBStore *store, CompatSet *features) +{ + bufferlist featuresbl; + store->get(MONITOR_NAME, COMPAT_SET_LOC, featuresbl); + if (featuresbl.length() == 0) { + generic_dout(0) << "WARNING: mon fs missing feature list.\n" + << "Assuming it is old-style and introducing one." << dendl; + //we only want the baseline ~v.18 features assumed to be on disk. + //If new features are introduced this code needs to disappear or + //be made smarter. 
+ *features = get_legacy_features(); + + bufferlist bl; + features->encode(bl); + MonitorDBStore::Transaction t; + t.put(MONITOR_NAME, COMPAT_SET_LOC, bl); + store->apply_transaction(t); + } else { + bufferlist::iterator it = featuresbl.begin(); + features->decode(it); + } +} + +void Monitor::read_features() +{ + read_features_off_disk(store, &features); + dout(10) << "features " << features << dendl; + + apply_compatset_features_to_quorum_requirements(); + dout(10) << "required_features " << required_features << dendl; +} + +void Monitor::write_features(MonitorDBStore::Transaction &t) +{ + bufferlist bl; + features.encode(bl); + t.put(MONITOR_NAME, COMPAT_SET_LOC, bl); +} + +int Monitor::preinit() +{ + lock.Lock(); + + dout(1) << "preinit fsid " << monmap->fsid << dendl; + + assert(!logger); + { + PerfCountersBuilder pcb(g_ceph_context, "mon", l_mon_first, l_mon_last); + // ... + logger = pcb.create_perf_counters(); + cct->get_perfcounters_collection()->add(logger); + } + + assert(!cluster_logger); + { + PerfCountersBuilder pcb(g_ceph_context, "cluster", l_cluster_first, l_cluster_last); + pcb.add_u64(l_cluster_num_mon, "num_mon"); + pcb.add_u64(l_cluster_num_mon_quorum, "num_mon_quorum"); + pcb.add_u64(l_cluster_num_osd, "num_osd"); + pcb.add_u64(l_cluster_num_osd_up, "num_osd_up"); + pcb.add_u64(l_cluster_num_osd_in, "num_osd_in"); + pcb.add_u64(l_cluster_osd_epoch, "osd_epoch"); + pcb.add_u64(l_cluster_osd_kb, "osd_kb"); + pcb.add_u64(l_cluster_osd_kb_used, "osd_kb_used"); + pcb.add_u64(l_cluster_osd_kb_avail, "osd_kb_avail"); + pcb.add_u64(l_cluster_num_pool, "num_pool"); + pcb.add_u64(l_cluster_num_pg, "num_pg"); + pcb.add_u64(l_cluster_num_pg_active_clean, "num_pg_active_clean"); + pcb.add_u64(l_cluster_num_pg_active, "num_pg_active"); + pcb.add_u64(l_cluster_num_pg_peering, "num_pg_peering"); + pcb.add_u64(l_cluster_num_object, "num_object"); + pcb.add_u64(l_cluster_num_object_degraded, "num_object_degraded"); + pcb.add_u64(l_cluster_num_object_unfound, "num_object_unfound"); + pcb.add_u64(l_cluster_num_bytes, "num_bytes"); + pcb.add_u64(l_cluster_num_mds_up, "num_mds_up"); + pcb.add_u64(l_cluster_num_mds_in, "num_mds_in"); + pcb.add_u64(l_cluster_num_mds_failed, "num_mds_failed"); + pcb.add_u64(l_cluster_mds_epoch, "mds_epoch"); + cluster_logger = pcb.create_perf_counters(); + } + + // verify cluster_uuid + { + int r = check_fsid(); + if (r == -ENOENT) + r = write_fsid(); + if (r < 0) { + lock.Unlock(); + return r; + } + } + + // open compatset + read_features(); + + // have we ever joined a quorum? + has_ever_joined = (store->get(MONITOR_NAME, "joined") != 0); + dout(10) << "has_ever_joined = " << (int)has_ever_joined << dendl; + + if (!has_ever_joined) { + // impose initial quorum restrictions? + list initial_members; + get_str_list(g_conf->mon_initial_members, initial_members); + + if (!initial_members.empty()) { + dout(1) << " initial_members " << initial_members << ", filtering seed monmap" << dendl; + + monmap->set_initial_members(g_ceph_context, initial_members, name, messenger->get_myaddr(), + &extra_probe_peers); + + dout(10) << " monmap is " << *monmap << dendl; + dout(10) << " extra probe peers " << extra_probe_peers << dendl; + } + } else if (!monmap->contains(name)) { + derr << "not in monmap and have been in a quorum before; " + << "must have been removed" << dendl; + if (g_conf->mon_force_quorum_join) { + dout(0) << "we should have died but " + << "'mon_force_quorum_join' is set -- allowing boot" << dendl; + } else { + derr << "commit suicide!" 
<< dendl; + return -ENOENT; + } + } + + { + // We have a potentially inconsistent store state in hands. Get rid of it + // and start fresh. + bool clear_store = false; + if (store->exists("mon_sync", "in_sync")) { + dout(1) << __func__ << " clean up potentially inconsistent store state" + << dendl; + clear_store = true; + } + + if (store->get("mon_sync", "force_sync") > 0) { + dout(1) << __func__ << " force sync by clearing store state" << dendl; + clear_store = true; + } + + if (clear_store) { + set sync_prefixes = get_sync_targets_names(); + store->clear(sync_prefixes); + } + } + + sync_last_committed_floor = store->get("mon_sync", "last_committed_floor"); + dout(10) << "sync_last_committed_floor " << sync_last_committed_floor << dendl; + + init_paxos(); + health_monitor->init(); + + int r; + + if (is_keyring_required()) { + // we need to bootstrap authentication keys so we can form an + // initial quorum. + if (authmon()->get_last_committed() == 0) { + dout(10) << "loading initial keyring to bootstrap authentication for mkfs" << dendl; + bufferlist bl; + store->get("mkfs", "keyring", bl); + KeyRing keyring; + bufferlist::iterator p = bl.begin(); + ::decode(keyring, p); + extract_save_mon_key(keyring); + } + + string keyring_loc = g_conf->mon_data + "/keyring"; + + r = keyring.load(cct, keyring_loc); + if (r < 0) { + EntityName mon_name; + mon_name.set_type(CEPH_ENTITY_TYPE_MON); + EntityAuth mon_key; + if (key_server.get_auth(mon_name, mon_key)) { + dout(1) << "copying mon. key from old db to external keyring" << dendl; + keyring.add(mon_name, mon_key); + bufferlist bl; + keyring.encode_plaintext(bl); + write_default_keyring(bl); + } else { + derr << "unable to load initial keyring " << g_conf->keyring << dendl; + lock.Unlock(); + return r; + } + } + } + + admin_hook = new AdminHook(this); + AdminSocket* admin_socket = cct->get_admin_socket(); + + // unlock while registering to avoid mon_lock -> admin socket lock dependency. + lock.Unlock(); + r = admin_socket->register_command("mon_status", "mon_status", admin_hook, + "show current monitor status"); + assert(r == 0); + if (g_conf->mon_advanced_debug_mode) { + r = admin_socket->register_command("osdmonitor_prepare_command", "osdmonitor_prepare_command", admin_hook, + "call OSDMonitor::prepare_command"); + assert(r == 0); + } + r = admin_socket->register_command("quorum_status", "quorum_status", + admin_hook, "show current quorum status"); + assert(r == 0); + r = admin_socket->register_command("sync_force", + "sync_force name=validate," + "type=CephChoices," + "strings=--yes-i-really-mean-it", + admin_hook, + "force sync of and clear monitor store"); + assert(r == 0); + r = admin_socket->register_command("add_bootstrap_peer_hint", + "add_bootstrap_peer_hint name=addr," + "type=CephIPAddr", + admin_hook, + "add peer address as potential bootstrap" + " peer for cluster bringup"); + assert(r == 0); + r = admin_socket->register_command("quorum enter", "quorum enter", + admin_hook, + "force monitor back into quorum"); + assert(r == 0); + r = admin_socket->register_command("quorum exit", "quorum exit", + admin_hook, + "force monitor out of the quorum"); + assert(r == 0); + lock.Lock(); + + lock.Unlock(); + return 0; +} + +int Monitor::init() +{ + dout(2) << "init" << dendl; + lock.Lock(); + + // start ticker + timer.init(); + new_tick(); + + // i'm ready! 
+ messenger->add_dispatcher_tail(this); + + bootstrap(); + + // encode command sets + const MonCommand *cmds; + int cmdsize; + get_locally_supported_monitor_commands(&cmds, &cmdsize); + MonCommand::encode_array(cmds, cmdsize, supported_commands_bl); + get_classic_monitor_commands(&cmds, &cmdsize); + MonCommand::encode_array(cmds, cmdsize, classic_commands_bl); + + lock.Unlock(); + return 0; +} + +void Monitor::init_paxos() +{ + dout(10) << __func__ << dendl; + paxos->init(); + + // init services + for (int i = 0; i < PAXOS_NUM; ++i) { + paxos_service[i]->init(); + } + + refresh_from_paxos(NULL); +} + +void Monitor::refresh_from_paxos(bool *need_bootstrap) +{ + dout(10) << __func__ << dendl; + + bufferlist bl; + int r = store->get(MONITOR_NAME, "cluster_fingerprint", bl); + if (r >= 0) { + try { + bufferlist::iterator p = bl.begin(); + ::decode(fingerprint, p); + } + catch (buffer::error& e) { + dout(10) << __func__ << " failed to decode cluster_fingerprint" << dendl; + } + } else { + dout(10) << __func__ << " no cluster_fingerprint" << dendl; + } + + for (int i = 0; i < PAXOS_NUM; ++i) { + paxos_service[i]->refresh(need_bootstrap); + } + for (int i = 0; i < PAXOS_NUM; ++i) { + paxos_service[i]->post_paxos_update(); + } +} + +void Monitor::register_cluster_logger() +{ + if (!cluster_logger_registered) { + dout(10) << "register_cluster_logger" << dendl; + cluster_logger_registered = true; + cct->get_perfcounters_collection()->add(cluster_logger); + } else { + dout(10) << "register_cluster_logger - already registered" << dendl; + } +} + +void Monitor::unregister_cluster_logger() +{ + if (cluster_logger_registered) { + dout(10) << "unregister_cluster_logger" << dendl; + cluster_logger_registered = false; + cct->get_perfcounters_collection()->remove(cluster_logger); + } else { + dout(10) << "unregister_cluster_logger - not registered" << dendl; + } +} + +void Monitor::update_logger() +{ + cluster_logger->set(l_cluster_num_mon, monmap->size()); + cluster_logger->set(l_cluster_num_mon_quorum, quorum.size()); +} + +void Monitor::shutdown() +{ + dout(1) << "shutdown" << dendl; + lock.Lock(); + + state = STATE_SHUTDOWN; + + if (admin_hook) { + AdminSocket* admin_socket = cct->get_admin_socket(); + admin_socket->unregister_command("mon_status"); + admin_socket->unregister_command("quorum_status"); + admin_socket->unregister_command("sync_force"); + admin_socket->unregister_command("add_bootstrap_peer_hint"); + delete admin_hook; + admin_hook = NULL; + } + + elector.shutdown(); + + if (logger) { + cct->get_perfcounters_collection()->remove(logger); + delete logger; + logger = NULL; + } + if (cluster_logger) { + if (cluster_logger_registered) + cct->get_perfcounters_collection()->remove(cluster_logger); + delete cluster_logger; + cluster_logger = NULL; + } + + // clean up + paxos->shutdown(); + for (vector::iterator p = paxos_service.begin(); p != paxos_service.end(); ++p) + (*p)->shutdown(); + health_monitor->shutdown(); + + finish_contexts(g_ceph_context, waitfor_quorum, -ECANCELED); + finish_contexts(g_ceph_context, maybe_wait_for_quorum, -ECANCELED); + + timer.shutdown(); + + remove_all_sessions(); + + // unlock before msgr shutdown... + lock.Unlock(); + + messenger->shutdown(); // last thing! ceph_mon.cc will delete mon. 
+} + +void Monitor::bootstrap() +{ + dout(10) << "bootstrap" << dendl; + + sync_reset_requester(); + unregister_cluster_logger(); + cancel_probe_timeout(); + + // note my rank + int newrank = monmap->get_rank(messenger->get_myaddr()); + if (newrank < 0 && rank >= 0) { + // was i ever part of the quorum? + if (has_ever_joined) { + dout(0) << " removed from monmap, suicide." << dendl; + exit(0); + } + } + if (newrank != rank) { + dout(0) << " my rank is now " << newrank << " (was " << rank << ")" << dendl; + messenger->set_myname(entity_name_t::MON(newrank)); + rank = newrank; + + // reset all connections, or else our peers will think we are someone else. + messenger->mark_down_all(); + } + + // reset + state = STATE_PROBING; + + _reset(); + + // sync store + if (g_conf->mon_compact_on_bootstrap) { + dout(10) << "bootstrap -- triggering compaction" << dendl; + store->compact(); + dout(10) << "bootstrap -- finished compaction" << dendl; + } + + // singleton monitor? + if (monmap->size() == 1 && rank == 0) { + win_standalone_election(); + return; + } + + reset_probe_timeout(); + + // i'm outside the quorum + if (monmap->contains(name)) + outside_quorum.insert(name); + + // probe monitors + dout(10) << "probing other monitors" << dendl; + for (unsigned i = 0; i < monmap->size(); i++) { + if ((int)i != rank) + messenger->send_message(new MMonProbe(monmap->fsid, MMonProbe::OP_PROBE, name, has_ever_joined), + monmap->get_inst(i)); + } + for (set::iterator p = extra_probe_peers.begin(); + p != extra_probe_peers.end(); + ++p) { + if (*p != messenger->get_myaddr()) { + entity_inst_t i; + i.name = entity_name_t::MON(-1); + i.addr = *p; + messenger->send_message(new MMonProbe(monmap->fsid, MMonProbe::OP_PROBE, name, has_ever_joined), i); + } + } +} + +void Monitor::_osdmonitor_prepare_command(cmdmap_t& cmdmap, ostream& ss) +{ + if (!is_leader()) { + ss << "mon must be a leader"; + return; + } + + string cmd; + cmd_getval(g_ceph_context, cmdmap, "prepare", cmd); + cmdmap["prefix"] = cmdmap["prepare"]; + + OSDMonitor *monitor = osdmon(); + MMonCommand *m = static_cast((new MMonCommand())->get()); + if (monitor->prepare_command_impl(m, cmdmap)) + ss << "true"; + else + ss << "false"; + m->put(); +} + +void Monitor::_add_bootstrap_peer_hint(string cmd, cmdmap_t& cmdmap, ostream& ss) +{ + string addrstr; + if (!cmd_getval(g_ceph_context, cmdmap, "addr", addrstr)) { + ss << "unable to parse address string value '" + << cmd_vartype_stringify(cmdmap["addr"]) << "'"; + return; + } + dout(10) << "_add_bootstrap_peer_hint '" << cmd << "' '" + << addrstr << "'" << dendl; + + entity_addr_t addr; + const char *end = 0; + if (!addr.parse(addrstr.c_str(), &end)) { + ss << "failed to parse addr '" << addrstr << "'; syntax is 'add_bootstrap_peer_hint ip[:port]'"; + return; + } + + if (is_leader() || is_peon()) { + ss << "mon already active; ignoring bootstrap hint"; + return; + } + + if (addr.get_port() == 0) + addr.set_port(CEPH_MON_PORT); + + extra_probe_peers.insert(addr); + ss << "adding peer " << addr << " to list: " << extra_probe_peers; +} + +// called by bootstrap(), or on leader|peon -> electing +void Monitor::_reset() +{ + dout(10) << __func__ << dendl; + + assert(state == STATE_ELECTING || + state == STATE_PROBING); + + cancel_probe_timeout(); + timecheck_finish(); + + leader_since = utime_t(); + if (!quorum.empty()) { + exited_quorum = ceph_clock_now(g_ceph_context); + } + quorum.clear(); + outside_quorum.clear(); + + scrub_reset(); + + paxos->restart(); + + for (vector::iterator p = paxos_service.begin(); p 
!= paxos_service.end(); ++p) + (*p)->restart(); + health_monitor->finish(); +} + + +// ----------------------------------------------------------- +// sync + +set Monitor::get_sync_targets_names() +{ + set targets; + targets.insert(paxos->get_name()); + for (int i = 0; i < PAXOS_NUM; ++i) + paxos_service[i]->get_store_prefixes(targets); + + return targets; +} + + +void Monitor::sync_timeout() +{ + dout(10) << __func__ << dendl; + assert(state == STATE_SYNCHRONIZING); + bootstrap(); +} + +void Monitor::sync_obtain_latest_monmap(bufferlist &bl) +{ + dout(1) << __func__ << dendl; + + MonMap latest_monmap; + + // Grab latest monmap from MonmapMonitor + bufferlist monmon_bl; + int err = monmon()->get_monmap(monmon_bl); + if (err < 0) { + if (err != -ENOENT) { + derr << __func__ + << " something wrong happened while reading the store: " + << cpp_strerror(err) << dendl; + assert(0 == "error reading the store"); + } + } else { + latest_monmap.decode(monmon_bl); + } + + // Grab last backed up monmap (if any) and compare epochs + if (store->exists("mon_sync", "latest_monmap")) { + bufferlist backup_bl; + int err = store->get("mon_sync", "latest_monmap", backup_bl); + if (err < 0) { + assert(err != -ENOENT); + derr << __func__ + << " something wrong happened while reading the store: " + << cpp_strerror(err) << dendl; + assert(0 == "error reading the store"); + } + assert(backup_bl.length() > 0); + + MonMap backup_monmap; + backup_monmap.decode(backup_bl); + + if (backup_monmap.epoch > latest_monmap.epoch) + latest_monmap = backup_monmap; + } + + // Check if our current monmap's epoch is greater than the one we've + // got so far. + if (monmap->epoch > latest_monmap.epoch) + latest_monmap = *monmap; + + dout(1) << __func__ << " obtained monmap e" << latest_monmap.epoch << dendl; + + latest_monmap.encode(bl, quorum_features); +} + +void Monitor::sync_reset_requester() +{ + dout(10) << __func__ << dendl; + + if (sync_timeout_event) { + timer.cancel_event(sync_timeout_event); + sync_timeout_event = NULL; + } + + sync_provider = entity_inst_t(); + sync_cookie = 0; + sync_full = false; + sync_start_version = 0; +} + +void Monitor::sync_reset_provider() +{ + dout(10) << __func__ << dendl; + sync_providers.clear(); +} + +void Monitor::sync_start(entity_inst_t &other, bool full) +{ + dout(10) << __func__ << " " << other << (full ? " full" : " recent") << dendl; + + assert(state == STATE_PROBING || + state == STATE_SYNCHRONIZING); + state = STATE_SYNCHRONIZING; + + // make sure are not a provider for anyone! + sync_reset_provider(); + + sync_full = full; + + if (sync_full) { + // stash key state, and mark that we are syncing + MonitorDBStore::Transaction t; + sync_stash_critical_state(&t); + t.put("mon_sync", "in_sync", 1); + + sync_last_committed_floor = MAX(sync_last_committed_floor, paxos->get_version()); + dout(10) << __func__ << " marking sync in progress, storing sync_last_committed_floor " + << sync_last_committed_floor << dendl; + t.put("mon_sync", "last_committed_floor", sync_last_committed_floor); + + store->apply_transaction(t); + + assert(g_conf->mon_sync_requester_kill_at != 1); + + // clear the underlying store + set targets = get_sync_targets_names(); + dout(10) << __func__ << " clearing prefixes " << targets << dendl; + store->clear(targets); + + // make sure paxos knows it has been reset. this prevents a + // bootstrap and then different probe reply order from possibly + // deciding a partial or no sync is needed. 
+ paxos->init(); + + assert(g_conf->mon_sync_requester_kill_at != 2); + } + + // assume 'other' as the leader. We will update the leader once we receive + // a reply to the sync start. + sync_provider = other; + + sync_reset_timeout(); + + MMonSync *m = new MMonSync(sync_full ? MMonSync::OP_GET_COOKIE_FULL : MMonSync::OP_GET_COOKIE_RECENT); + if (!sync_full) + m->last_committed = paxos->get_version(); + messenger->send_message(m, sync_provider); +} + +void Monitor::sync_stash_critical_state(MonitorDBStore::Transaction *t) +{ + dout(10) << __func__ << dendl; + bufferlist backup_monmap; + sync_obtain_latest_monmap(backup_monmap); + assert(backup_monmap.length() > 0); + t->put("mon_sync", "latest_monmap", backup_monmap); +} + +void Monitor::sync_reset_timeout() +{ + dout(10) << __func__ << dendl; + if (sync_timeout_event) + timer.cancel_event(sync_timeout_event); + sync_timeout_event = new C_SyncTimeout(this); + timer.add_event_after(g_conf->mon_sync_timeout, sync_timeout_event); +} + +void Monitor::sync_finish(version_t last_committed) +{ + dout(10) << __func__ << " lc " << last_committed << " from " << sync_provider << dendl; + + assert(g_conf->mon_sync_requester_kill_at != 7); + + if (sync_full) { + // finalize the paxos commits + MonitorDBStore::Transaction tx; + paxos->read_and_prepare_transactions(&tx, sync_start_version, last_committed); + tx.put(paxos->get_name(), "last_committed", last_committed); + + dout(30) << __func__ << " final tx dump:\n"; + JSONFormatter f(true); + tx.dump(&f); + f.flush(*_dout); + *_dout << dendl; + + store->apply_transaction(tx); + } + + assert(g_conf->mon_sync_requester_kill_at != 8); + + MonitorDBStore::Transaction t; + t.erase("mon_sync", "in_sync"); + t.erase("mon_sync", "force_sync"); + t.erase("mon_sync", "last_committed_floor"); + store->apply_transaction(t); + + assert(g_conf->mon_sync_requester_kill_at != 9); + + init_paxos(); + + assert(g_conf->mon_sync_requester_kill_at != 10); + + bootstrap(); +} + +void Monitor::handle_sync(MMonSync *m) +{ + dout(10) << __func__ << " " << *m << dendl; + switch (m->op) { + + // provider --------- + + case MMonSync::OP_GET_COOKIE_FULL: + case MMonSync::OP_GET_COOKIE_RECENT: + handle_sync_get_cookie(m); + break; + case MMonSync::OP_GET_CHUNK: + handle_sync_get_chunk(m); + break; + + // client ----------- + + case MMonSync::OP_COOKIE: + handle_sync_cookie(m); + break; + + case MMonSync::OP_CHUNK: + case MMonSync::OP_LAST_CHUNK: + handle_sync_chunk(m); + break; + case MMonSync::OP_NO_COOKIE: + handle_sync_no_cookie(m); + break; + + default: + dout(0) << __func__ << " unknown op " << m->op << dendl; + assert(0 == "unknown op"); + } + m->put(); +} + +// leader + +void Monitor::_sync_reply_no_cookie(MMonSync *m) +{ + MMonSync *reply = new MMonSync(MMonSync::OP_NO_COOKIE, m->cookie); + messenger->send_message(reply, m->get_connection()); +} + +void Monitor::handle_sync_get_cookie(MMonSync *m) +{ + if (is_synchronizing()) { + _sync_reply_no_cookie(m); + return; + } + + assert(g_conf->mon_sync_provider_kill_at != 1); + + // make sure they can understand us. + if ((required_features ^ m->get_connection()->get_features()) & + required_features) { + dout(5) << " ignoring peer mon." << m->get_source().num() + << " has features " << std::hex + << m->get_connection()->get_features() + << " but we require " << required_features << std::dec << dendl; + return; + } + + // make up a unique cookie. include election epoch (which persists + // across restarts for the whole cluster) and a counter for this + // process instance. 
there is no need to be unique *across* + // monitors, though. + uint64_t cookie = ((unsigned long long)elector.get_epoch() << 24) + ++sync_provider_count; + assert(sync_providers.count(cookie) == 0); + + dout(10) << __func__ << " cookie " << cookie << " for " << m->get_source_inst() << dendl; + + SyncProvider& sp = sync_providers[cookie]; + sp.cookie = cookie; + sp.entity = m->get_source_inst(); + sp.reset_timeout(g_ceph_context, g_conf->mon_sync_timeout * 2); + + set sync_targets; + if (m->op == MMonSync::OP_GET_COOKIE_FULL) { + // full scan + sync_targets = get_sync_targets_names(); + sp.last_committed = paxos->get_version(); + sp.synchronizer = store->get_synchronizer(sp.last_key, sync_targets); + sp.full = true; + dout(10) << __func__ << " will sync prefixes " << sync_targets << dendl; + } else { + // just catch up paxos + sp.last_committed = m->last_committed; + } + dout(10) << __func__ << " will sync from version " << sp.last_committed << dendl; + + MMonSync *reply = new MMonSync(MMonSync::OP_COOKIE, sp.cookie); + reply->last_committed = sp.last_committed; + messenger->send_message(reply, m->get_connection()); +} + +void Monitor::handle_sync_get_chunk(MMonSync *m) +{ + dout(10) << __func__ << " " << *m << dendl; + + if (sync_providers.count(m->cookie) == 0) { + dout(10) << __func__ << " no cookie " << m->cookie << dendl; + _sync_reply_no_cookie(m); + return; + } + + assert(g_conf->mon_sync_provider_kill_at != 2); + + SyncProvider& sp = sync_providers[m->cookie]; + sp.reset_timeout(g_ceph_context, g_conf->mon_sync_timeout * 2); + + if (sp.last_committed < paxos->get_first_committed() && + paxos->get_first_committed() > 1) { + dout(10) << __func__ << " sync requester fell behind paxos, their lc " << sp.last_committed + << " < our fc " << paxos->get_first_committed() << dendl; + sync_providers.erase(m->cookie); + _sync_reply_no_cookie(m); + return; + } + + MMonSync *reply = new MMonSync(MMonSync::OP_CHUNK, sp.cookie); + MonitorDBStore::Transaction tx; + + int left = g_conf->mon_sync_max_payload_size; + while (sp.last_committed < paxos->get_version() && left > 0) { + bufferlist bl; + sp.last_committed++; + store->get(paxos->get_name(), sp.last_committed, bl); + tx.put(paxos->get_name(), sp.last_committed, bl); + left -= bl.length(); + dout(20) << __func__ << " including paxos state " << sp.last_committed << dendl; + } + reply->last_committed = sp.last_committed; + + if (sp.full && left > 0) { + sp.synchronizer->get_chunk_tx(tx, left); + sp.last_key = sp.synchronizer->get_last_key(); + reply->last_key = sp.last_key; + } + + if ((sp.full && sp.synchronizer->has_next_chunk()) || + sp.last_committed < paxos->get_version()) { + dout(10) << __func__ << " chunk, through version " << sp.last_committed << " key " << sp.last_key << dendl; + } else { + dout(10) << __func__ << " last chunk, through version " << sp.last_committed << " key " << sp.last_key << dendl; + reply->op = MMonSync::OP_LAST_CHUNK; + + assert(g_conf->mon_sync_provider_kill_at != 3); + + // clean up our local state + sync_providers.erase(sp.cookie); + } + + ::encode(tx, reply->chunk_bl); + + messenger->send_message(reply, m->get_connection()); +} + +// requester + +void Monitor::handle_sync_cookie(MMonSync *m) +{ + dout(10) << __func__ << " " << *m << dendl; + if (sync_cookie) { + dout(10) << __func__ << " already have a cookie, ignoring" << dendl; + return; + } + if (m->get_source_inst() != sync_provider) { + dout(10) << __func__ << " source does not match, discarding" << dendl; + return; + } + sync_cookie = m->cookie; + 
+  sync_start_version = m->last_committed;
+
+  sync_reset_timeout();
+  sync_get_next_chunk();
+
+  assert(g_conf->mon_sync_requester_kill_at != 3);
+}
+
+void Monitor::sync_get_next_chunk()
+{
+  dout(20) << __func__ << " cookie " << sync_cookie << " provider " << sync_provider << dendl;
+  if (g_conf->mon_inject_sync_get_chunk_delay > 0) {
+    dout(20) << __func__ << " injecting delay of " << g_conf->mon_inject_sync_get_chunk_delay << dendl;
+    usleep((long long)(g_conf->mon_inject_sync_get_chunk_delay * 1000000.0));
+  }
+  MMonSync *r = new MMonSync(MMonSync::OP_GET_CHUNK, sync_cookie);
+  messenger->send_message(r, sync_provider);
+
+  assert(g_conf->mon_sync_requester_kill_at != 4);
+}
+
+void Monitor::handle_sync_chunk(MMonSync *m)
+{
+  dout(10) << __func__ << " " << *m << dendl;
+
+  if (m->cookie != sync_cookie) {
+    dout(10) << __func__ << " cookie does not match, discarding" << dendl;
+    return;
+  }
+  if (m->get_source_inst() != sync_provider) {
+    dout(10) << __func__ << " source does not match, discarding" << dendl;
+    return;
+  }
+
+  assert(state == STATE_SYNCHRONIZING);
+  assert(g_conf->mon_sync_requester_kill_at != 5);
+
+  MonitorDBStore::Transaction tx;
+  tx.append_from_encoded(m->chunk_bl);
+
+  dout(30) << __func__ << " tx dump:\n";
+  JSONFormatter f(true);
+  tx.dump(&f);
+  f.flush(*_dout);
+  *_dout << dendl;
+
+  store->apply_transaction(tx);
+
+  assert(g_conf->mon_sync_requester_kill_at != 6);
+
+  if (!sync_full) {
+    dout(10) << __func__ << " applying recent paxos transactions as we go" << dendl;
+    MonitorDBStore::Transaction tx;
+    paxos->read_and_prepare_transactions(&tx, paxos->get_version() + 1, m->last_committed);
+    tx.put(paxos->get_name(), "last_committed", m->last_committed);
+
+    dout(30) << __func__ << " tx dump:\n";
+    JSONFormatter f(true);
+    tx.dump(&f);
+    f.flush(*_dout);
+    *_dout << dendl;
+
+    store->apply_transaction(tx);
+    paxos->init();  // to refresh what we just wrote
+  }
+
+  if (m->op == MMonSync::OP_CHUNK) {
+    sync_reset_timeout();
+    sync_get_next_chunk();
+  } else if (m->op == MMonSync::OP_LAST_CHUNK) {
+    sync_finish(m->last_committed);
+  }
+}
+
+void Monitor::handle_sync_no_cookie(MMonSync *m)
+{
+  dout(10) << __func__ << dendl;
+  bootstrap();
+}
+
+void Monitor::sync_trim_providers()
+{
+  dout(20) << __func__ << dendl;
+
+  utime_t now = ceph_clock_now(g_ceph_context);
+  map<uint64_t,SyncProvider>::iterator p = sync_providers.begin();
+  while (p != sync_providers.end()) {
+    if (now > p->second.timeout) {
+      dout(10) << __func__ << " expiring cookie " << p->second.cookie << " for " << p->second.entity << dendl;
+      sync_providers.erase(p++);
+    } else {
+      ++p;
+    }
+  }
+}
+
+// ---------------------------------------------------
+// probe
+
+void Monitor::cancel_probe_timeout()
+{
+  if (probe_timeout_event) {
+    dout(10) << "cancel_probe_timeout " << probe_timeout_event << dendl;
+    timer.cancel_event(probe_timeout_event);
+    probe_timeout_event = NULL;
+  } else {
+    dout(10) << "cancel_probe_timeout (none scheduled)" << dendl;
+  }
+}
+
+void Monitor::reset_probe_timeout()
+{
+  cancel_probe_timeout();
+  probe_timeout_event = new C_ProbeTimeout(this);
+  double t = g_conf->mon_probe_timeout;
+  timer.add_event_after(t, probe_timeout_event);
+  dout(10) << "reset_probe_timeout " << probe_timeout_event << " after " << t << " seconds" << dendl;
+}
+
+void Monitor::probe_timeout(int r)
+{
+  dout(4) << "probe_timeout " << probe_timeout_event << dendl;
+  assert(is_probing() || is_synchronizing());
+  assert(probe_timeout_event);
+  probe_timeout_event = NULL;
+  bootstrap();
+}
+
+void Monitor::handle_probe(MMonProbe *m)
+{
+  dout(10) << "handle_probe " << *m << dendl;
+
+  if (m->fsid != monmap->fsid) {
+    dout(0) << "handle_probe ignoring fsid " << m->fsid << " != " << monmap->fsid << dendl;
+    m->put();
+    return;
+  }
+
+  switch (m->op) {
+  case MMonProbe::OP_PROBE:
+    handle_probe_probe(m);
+    break;
+
+  case MMonProbe::OP_REPLY:
+    handle_probe_reply(m);
+    break;
+
+  case MMonProbe::OP_MISSING_FEATURES:
+    derr << __func__ << " missing features, have " << CEPH_FEATURES_ALL
+         << ", required " << required_features
+         << ", missing " << (required_features & ~CEPH_FEATURES_ALL)
+         << dendl;
+    break;
+
+  default:
+    m->put();
+  }
+}
+
+/**
+ * @todo fix this. This is going to cause trouble.
+ */
+void Monitor::handle_probe_probe(MMonProbe *m)
+{
+  dout(10) << "handle_probe_probe " << m->get_source_inst() << *m
+           << " features " << m->get_connection()->get_features() << dendl;
+  uint64_t missing = required_features & ~m->get_connection()->get_features();
+  if (missing) {
+    dout(1) << " peer " << m->get_source_addr() << " missing features "
+            << missing << dendl;
+    if (m->get_connection()->has_feature(CEPH_FEATURE_OSD_PRIMARY_AFFINITY)) {
+      MMonProbe *r = new MMonProbe(monmap->fsid, MMonProbe::OP_MISSING_FEATURES,
+                                   name, has_ever_joined);
+      r->required_features = required_features;
+      messenger->send_message(r, m->get_connection());
+    }
+    m->put();
+    return;
+  }
+
+  MMonProbe *r = new MMonProbe(monmap->fsid, MMonProbe::OP_REPLY,
+                               name, has_ever_joined);
+  r->name = name;
+  r->quorum = quorum;
+  monmap->encode(r->monmap_bl, m->get_connection()->get_features());
+  r->paxos_first_version = paxos->get_first_committed();
+  r->paxos_last_version = paxos->get_version();
+  messenger->send_message(r, m->get_connection());
+
+  // did we discover a peer here?
+  if (!monmap->contains(m->get_source_addr())) {
+    dout(1) << " adding peer " << m->get_source_addr()
+            << " to list of hints" << dendl;
+    extra_probe_peers.insert(m->get_source_addr());
+  }
+
+  m->put();
+}
+
+void Monitor::handle_probe_reply(MMonProbe *m)
+{
+  dout(10) << "handle_probe_reply " << m->get_source_inst() << *m << dendl;
+  dout(10) << " monmap is " << *monmap << dendl;
+
+  // discover name and addrs during probing or electing states.
+  if (!is_probing() && !is_electing()) {
+    m->put();
+    return;
+  }
+
+  // newer map, or they've joined a quorum and we haven't?
+  bufferlist mybl;
+  monmap->encode(mybl, m->get_connection()->get_features());
+  // make sure it's actually different; the checks below err toward
+  // taking the other guy's map, which could cause us to loop.
+  if (!mybl.contents_equal(m->monmap_bl)) {
+    MonMap *newmap = new MonMap;
+    newmap->decode(m->monmap_bl);
+    if (m->has_ever_joined && (newmap->get_epoch() > monmap->get_epoch() ||
+                               !has_ever_joined)) {
+      dout(10) << " got newer/committed monmap epoch " << newmap->get_epoch()
+               << ", mine was " << monmap->get_epoch() << dendl;
+      delete newmap;
+      monmap->decode(m->monmap_bl);
+      m->put();
+
+      bootstrap();
+      return;
+    }
+    delete newmap;
+  }
+
+  // rename peer?
+  string peer_name = monmap->get_name(m->get_source_addr());
+  if (monmap->get_epoch() == 0 && peer_name.find("noname-") == 0) {
+    dout(10) << " renaming peer " << m->get_source_addr() << " "
+             << peer_name << " -> " << m->name << " in my monmap"
+             << dendl;
+    monmap->rename(peer_name, m->name);
+
+    if (is_electing()) {
+      m->put();
+      bootstrap();
+      return;
+    }
+  } else {
+    dout(10) << " peer name is " << peer_name << dendl;
+  }
+
+  // new initial peer?
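+  // (an entry with a blank ip is a monitor we knew by name but not by
+  // address at mkfs time; fill the address in from the probe source.)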
+  if (monmap->get_epoch() == 0 &&
+      monmap->contains(m->name) &&
+      monmap->get_addr(m->name).is_blank_ip()) {
+    dout(1) << " learned initial mon " << m->name << " addr " << m->get_source_addr() << dendl;
+    monmap->set_addr(m->name, m->get_source_addr());
+    m->put();
+
+    bootstrap();
+    return;
+  }
+
+  // end discover phase
+  if (!is_probing()) {
+    m->put();
+    return;
+  }
+
+  assert(paxos != NULL);
+
+  if (is_synchronizing()) {
+    dout(10) << " currently syncing" << dendl;
+    m->put();
+    return;
+  }
+
+  entity_inst_t other = m->get_source_inst();
+
+  if (m->paxos_last_version < sync_last_committed_floor) {
+    dout(10) << " peer paxos versions [" << m->paxos_first_version
+             << "," << m->paxos_last_version << "] < my sync_last_committed_floor "
+             << sync_last_committed_floor << ", ignoring"
+             << dendl;
+  } else {
+    if (paxos->get_version() < m->paxos_first_version &&
+        m->paxos_first_version > 1) {  // no need to sync if we're 0 and they start at 1.
+      dout(10) << " peer paxos versions [" << m->paxos_first_version
+               << "," << m->paxos_last_version << "]"
+               << " vs my version " << paxos->get_version()
+               << " (too far ahead)"
+               << dendl;
+      cancel_probe_timeout();
+      sync_start(other, true);
+      m->put();
+      return;
+    }
+    if (paxos->get_version() + g_conf->paxos_max_join_drift < m->paxos_last_version) {
+      dout(10) << " peer paxos version " << m->paxos_last_version
+               << " vs my version " << paxos->get_version()
+               << " (too far ahead)"
+               << dendl;
+      cancel_probe_timeout();
+      sync_start(other, false);
+      m->put();
+      return;
+    }
+  }
+
+  // is there an existing quorum?
+  if (m->quorum.size()) {
+    dout(10) << " existing quorum " << m->quorum << dendl;
+
+    dout(10) << " peer paxos version " << m->paxos_last_version
+             << " vs my version " << paxos->get_version()
+             << " (ok)"
+             << dendl;
+
+    if (monmap->contains(name) &&
+        !monmap->get_addr(name).is_blank_ip()) {
+      // i'm part of the cluster; just initiate a new election
+      start_election();
+    } else {
+      dout(10) << " ready to join, but i'm not in the monmap or my addr is blank, trying to join" << dendl;
+      messenger->send_message(new MMonJoin(monmap->fsid, name, messenger->get_myaddr()),
+                              monmap->get_inst(*m->quorum.begin()));
+    }
+  } else {
+    if (monmap->contains(m->name)) {
+      dout(10) << " mon." << m->name << " is outside the quorum" << dendl;
+      outside_quorum.insert(m->name);
+    } else {
+      dout(10) << " mostly ignoring mon." << m->name << ", not part of monmap" << dendl;
+      m->put();
+      return;
+    }
+
+    unsigned need = monmap->size() / 2 + 1;
+    dout(10) << " outside_quorum now " << outside_quorum << ", need " << need << dendl;
+    if (outside_quorum.size() >= need) {
+      if (outside_quorum.count(name)) {
+        dout(10) << " that's enough to form a new quorum, calling election" << dendl;
+        start_election();
+      } else {
+        dout(10) << " that's enough to form a new quorum, but it does not include me; waiting" << dendl;
+      }
+    } else {
+      dout(10) << " that's not yet enough for a new quorum, waiting" << dendl;
+    }
+  }
+  m->put();
+}
+
+void Monitor::join_election()
+{
+  dout(10) << __func__ << dendl;
+  state = STATE_ELECTING;
+  _reset();
+}
+
+void Monitor::start_election()
+{
+  dout(10) << "start_election" << dendl;
+  state = STATE_ELECTING;
+  _reset();
+
+  cancel_probe_timeout();
+
<< name << " calling new monitor election\n"; + elector.call_election(); +} + +void Monitor::win_standalone_election() +{ + dout(1) << "win_standalone_election" << dendl; + + // bump election epoch, in case the previous epoch included other + // monitors; we need to be able to make the distinction. + elector.advance_epoch(); + + rank = monmap->get_rank(name); + assert(rank == 0); + set q; + q.insert(rank); + + const MonCommand *my_cmds; + int cmdsize; + get_locally_supported_monitor_commands(&my_cmds, &cmdsize); + win_election(1, q, CEPH_FEATURES_ALL, my_cmds, cmdsize, NULL); +} + +const utime_t& Monitor::get_leader_since() const +{ + assert(state == STATE_LEADER); + return leader_since; +} + +epoch_t Monitor::get_epoch() +{ + return elector.get_epoch(); +} + +void Monitor::win_election(epoch_t epoch, set& active, uint64_t features, + const MonCommand *cmdset, int cmdsize, + const set *classic_monitors) +{ + dout(10) << __func__ << " epoch " << epoch << " quorum " << active + << " features " << features << dendl; + assert(is_electing()); + state = STATE_LEADER; + leader_since = ceph_clock_now(g_ceph_context); + leader = rank; + quorum = active; + quorum_features = features; + outside_quorum.clear(); + + clog.info() << "mon." << name << "@" << rank + << " won leader election with quorum " << quorum << "\n"; + + set_leader_supported_commands(cmdset, cmdsize); + if (classic_monitors) + classic_mons = *classic_monitors; + + paxos->leader_init(); + // NOTE: tell monmap monitor first. This is important for the + // bootstrap case to ensure that the very first paxos proposal + // codifies the monmap. Otherwise any manner of chaos can ensue + // when monitors are call elections or participating in a paxos + // round without agreeing on who the participants are. + monmon()->election_finished(); + for (vector::iterator p = paxos_service.begin(); + p != paxos_service.end(); ++p) { + if (*p != monmon()) + (*p)->election_finished(); + } + health_monitor->start(epoch); + + finish_election(); + if (monmap->size() > 1 && + monmap->get_epoch() > 0) + timecheck_start(); +} + +void Monitor::lose_election(epoch_t epoch, set &q, int l, uint64_t features) +{ + state = STATE_PEON; + leader_since = utime_t(); + leader = l; + quorum = q; + outside_quorum.clear(); + quorum_features = features; + dout(10) << "lose_election, epoch " << epoch << " leader is mon" << leader + << " quorum is " << quorum << " features are " << quorum_features << dendl; + + paxos->peon_init(); + for (vector::iterator p = paxos_service.begin(); p != paxos_service.end(); ++p) + (*p)->election_finished(); + health_monitor->start(epoch); + + finish_election(); +} + +void Monitor::finish_election() +{ + apply_quorum_to_compatset_features(); + timecheck_finish(); + exited_quorum = utime_t(); + finish_contexts(g_ceph_context, waitfor_quorum); + finish_contexts(g_ceph_context, maybe_wait_for_quorum); + resend_routed_requests(); + update_logger(); + register_cluster_logger(); + + // am i named properly? 
+  string cur_name = monmap->get_name(messenger->get_myaddr());
+  if (cur_name != name) {
+    dout(10) << " renaming myself from " << cur_name << " -> " << name << dendl;
+    messenger->send_message(new MMonJoin(monmap->fsid, name, messenger->get_myaddr()),
+                            monmap->get_inst(*quorum.begin()));
+  }
+}
+
+void Monitor::apply_quorum_to_compatset_features()
+{
+  CompatSet new_features(features);
+  if (quorum_features & CEPH_FEATURE_OSD_ERASURE_CODES) {
+    new_features.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES);
+  }
+  if (quorum_features & CEPH_FEATURE_OSDMAP_ENC) {
+    new_features.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC);
+  }
+
+  if (new_features.compare(features) != 0) {
+    CompatSet diff = features.unsupported(new_features);
+    dout(1) << __func__ << " enabling new quorum features: " << diff << dendl;
+    features = new_features;
+
+    MonitorDBStore::Transaction t;
+    write_features(t);
+    store->apply_transaction(t);
+
+    apply_compatset_features_to_quorum_requirements();
+  }
+}
+
+void Monitor::apply_compatset_features_to_quorum_requirements()
+{
+  required_features = 0;
+  if (features.incompat.contains(CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES)) {
+    required_features |= CEPH_FEATURE_OSD_ERASURE_CODES;
+  }
+  if (features.incompat.contains(CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC)) {
+    required_features |= CEPH_FEATURE_OSDMAP_ENC;
+  }
+  dout(10) << __func__ << " required_features " << required_features << dendl;
+}
+
+void Monitor::sync_force(Formatter *f, ostream& ss)
+{
+  bool free_formatter = false;
+
+  if (!f) {
+    // lousy/lazy hack: default to json if no formatter has been defined
+    f = new JSONFormatter();
+    free_formatter = true;
+  }
+
+  MonitorDBStore::Transaction tx;
+  sync_stash_critical_state(&tx);
+  tx.put("mon_sync", "force_sync", 1);
+  store->apply_transaction(tx);
+
+  f->open_object_section("sync_force");
+  f->dump_int("ret", 0);
+  f->dump_stream("msg") << "forcing store sync the next time the monitor starts";
+  f->close_section(); // sync_force
+  f->flush(ss);
+  if (free_formatter)
+    delete f;
+}
+
+void Monitor::_quorum_status(Formatter *f, ostream& ss)
+{
+  bool free_formatter = false;
+
+  if (!f) {
+    // lousy/lazy hack: default to json if no formatter has been defined
+    f = new JSONFormatter();
+    free_formatter = true;
+  }
+  f->open_object_section("quorum_status");
+  f->dump_int("election_epoch", get_epoch());
+
+  f->open_array_section("quorum");
+  for (set<int>::iterator p = quorum.begin(); p != quorum.end(); ++p)
+    f->dump_int("mon", *p);
+  f->close_section(); // quorum
+
+  list<string> quorum_names = get_quorum_names();
+  f->open_array_section("quorum_names");
+  for (list<string>::iterator p = quorum_names.begin(); p != quorum_names.end(); ++p)
+    f->dump_string("mon", *p);
+  f->close_section(); // quorum_names
+
+  f->dump_string("quorum_leader_name", quorum.empty() ?
+                 string() : monmap->get_name(*quorum.begin()));
+
+  f->open_object_section("monmap");
+  monmap->dump(f);
+  f->close_section(); // monmap
+
+  f->close_section(); // quorum_status
+  f->flush(ss);
+  if (free_formatter)
+    delete f;
+}
+
+void Monitor::get_mon_status(Formatter *f, ostream& ss)
+{
+  bool free_formatter = false;
+
+  if (!f) {
+    // lousy/lazy hack: default to json if no formatter has been defined
+    f = new JSONFormatter();
+    free_formatter = true;
+  }
+
+  f->open_object_section("mon_status");
+  f->dump_string("name", name);
+  f->dump_int("rank", rank);
+  f->dump_string("state", get_state_name());
+  f->dump_int("election_epoch", get_epoch());
+
+  f->open_array_section("quorum");
+  for (set<int>::iterator p = quorum.begin(); p != quorum.end(); ++p) {
+    f->dump_int("mon", *p);
+  }
+
+  f->close_section(); // quorum
+
+  f->open_array_section("outside_quorum");
+  for (set<string>::iterator p = outside_quorum.begin(); p != outside_quorum.end(); ++p)
+    f->dump_string("mon", *p);
+  f->close_section(); // outside_quorum
+
+  f->open_array_section("extra_probe_peers");
+  for (set<entity_addr_t>::iterator p = extra_probe_peers.begin();
+       p != extra_probe_peers.end();
+       ++p)
+    f->dump_stream("peer") << *p;
+  f->close_section(); // extra_probe_peers
+
+  f->open_array_section("sync_provider");
+  for (map<uint64_t,SyncProvider>::const_iterator p = sync_providers.begin();
+       p != sync_providers.end();
+       ++p) {
+    f->dump_unsigned("cookie", p->second.cookie);
+    f->dump_stream("entity") << p->second.entity;
+    f->dump_stream("timeout") << p->second.timeout;
+    f->dump_unsigned("last_committed", p->second.last_committed);
+    f->dump_stream("last_key") << p->second.last_key;
+  }
+  f->close_section();
+
+  if (is_synchronizing()) {
+    f->open_object_section("sync");
+    f->dump_stream("sync_provider") << sync_provider;
+    f->dump_unsigned("sync_cookie", sync_cookie);
+    f->dump_unsigned("sync_start_version", sync_start_version);
+    f->close_section();
+  }
+
+  if (g_conf->mon_sync_provider_kill_at > 0)
+    f->dump_int("provider_kill_at", g_conf->mon_sync_provider_kill_at);
+  if (g_conf->mon_sync_requester_kill_at > 0)
+    f->dump_int("requester_kill_at", g_conf->mon_sync_requester_kill_at);
+
+  f->open_object_section("monmap");
+  monmap->dump(f);
+  f->close_section();
+
+  f->close_section(); // mon_status
+
+  if (free_formatter) {
+    // flush formatter to ss and delete it iff we created the formatter
+    f->flush(ss);
+    delete f;
+  }
+}
+
+void Monitor::get_health(string& status, bufferlist *detailbl, Formatter *f)
+{
+  list<pair<health_status_t,string> > summary;
+  list<pair<health_status_t,string> > detail;
+
+  if (f)
+    f->open_object_section("health");
+
+  for (vector<PaxosService*>::iterator p = paxos_service.begin();
+       p != paxos_service.end();
+       ++p) {
+    PaxosService *s = *p;
+    s->get_health(summary, detailbl ? &detail : NULL);
+  }
+
+  health_monitor->get_health(f, summary, (detailbl ? &detail : NULL));
+
+  if (f)
+    f->open_array_section("summary");
+  stringstream ss;
+  health_status_t overall = HEALTH_OK;
+  if (!summary.empty()) {
+    ss << ' ';
+    while (!summary.empty()) {
+      if (overall > summary.front().first)
+        overall = summary.front().first;
+      ss << summary.front().second;
+      if (f) {
+        f->open_object_section("item");
+        f->dump_stream("severity") << summary.front().first;
+        f->dump_string("summary", summary.front().second);
+        f->close_section();
+      }
+      summary.pop_front();
+      if (!summary.empty())
+        ss << "; ";
+    }
+  }
+  if (f)
+    f->close_section();
+
+  if (f) {
+    f->open_object_section("timechecks");
+    f->dump_int("epoch", get_epoch());
+    f->dump_int("round", timecheck_round);
+    f->dump_stream("round_status")
+      << ((timecheck_round%2) ? "on-going" : "finished");
"on-going" : "finished"); + } + + if (!timecheck_skews.empty()) { + list warns; + if (f) + f->open_array_section("mons"); + for (map::iterator i = timecheck_skews.begin(); + i != timecheck_skews.end(); ++i) { + entity_inst_t inst = i->first; + double skew = i->second; + double latency = timecheck_latencies[inst]; + string name = monmap->get_name(inst.addr); + + ostringstream tcss; + health_status_t tcstatus = timecheck_status(tcss, skew, latency); + if (tcstatus != HEALTH_OK) { + if (overall > tcstatus) + overall = tcstatus; + warns.push_back(name); + + ostringstream tmp_ss; + tmp_ss << "mon." << name + << " addr " << inst.addr << " " << tcss.str() + << " (latency " << latency << "s)"; + detail.push_back(make_pair(tcstatus, tmp_ss.str())); + } + + if (f) { + f->open_object_section("mon"); + f->dump_string("name", name.c_str()); + f->dump_float("skew", skew); + f->dump_float("latency", latency); + f->dump_stream("health") << tcstatus; + if (tcstatus != HEALTH_OK) + f->dump_stream("details") << tcss.str(); + f->close_section(); + } + } + if (!warns.empty()) { + if (!ss.str().empty()) + ss << ";"; + ss << " clock skew detected on"; + while (!warns.empty()) { + ss << " mon." << warns.front(); + warns.pop_front(); + if (!warns.empty()) + ss << ","; + } + } + if (f) + f->close_section(); + } + if (f) + f->close_section(); + + stringstream fss; + fss << overall; + status = fss.str() + ss.str(); + if (f) + f->dump_stream("overall_status") << overall; + + if (f) + f->open_array_section("detail"); + while (!detail.empty()) { + if (f) + f->dump_string("item", detail.front().second); + else if (detailbl != NULL) { + detailbl->append(detail.front().second); + detailbl->append('\n'); + } + detail.pop_front(); + } + if (f) + f->close_section(); + + if (f) + f->close_section(); +} + +void Monitor::get_cluster_status(stringstream &ss, Formatter *f) +{ + if (f) + f->open_object_section("status"); + + // reply with the status for all the components + string health; + get_health(health, NULL, f); + + if (f) { + f->dump_stream("fsid") << monmap->get_fsid(); + f->dump_unsigned("election_epoch", get_epoch()); + { + f->open_array_section("quorum"); + for (set::iterator p = quorum.begin(); p != quorum.end(); ++p) + f->dump_int("rank", *p); + f->close_section(); + f->open_array_section("quorum_names"); + for (set::iterator p = quorum.begin(); p != quorum.end(); ++p) + f->dump_string("id", monmap->get_name(*p)); + f->close_section(); + } + f->open_object_section("monmap"); + monmap->dump(f); + f->close_section(); + f->open_object_section("osdmap"); + osdmon()->osdmap.print_summary(f, cout); + f->close_section(); + f->open_object_section("pgmap"); + pgmon()->pg_map.print_summary(f, NULL); + f->close_section(); + f->open_object_section("mdsmap"); + mdsmon()->mdsmap.print_summary(f, NULL); + f->close_section(); + f->close_section(); + } else { + ss << " cluster " << monmap->get_fsid() << "\n"; + ss << " health " << health << "\n"; + ss << " monmap " << *monmap << ", election epoch " << get_epoch() + << ", quorum " << get_quorum() << " " << get_quorum_names() << "\n"; + if (mdsmon()->mdsmap.get_epoch() > 1) + ss << " mdsmap " << mdsmon()->mdsmap << "\n"; + osdmon()->osdmap.print_summary(NULL, ss); + pgmon()->pg_map.print_summary(NULL, &ss); + } +} + +void Monitor::_generate_command_map(map& cmdmap, + map ¶m_str_map) +{ + for (map::const_iterator p = cmdmap.begin(); + p != cmdmap.end(); ++p) { + if (p->first == "prefix") + continue; + if (p->first == "caps") { + vector cv; + if (cmd_getval(g_ceph_context, cmdmap, "caps", 
+        for (unsigned i = 0; i < cv.size(); i += 2) {
+          string k = string("caps_") + cv[i];
+          param_str_map[k] = cv[i + 1];
+        }
+        continue;
+      }
+    }
+    param_str_map[p->first] = cmd_vartype_stringify(p->second);
+  }
+}
+
+const MonCommand *Monitor::_get_moncommand(const string &cmd_prefix,
+                                           MonCommand *cmds, int cmds_size)
+{
+  MonCommand *this_cmd = NULL;
+  for (MonCommand *cp = cmds;
+       cp < &cmds[cmds_size]; cp++) {
+    if (cp->cmdstring.find(cmd_prefix) != string::npos) {
+      this_cmd = cp;
+      break;
+    }
+  }
+  return this_cmd;
+}
+
+bool Monitor::_allowed_command(MonSession *s, string &module, string &prefix,
+                               const map<string,cmd_vartype>& cmdmap,
+                               const map<string,string>& param_str_map,
+                               const MonCommand *this_cmd) {
+
+  bool cmd_r = (this_cmd->req_perms.find('r') != string::npos);
+  bool cmd_w = (this_cmd->req_perms.find('w') != string::npos);
+  bool cmd_x = (this_cmd->req_perms.find('x') != string::npos);
+
+  bool capable = s->caps.is_capable(g_ceph_context, s->inst.name,
+                                    module, prefix, param_str_map,
+                                    cmd_r, cmd_w, cmd_x);
+
+  dout(10) << __func__ << " " << (capable ? "" : "not ") << "capable" << dendl;
+  return capable;
+}
+
+void Monitor::format_command_descriptions(const MonCommand *commands,
+                                          unsigned commands_size,
+                                          Formatter *f,
+                                          bufferlist *rdata)
+{
+  int cmdnum = 0;
+  f->open_object_section("command_descriptions");
+  for (const MonCommand *cp = commands;
+       cp < &commands[commands_size]; cp++) {
+
+    ostringstream secname;
+    secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
+    dump_cmddesc_to_json(f, secname.str(),
+                         cp->cmdstring, cp->helpstring, cp->module,
+                         cp->req_perms, cp->availability);
+    cmdnum++;
+  }
+  f->close_section(); // command_descriptions
+
+  f->flush(*rdata);
+}
+
+void Monitor::get_locally_supported_monitor_commands(const MonCommand **cmds,
+                                                     int *count)
+{
+  *cmds = mon_commands;
+  *count = ARRAY_SIZE(mon_commands);
+}
+void Monitor::get_leader_supported_commands(const MonCommand **cmds, int *count)
+{
+  *cmds = leader_supported_mon_commands;
+  *count = leader_supported_mon_commands_size;
+}
+void Monitor::get_classic_monitor_commands(const MonCommand **cmds, int *count)
+{
+  *cmds = classic_mon_commands;
+  *count = ARRAY_SIZE(classic_mon_commands);
+}
+void Monitor::set_leader_supported_commands(const MonCommand *cmds, int size)
+{
+  if (leader_supported_mon_commands != mon_commands &&
+      leader_supported_mon_commands != classic_mon_commands)
+    delete[] leader_supported_mon_commands;
+  leader_supported_mon_commands = cmds;
+  leader_supported_mon_commands_size = size;
+}
+
+bool Monitor::is_keyring_required()
+{
+  string auth_cluster_required = g_conf->auth_supported.length() ?
+    g_conf->auth_supported : g_conf->auth_cluster_required;
+  string auth_service_required = g_conf->auth_supported.length() ?
+    g_conf->auth_supported : g_conf->auth_service_required;
+
+  return auth_service_required == "cephx" ||
+    auth_cluster_required == "cephx";
+}
+
+void Monitor::handle_command(MMonCommand *m)
+{
+  if (m->fsid != monmap->fsid) {
+    dout(0) << "handle_command on fsid " << m->fsid << " != " << monmap->fsid << dendl;
+    reply_command(m, -EPERM, "wrong fsid", 0);
+    return;
+  }
+
+  MonSession *session = m->get_session();
+  if (!session) {
+    string rs = "Access denied";
+    reply_command(m, -EACCES, rs, 0);
+    return;
+  }
+
+  if (m->cmd.empty()) {
+    string rs = "No command supplied";
+    reply_command(m, -EINVAL, rs, 0);
+    return;
+  }
+
+  string prefix;
+  vector<string> fullcmd;
+  map<string, cmd_vartype> cmdmap;
+  stringstream ss, ds;
+  bufferlist rdata;
+  string rs;
+  int r = -EINVAL;
+  rs = "unrecognized command";
+
+  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    // ss has reason for failure
+    r = -EINVAL;
+    rs = ss.str();
+    if (!m->get_source().is_mon()) // don't reply to mon->mon commands
+      reply_command(m, r, rs, 0);
+    else
+      m->put();
+    return;
+  }
+
+  cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+  if (prefix == "get_command_descriptions") {
+    bufferlist rdata;
+    Formatter *f = new_formatter("json");
+    format_command_descriptions(leader_supported_mon_commands,
+                                leader_supported_mon_commands_size, f, &rdata);
+    delete f;
+    reply_command(m, 0, "", rdata, 0);
+    return;
+  }
+
+  string module;
+  string err;
+
+  dout(0) << "handle_command " << *m << dendl;
+
+  string format;
+  cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
+  boost::scoped_ptr<Formatter> f(new_formatter(format));
+
+  get_str_vec(prefix, fullcmd);
+  module = fullcmd[0];
+
+  // validate command is in leader map
+
+  const MonCommand *leader_cmd;
+  leader_cmd = _get_moncommand(prefix,
+                               // the boost underlying this isn't const for some reason
+                               const_cast<MonCommand*>(leader_supported_mon_commands),
+                               leader_supported_mon_commands_size);
+  if (!leader_cmd) {
+    reply_command(m, -EINVAL, "command not known", 0);
+    return;
+  }
+  // validate command is in our map & matches, or forward
+  const MonCommand *mon_cmd = _get_moncommand(prefix, mon_commands,
+                                              ARRAY_SIZE(mon_commands));
+  if (!is_leader() && (!mon_cmd ||
+                       (*leader_cmd != *mon_cmd))) {
+    dout(10) << "We don't match leader, forwarding request " << m << dendl;
+    forward_request_leader(m);
+    return;
+  }
+  // validate user's permissions for requested command
+  map<string,string> param_str_map;
+  _generate_command_map(cmdmap, param_str_map);
+  if (!_allowed_command(session, module, prefix, cmdmap,
+                        param_str_map, mon_cmd)) {
+    dout(1) << __func__ << " access denied" << dendl;
+    reply_command(m, -EACCES, "access denied", 0);
+    return;
+  }
+
+  if (module == "mds") {
+    mdsmon()->dispatch(m);
+    return;
+  }
+  if (module == "osd") {
+    osdmon()->dispatch(m);
+    return;
+  }
+
+  if (module == "pg") {
+    pgmon()->dispatch(m);
+    return;
+  }
+  if (module == "mon") {
+    monmon()->dispatch(m);
+    return;
+  }
+  if (module == "auth") {
+    authmon()->dispatch(m);
+    return;
+  }
+  if (module == "log") {
+    logmon()->dispatch(m);
+    return;
+  }
+
+  if (module == "config-key") {
+    config_key_service->dispatch(m);
+    return;
+  }
+
+  if (prefix == "fsid") {
+    if (f) {
+      f->open_object_section("fsid");
+      f->dump_stream("fsid") << monmap->fsid;
+      f->close_section();
+      f->flush(rdata);
+    } else {
+      ds << monmap->fsid;
+      rdata.append(ds);
+    }
+    reply_command(m, 0, "", rdata, 0);
+    return;
+  }
+
+  if (prefix == "scrub") {
+    if (is_leader()) {
+      int r = scrub();
+      reply_command(m, r, "", rdata, 0);
+    } else if (is_peon()) {
+      forward_request_leader(m);
+    } else {
+      reply_command(m, -EAGAIN, "no quorum", rdata, 0);
+    }
+    return;
+  }
+
+  if (prefix == "compact") {
+    dout(1) << "triggering manual compaction" << dendl;
+    utime_t start = ceph_clock_now(g_ceph_context);
+    store->compact();
+    utime_t end = ceph_clock_now(g_ceph_context);
+    end -= start;
+    dout(1) << "finished manual compaction in " << end << " seconds" << dendl;
+    ostringstream oss;
+    oss << "compacted leveldb in " << end;
+    rs = oss.str();
+    r = 0;
+  }
+  else if (prefix == "injectargs") {
+    vector<string> injected_args;
+    cmd_getval(g_ceph_context, cmdmap, "injected_args", injected_args);
+    if (!injected_args.empty()) {
+      dout(0) << "parsing injected options '" << injected_args << "'" << dendl;
+      ostringstream oss;
+      r = g_conf->injectargs(str_join(injected_args, " "), &oss);
+      ss << "injectargs:" << oss.str();
+      rs = ss.str();
+      goto out;
+    } else {
+      rs = "must supply options to be parsed in a single string";
+      r = -EINVAL;
+    }
+  } else if (prefix == "status" ||
+             prefix == "health" ||
+             prefix == "df") {
+    string detail;
+    cmd_getval(g_ceph_context, cmdmap, "detail", detail);
+
+    if (prefix == "status") {
+      // get_cluster_status handles f == NULL
+      get_cluster_status(ds, f.get());
+
+      if (f) {
+        f->flush(ds);
+        ds << '\n';
+      }
+      rdata.append(ds);
+    } else if (prefix == "health") {
+      string health_str;
+      get_health(health_str, detail == "detail" ? &rdata : NULL, f.get());
+      if (f) {
+        f->flush(ds);
+        ds << '\n';
+      } else {
+        ds << health_str;
+      }
+      bufferlist comb;
+      comb.append(ds);
+      if (detail == "detail")
+        comb.append(rdata);
+      rdata = comb;
+      r = 0;
+    } else if (prefix == "df") {
+      bool verbose = (detail == "detail");
+      if (f)
+        f->open_object_section("stats");
+
+      pgmon()->dump_fs_stats(ds, f.get(), verbose);
+      if (!f)
+        ds << '\n';
+      pgmon()->dump_pool_stats(ds, f.get(), verbose);
+
+      if (f) {
+        f->close_section();
+        f->flush(ds);
+        ds << '\n';
+      }
+    } else {
+      assert(0 == "We should never get here!");
+      return;
+    }
+    rdata.append(ds);
+    rs = "";
+    r = 0;
+  } else if (prefix == "report") {
+
+    // this must be formatted, in its current form
+    if (!f)
+      f.reset(new_formatter("json-pretty"));
+    f->open_object_section("report");
+    f->dump_stream("cluster_fingerprint") << fingerprint;
+    f->dump_string("version", ceph_version_to_str());
+    f->dump_string("commit", git_version_to_str());
+    f->dump_stream("timestamp") << ceph_clock_now(NULL);
+
+    vector<string> tagsvec;
+    cmd_getval(g_ceph_context, cmdmap, "tags", tagsvec);
+    string tagstr = str_join(tagsvec, " ");
+    if (!tagstr.empty())
+      tagstr = tagstr.substr(0, tagstr.find_last_of(' '));
+    f->dump_string("tag", tagstr);
+
+    string hs;
+    get_health(hs, NULL, f.get());
+
+    monmon()->dump_info(f.get());
+    osdmon()->dump_info(f.get());
+    mdsmon()->dump_info(f.get());
+    pgmon()->dump_info(f.get());
+    authmon()->dump_info(f.get());
+
+    paxos->dump_info(f.get());
+
+    f->close_section();
+    f->flush(rdata);
+
+    ostringstream ss2;
+    ss2 << "report " << rdata.crc32c(6789);
+    rs = ss2.str();
+    r = 0;
+  } else if (prefix == "quorum_status") {
+    // make sure our map is readable and up to date
+    if (!is_leader() && !is_peon()) {
+      dout(10) << " waiting for quorum" << dendl;
+      waitfor_quorum.push_back(new C_RetryMessage(this, m));
+      return;
+    }
+    _quorum_status(f.get(), ds);
+    rdata.append(ds);
+    rs = "";
+    r = 0;
+  } else if (prefix == "mon_status") {
+    get_mon_status(f.get(), ds);
+    if (f)
+      f->flush(ds);
+    rdata.append(ds);
+    rs = "";
+    r = 0;
+  } else if (prefix == "sync force") {
+    string validate1, validate2;
+    cmd_getval(g_ceph_context, cmdmap, "validate1", validate1);
"validate1", validate1); + cmd_getval(g_ceph_context, cmdmap, "validate2", validate2); + if (validate1 != "--yes-i-really-mean-it" || + validate2 != "--i-know-what-i-am-doing") { + r = -EINVAL; + rs = "are you SURE? this will mean the monitor store will be " + "erased. pass '--yes-i-really-mean-it " + "--i-know-what-i-am-doing' if you really do."; + goto out; + } + sync_force(f.get(), ds); + rs = ds.str(); + r = 0; + } else if (prefix == "heap") { + if (!ceph_using_tcmalloc()) + rs = "tcmalloc not enabled, can't use heap profiler commands\n"; + else { + string heapcmd; + cmd_getval(g_ceph_context, cmdmap, "heapcmd", heapcmd); + // XXX 1-element vector, change at callee or make vector here? + vector heapcmd_vec; + get_str_vec(heapcmd, heapcmd_vec); + ceph_heap_profiler_handle_command(heapcmd_vec, ds); + rdata.append(ds); + rs = ""; + r = 0; + } + } else if (prefix == "quorum") { + string quorumcmd; + cmd_getval(g_ceph_context, cmdmap, "quorumcmd", quorumcmd); + if (quorumcmd == "exit") { + start_election(); + elector.stop_participating(); + rs = "stopped responding to quorum, initiated new election"; + r = 0; + } else if (quorumcmd == "enter") { + elector.start_participating(); + start_election(); + rs = "started responding to quorum, initiated new election"; + r = 0; + } else { + rs = "needs a valid 'quorum' command"; + r = -EINVAL; + } + } + + out: + if (!m->get_source().is_mon()) // don't reply to mon->mon commands + reply_command(m, r, rs, rdata, 0); + else + m->put(); +} + +void Monitor::reply_command(MMonCommand *m, int rc, const string &rs, version_t version) +{ + bufferlist rdata; + reply_command(m, rc, rs, rdata, version); +} + +void Monitor::reply_command(MMonCommand *m, int rc, const string &rs, bufferlist& rdata, version_t version) +{ + MMonCommandAck *reply = new MMonCommandAck(m->cmd, rc, rs, version); + reply->set_tid(m->get_tid()); + reply->set_data(rdata); + send_reply(m, reply); + m->put(); +} + + +// ------------------------ +// request/reply routing +// +// a client/mds/osd will connect to a random monitor. we need to forward any +// messages requiring state updates to the leader, and then route any replies +// back via the correct monitor and back to them. (the monitor will not +// initiate any connections.) 
+
+void Monitor::forward_request_leader(PaxosServiceMessage *req)
+{
+  int mon = get_leader();
+  MonSession *session = 0;
+  if (req->get_connection())
+    session = static_cast<MonSession *>(req->get_connection()->get_priv());
+  if (req->get_source().is_mon() && req->get_source_addr() != messenger->get_myaddr()) {
+    dout(10) << "forward_request won't forward (non-local) mon request " << *req << dendl;
+    req->put();
+  } else if (session && session->proxy_con) {
+    dout(10) << "forward_request won't double fwd request " << *req << dendl;
+    req->put();
+  } else if (session && !session->closed) {
+    RoutedRequest *rr = new RoutedRequest;
+    rr->tid = ++routed_request_tid;
+    rr->client_inst = req->get_source_inst();
+    rr->con = req->get_connection();
+    rr->con_features = rr->con->get_features();
+    encode_message(req, CEPH_FEATURES_ALL, rr->request_bl);   // for my use only; use all features
+    rr->session = static_cast<MonSession *>(session->get());
+    routed_requests[rr->tid] = rr;
+    session->routed_request_tids.insert(rr->tid);
+
+    dout(10) << "forward_request " << rr->tid << " request " << *req << dendl;
+
+    MForward *forward = new MForward(rr->tid, req,
+                                     rr->con_features,
+                                     rr->session->caps);
+    forward->set_priority(req->get_priority());
+    messenger->send_message(forward, monmap->get_inst(mon));
+  } else {
+    dout(10) << "forward_request no session for request " << *req << dendl;
+    req->put();
+  }
+  if (session)
+    session->put();
+}
+
+//extract the original message and put it into the regular dispatch function
+void Monitor::handle_forward(MForward *m)
+{
+  dout(10) << "received forwarded message from " << m->client
+           << " via " << m->get_source_inst() << dendl;
+  MonSession *session = static_cast<MonSession *>(m->get_connection()->get_priv());
+  assert(session);
+
+  if (!session->is_capable("mon", MON_CAP_X)) {
+    dout(0) << "forward from entity with insufficient caps! "
+            << session->caps << dendl;
+  } else {
+    Connection *c = new Connection(NULL); // msgr must be null; see PaxosService::dispatch()
+    MonSession *s = new MonSession(m->msg->get_source_inst(), c);
+    c->set_priv(s);
+    c->set_peer_addr(m->client.addr);
+    c->set_peer_type(m->client.name.type());
+    c->set_features(m->con_features);
+
+    s->caps = m->client_caps;
+    dout(10) << " caps are " << s->caps << dendl;
+    s->proxy_con = m->get_connection();
+    s->proxy_tid = m->tid;
+
+    PaxosServiceMessage *req = m->msg;
+    m->msg = NULL; // so ~MForward doesn't delete it
+    req->set_connection(c);
+
+    // not super accurate, but better than nothing.
+    req->set_recv_stamp(m->get_recv_stamp());
+
+    /*
+     * note which election epoch this is; we will drop the message if
+     * there is a future election since our peers will resend routed
+     * requests in that case.
+     */
+    req->rx_election_epoch = get_epoch();
+
+    /* Because this is a special fake connection, we need to break
+       the ref loop between Connection and MonSession differently
+       than we normally do.  Here, the Message refers to the Connection
+       which refers to the Session, and nobody else refers to the Connection
+       or the Session.  And due to the special nature of this message,
+       nobody refers to the Connection via the Session.
+       So, clear out that half of the ref loop. */
+    s->con.reset(NULL);
+
+    dout(10) << " mesg " << req << " from " << m->get_source_addr() << dendl;
+
+    _ms_dispatch(req);
+  }
+  session->put();
+  m->put();
+}
+
+void Monitor::try_send_message(Message *m, const entity_inst_t& to)
+{
+  dout(10) << "try_send_message " << *m << " to " << to << dendl;
+
+  bufferlist bl;
+  encode_message(m, quorum_features, bl);
+
+  messenger->send_message(m, to);
+
+  for (int i=0; i<(int)monmap->size(); i++) {
+    if (i != rank)
+      messenger->send_message(new MRoute(bl, to), monmap->get_inst(i));
+  }
+}
+
+void Monitor::send_reply(PaxosServiceMessage *req, Message *reply)
+{
+  ConnectionRef connection = req->get_connection();
+  if (!connection) {
+    dout(2) << "send_reply no connection, dropping reply " << *reply
+            << " to " << req << " " << *req << dendl;
+    reply->put();
+    return;
+  }
+  MonSession *session = static_cast<MonSession *>(connection->get_priv());
+  if (!session) {
+    dout(2) << "send_reply no session, dropping reply " << *reply
+            << " to " << req << " " << *req << dendl;
+    reply->put();
+    return;
+  }
+  if (session->proxy_con) {
+    dout(15) << "send_reply routing reply to " << req->get_connection()->get_peer_addr()
+             << " via " << session->proxy_con->get_peer_addr()
+             << " for request " << *req << dendl;
+    messenger->send_message(new MRoute(session->proxy_tid, reply),
+                            session->proxy_con);
+  } else {
+    messenger->send_message(reply, session->con);
+  }
+  session->put();
+}
+
+void Monitor::no_reply(PaxosServiceMessage *req)
+{
+  MonSession *session = static_cast<MonSession *>(req->get_connection()->get_priv());
+  if (!session) {
+    dout(2) << "no_reply no session, dropping non-reply to " << req << " " << *req << dendl;
+    return;
+  }
+  if (session->proxy_con) {
+    if (get_quorum_features() & CEPH_FEATURE_MON_NULLROUTE) {
+      dout(10) << "no_reply to " << req->get_source_inst()
+               << " via " << session->proxy_con->get_peer_addr()
+               << " for request " << *req << dendl;
+      messenger->send_message(new MRoute(session->proxy_tid, NULL),
+                              session->proxy_con);
+    } else {
+      dout(10) << "no_reply no quorum nullroute feature for " << req->get_source_inst()
+               << " via " << session->proxy_con->get_peer_addr()
+               << " for request " << *req << dendl;
+    }
+  } else {
+    dout(10) << "no_reply to " << req->get_source_inst() << " " << *req << dendl;
+  }
+  session->put();
+}
+
+void Monitor::handle_route(MRoute *m)
+{
+  MonSession *session = static_cast<MonSession *>(m->get_connection()->get_priv());
+  //check privileges
+  if (session && !session->is_capable("mon", MON_CAP_X)) {
+    dout(0) << "MRoute received from entity without appropriate perms! "
+            << dendl;
" + << dendl; + session->put(); + m->put(); + return; + } + if (m->msg) + dout(10) << "handle_route " << *m->msg << " to " << m->dest << dendl; + else + dout(10) << "handle_route null to " << m->dest << dendl; + + // look it up + if (m->session_mon_tid) { + if (routed_requests.count(m->session_mon_tid)) { + RoutedRequest *rr = routed_requests[m->session_mon_tid]; + + // reset payload, in case encoding is dependent on target features + if (m->msg) { + m->msg->clear_payload(); + messenger->send_message(m->msg, rr->con); + m->msg = NULL; + } + routed_requests.erase(m->session_mon_tid); + rr->session->routed_request_tids.insert(rr->tid); + delete rr; + } else { + dout(10) << " don't have routed request tid " << m->session_mon_tid << dendl; + } + } else { + dout(10) << " not a routed request, trying to send anyway" << dendl; + if (m->msg) { + messenger->lazy_send_message(m->msg, m->dest); + m->msg = NULL; + } + } + m->put(); + if (session) + session->put(); +} + +void Monitor::resend_routed_requests() +{ + dout(10) << "resend_routed_requests" << dendl; + int mon = get_leader(); + list retry; + for (map::iterator p = routed_requests.begin(); + p != routed_requests.end(); + ++p) { + RoutedRequest *rr = p->second; + + bufferlist::iterator q = rr->request_bl.begin(); + PaxosServiceMessage *req = (PaxosServiceMessage *)decode_message(cct, q); + + if (mon == rank) { + dout(10) << " requeue for self tid " << rr->tid << " " << *req << dendl; + req->set_connection(rr->con); + retry.push_back(new C_RetryMessage(this, req)); + delete rr; + } else { + dout(10) << " resend to mon." << mon << " tid " << rr->tid << " " << *req << dendl; + MForward *forward = new MForward(rr->tid, req, rr->con_features, + rr->session->caps); + forward->client = rr->client_inst; + forward->set_priority(req->get_priority()); + messenger->send_message(forward, monmap->get_inst(mon)); + } + } + if (mon == rank) { + routed_requests.clear(); + finish_contexts(g_ceph_context, retry); + } +} + +void Monitor::remove_session(MonSession *s) +{ + dout(10) << "remove_session " << s << " " << s->inst << dendl; + assert(!s->closed); + for (set::iterator p = s->routed_request_tids.begin(); + p != s->routed_request_tids.end(); + ++p) { + if (routed_requests.count(*p)) { + RoutedRequest *rr = routed_requests[*p]; + dout(10) << " dropping routed request " << rr->tid << dendl; + delete rr; + routed_requests.erase(*p); + } + } + s->con->set_priv(NULL); + session_map.remove_session(s); +} + +void Monitor::remove_all_sessions() +{ + while (!session_map.sessions.empty()) { + MonSession *s = session_map.sessions.front(); + remove_session(s); + } +} + +void Monitor::send_command(const entity_inst_t& inst, + const vector& com) +{ + dout(10) << "send_command " << inst << "" << com << dendl; + MMonCommand *c = new MMonCommand(monmap->fsid); + c->cmd = com; + try_send_message(c, inst); +} + +void Monitor::waitlist_or_zap_client(Message *m) +{ + /** + * Wait list the new session until we're in the quorum, assuming it's + * sufficiently new. + * tick() will periodically send them back through so we can send + * the client elsewhere if we don't think we're getting back in. + * + * But we whitelist a few sorts of messages: + * 1) Monitors can talk to us at any time, of course. + * 2) auth messages. It's unlikely to go through much faster, but + * it's possible we've just lost our quorum status and we want to take... + * 3) command messages. We want to accept these under all possible + * circumstances. 
+   */
+  ConnectionRef con = m->get_connection();
+  utime_t too_old = ceph_clock_now(g_ceph_context);
+  too_old -= g_ceph_context->_conf->mon_lease;
+  if (m->get_recv_stamp() > too_old &&
+      con->is_connected()) {
+    dout(5) << "waitlisting message " << *m << dendl;
+    maybe_wait_for_quorum.push_back(new C_RetryMessage(this, m));
+  } else {
+    dout(5) << "discarding message " << *m << " and sending client elsewhere" << dendl;
+    messenger->mark_down(con);
+    m->put();
+  }
+}
+
+bool Monitor::_ms_dispatch(Message *m)
+{
+  bool ret = true;
+
+  if (is_shutdown()) {
+    m->put();
+    return true;
+  }
+
+  ConnectionRef connection = m->get_connection();
+  MonSession *s = NULL;
+  MonCap caps;
+  EntityName entity_name;
+  bool src_is_mon;
+
+  // regardless of who we are or who the sender is, the message must
+  // have a connection associated.  If it doesn't then something fishy
+  // is going on.
+  assert(connection);
+
+  src_is_mon = (connection->get_peer_type() & CEPH_ENTITY_TYPE_MON);
+
+  bool reuse_caps = false;
+  dout(20) << "have connection" << dendl;
+  s = static_cast<MonSession *>(connection->get_priv());
+  if (s && s->closed) {
+    caps = s->caps;
+    reuse_caps = true;
+    s->put();
+    s = NULL;
+  }
+  if (!s) {
+    // if the sender is not a monitor, make sure their first message for a
+    // session is an MAuth.  If it is not, assume it's a stray message,
+    // and considering that we are creating a new session it is safe to
+    // assume that the sender hasn't authenticated yet, so we have no way
+    // of assessing whether we should handle it or not.
+    if (!src_is_mon && (m->get_type() != CEPH_MSG_AUTH &&
+                        m->get_type() != CEPH_MSG_MON_GET_MAP)) {
+      if (m->get_type() == CEPH_MSG_PING) {
+        // let it go through and be dispatched immediately!
+        return dispatch(s, m, false);
+      }
+      dout(1) << __func__ << " dropping stray message " << *m
+              << " from " << m->get_source_inst() << dendl;
+      m->put();
+      return true;
+    }
+
+    if (!exited_quorum.is_zero() && !src_is_mon) {
+      waitlist_or_zap_client(m);
+      return true;
+    }
+
+    dout(10) << "do not have session, making new one" << dendl;
+    s = session_map.new_session(m->get_source_inst(), m->get_connection().get());
+    m->get_connection()->set_priv(s->get());
+    dout(10) << "ms_dispatch new session " << s << " for " << s->inst << dendl;
+
+    if (!src_is_mon) {
+      dout(10) << "setting timeout on session" << dendl;
+      // set an initial timeout here, so we will trim this session even if they don't
+      // do anything.
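+      // (handle_subscribe() below refreshes the same 'until' deadline by
+      // another mon_subscribe_interval each time the client resubscribes.)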
+      s->until = ceph_clock_now(g_ceph_context);
+      s->until += g_conf->mon_subscribe_interval;
+    } else {
+      //give it monitor caps; the peer type has been authenticated
+      reuse_caps = false;
+      dout(5) << "setting monitor caps on this connection" << dendl;
+      if (!s->caps.is_allow_all()) //but no need to repeatedly copy
+        s->caps = *mon_caps;
+    }
+    if (reuse_caps)
+      s->caps = caps;
+  } else {
+    dout(20) << "ms_dispatch existing session " << s << " for " << s->inst << dendl;
+  }
+
+  if (s) {
+    if (s->auth_handler) {
+      entity_name = s->auth_handler->get_entity_name();
+    }
+    dout(20) << " caps " << s->caps.get_str() << dendl;
+  }
+
+  if (is_synchronizing() && !src_is_mon) {
+    waitlist_or_zap_client(m);
+    return true;
+  }
+
+  ret = dispatch(s, m, src_is_mon);
+
+  if (s) {
+    s->put();
+  }
+
+  return ret;
+}
+
+bool Monitor::dispatch(MonSession *s, Message *m, const bool src_is_mon)
+{
+  bool ret = true;
+
+  assert(m != NULL);
+
+  switch (m->get_type()) {
+
+  case MSG_ROUTE:
+    handle_route(static_cast<MRoute*>(m));
+    break;
+
+    // misc
+  case CEPH_MSG_MON_GET_MAP:
+    handle_mon_get_map(static_cast<MMonGetMap*>(m));
+    break;
+
+  case CEPH_MSG_MON_GET_VERSION:
+    handle_get_version(static_cast<MMonGetVersion*>(m));
+    break;
+
+  case MSG_MON_COMMAND:
+    handle_command(static_cast<MMonCommand*>(m));
+    break;
+
+  case CEPH_MSG_MON_SUBSCRIBE:
+    /* FIXME: check what's being subscribed, filter accordingly */
+    handle_subscribe(static_cast<MMonSubscribe*>(m));
+    break;
+
+  case MSG_MON_PROBE:
+    handle_probe(static_cast<MMonProbe*>(m));
+    break;
+
+    // Sync (i.e., the new slurp, but on steroids)
+  case MSG_MON_SYNC:
+    handle_sync(static_cast<MMonSync*>(m));
+    break;
+  case MSG_MON_SCRUB:
+    handle_scrub(static_cast<MMonScrub*>(m));
+    break;
+
+    // OSDs
+  case MSG_OSD_MARK_ME_DOWN:
+  case MSG_OSD_FAILURE:
+  case MSG_OSD_BOOT:
+  case MSG_OSD_ALIVE:
+  case MSG_OSD_PGTEMP:
+    paxos_service[PAXOS_OSDMAP]->dispatch((PaxosServiceMessage*)m);
+    break;
+
+  case MSG_REMOVE_SNAPS:
+    paxos_service[PAXOS_OSDMAP]->dispatch((PaxosServiceMessage*)m);
+    break;
+
+    // MDSs
+  case MSG_MDS_BEACON:
+  case MSG_MDS_OFFLOAD_TARGETS:
+    paxos_service[PAXOS_MDSMAP]->dispatch((PaxosServiceMessage*)m);
+    break;
+
+    // auth
+  case MSG_MON_GLOBAL_ID:
+  case CEPH_MSG_AUTH:
+    /* no need to check caps here */
+    paxos_service[PAXOS_AUTH]->dispatch((PaxosServiceMessage*)m);
+    break;
+
+    // pg
+  case CEPH_MSG_STATFS:
+  case MSG_PGSTATS:
+  case MSG_GETPOOLSTATS:
+    paxos_service[PAXOS_PGMAP]->dispatch((PaxosServiceMessage*)m);
+    break;
+
+  case CEPH_MSG_POOLOP:
+    paxos_service[PAXOS_OSDMAP]->dispatch((PaxosServiceMessage*)m);
+    break;
+
+    // log
+  case MSG_LOG:
+    paxos_service[PAXOS_LOG]->dispatch((PaxosServiceMessage*)m);
+    break;
+
+  case MSG_LOGACK:
+    clog.handle_log_ack((MLogAck*)m);
+    break;
+
+    // monmap
+  case MSG_MON_JOIN:
+    paxos_service[PAXOS_MONMAP]->dispatch((PaxosServiceMessage*)m);
+    break;
+
+    // paxos
+  case MSG_MON_PAXOS:
+    {
+      MMonPaxos *pm = static_cast<MMonPaxos*>(m);
+      if (!src_is_mon ||
+          !s->is_capable("mon", MON_CAP_X)) {
+        //can't send these!
+        pm->put();
+        break;
+      }
+
+      if (state == STATE_SYNCHRONIZING) {
+        // we are synchronizing. These messages would do us no
+        // good, thus just drop them and ignore them.
+        dout(10) << __func__ << " ignore paxos msg from "
+                 << pm->get_source_inst() << dendl;
+        pm->put();
+        break;
+      }
+
+      // sanitize
+      if (pm->epoch > get_epoch()) {
+        bootstrap();
+        pm->put();
+        break;
+      }
+      if (pm->epoch != get_epoch()) {
+        pm->put();
+        break;
+      }
+
+      paxos->dispatch((PaxosServiceMessage*)m);
+    }
+    break;
+
+    // elector messages
+  case MSG_MON_ELECTION:
+    //check privileges here for simplicity
+    if (s &&
+        !s->is_capable("mon", MON_CAP_X)) {
+      dout(0) << "MMonElection received from entity without enough caps!"
+              << s->caps << dendl;
+      m->put();
+      break;
+    }
+    if (!is_probing() && !is_synchronizing()) {
+      elector.dispatch(m);
+    } else {
+      m->put();
+    }
+    break;
+
+  case MSG_FORWARD:
+    handle_forward(static_cast<MForward*>(m));
+    break;
+
+  case MSG_TIMECHECK:
+    handle_timecheck(static_cast<MTimeCheck*>(m));
+    break;
+
+  case MSG_MON_HEALTH:
+    health_monitor->dispatch(static_cast<MMonHealth*>(m));
+    break;
+
+  case CEPH_MSG_PING:
+    handle_ping(static_cast<MPing*>(m));
+    break;
+
+  default:
+    ret = false;
+  }
+
+  return ret;
+}
+
+void Monitor::handle_ping(MPing *m)
+{
+  dout(10) << __func__ << " " << *m << dendl;
+  MPing *reply = new MPing;
+  entity_inst_t inst = m->get_source_inst();
+  bufferlist payload;
+  Formatter *f = new JSONFormatter(true);
+  f->open_object_section("pong");
+
+  string health_str;
+  get_health(health_str, NULL, f);
+  {
+    stringstream ss;
+    get_mon_status(f, ss);
+  }
+
+  f->close_section();
+  stringstream ss;
+  f->flush(ss);
+  ::encode(ss.str(), payload);
+  reply->set_payload(payload);
+  dout(10) << __func__ << " reply payload len " << reply->get_payload().length() << dendl;
+  messenger->send_message(reply, inst);
+  m->put();
+}
+
+void Monitor::timecheck_start()
+{
+  dout(10) << __func__ << dendl;
+  timecheck_cleanup();
+  timecheck_start_round();
+}
+
+void Monitor::timecheck_finish()
+{
+  dout(10) << __func__ << dendl;
+  timecheck_cleanup();
+}
+
+void Monitor::timecheck_start_round()
+{
+  dout(10) << __func__ << " curr " << timecheck_round << dendl;
+  assert(is_leader());
+
+  if (monmap->size() == 1) {
+    assert(0 == "We are alone; this shouldn't have been scheduled!");
+    return;
+  }
+
+  if (timecheck_round % 2) {
+    dout(10) << __func__ << " there's a timecheck going on" << dendl;
+    utime_t curr_time = ceph_clock_now(g_ceph_context);
+    double max = g_conf->mon_timecheck_interval*3;
+    if (curr_time - timecheck_round_start > max) {
+      dout(10) << __func__ << " keep current round going" << dendl;
+      goto out;
+    } else {
+      dout(10) << __func__
+               << " finish current timecheck and start new" << dendl;
+      timecheck_cancel_round();
+    }
+  }
+
+  assert(timecheck_round % 2 == 0);
+  timecheck_acks = 0;
+  timecheck_round++;
+  timecheck_round_start = ceph_clock_now(g_ceph_context);
+  dout(10) << __func__ << " new " << timecheck_round << dendl;
+
+  timecheck();
+out:
+  dout(10) << __func__ << " setting up next event" << dendl;
+  timecheck_event = new C_TimeCheck(this);
+  timer.add_event_after(g_conf->mon_timecheck_interval, timecheck_event);
+}
+
+void Monitor::timecheck_finish_round(bool success)
+{
+  dout(10) << __func__ << " curr " << timecheck_round << dendl;
+  assert(timecheck_round % 2);
+  timecheck_round++;
+  timecheck_round_start = utime_t();
+
+  if (success) {
+    assert(timecheck_waiting.empty());
+    assert(timecheck_acks == quorum.size());
+    timecheck_report();
+    return;
+  }
+
+  dout(10) << __func__ << " " << timecheck_waiting.size()
+           << " peers still waiting:";
+  for (map<entity_inst_t,utime_t>::iterator p = timecheck_waiting.begin();
+       p != timecheck_waiting.end(); ++p) {
+    *_dout << " " << p->first.name;
+  }
+  *_dout << dendl;
+  timecheck_waiting.clear();
+
+  dout(10) << __func__ << " finished to " << timecheck_round << dendl;
+}
+
+void Monitor::timecheck_cancel_round()
+{
+  timecheck_finish_round(false);
+}
+
+void Monitor::timecheck_cleanup()
+{
+  timecheck_round = 0;
+  timecheck_acks = 0;
+  timecheck_round_start = utime_t();
+
+  if (timecheck_event) {
+    timer.cancel_event(timecheck_event);
+    timecheck_event = NULL;
+  }
+  timecheck_waiting.clear();
+  timecheck_skews.clear();
+  timecheck_latencies.clear();
+}
+
+void Monitor::timecheck_report()
+{
+  dout(10) << __func__ << dendl;
+  assert(is_leader());
+  assert((timecheck_round % 2) == 0);
+  if (monmap->size() == 1) {
+    assert(0 == "We are alone; we shouldn't have gotten here!");
+    return;
+  }
+
+  assert(timecheck_latencies.size() == timecheck_skews.size());
+  bool do_output = true; // only output report once
+  for (set<int>::iterator q = quorum.begin(); q != quorum.end(); ++q) {
+    if (monmap->get_name(*q) == name)
+      continue;
+
+    MTimeCheck *m = new MTimeCheck(MTimeCheck::OP_REPORT);
+    m->epoch = get_epoch();
+    m->round = timecheck_round;
+
+    for (map<entity_inst_t,double>::iterator it = timecheck_skews.begin(); it != timecheck_skews.end(); ++it) {
+      double skew = it->second;
+      double latency = timecheck_latencies[it->first];
+
+      m->skews[it->first] = skew;
+      m->latencies[it->first] = latency;
+
+      if (do_output) {
+        dout(25) << __func__ << " " << it->first
+                 << " latency " << latency
+                 << " skew " << skew << dendl;
+      }
+    }
+    do_output = false;
+    entity_inst_t inst = monmap->get_inst(*q);
+    dout(10) << __func__ << " send report to " << inst << dendl;
+    messenger->send_message(m, inst);
+  }
+}
+
+void Monitor::timecheck()
+{
+  dout(10) << __func__ << dendl;
+  assert(is_leader());
+  if (monmap->size() == 1) {
+    assert(0 == "We are alone; we shouldn't have gotten here!");
+    return;
+  }
+  assert(timecheck_round % 2 != 0);
+
+  timecheck_acks = 1; // we ack ourselves
+
+  dout(10) << __func__ << " start timecheck epoch " << get_epoch()
+           << " round " << timecheck_round << dendl;
+
+  // we are at the eye of the storm; the point of reference
+  timecheck_skews[messenger->get_myinst()] = 0.0;
+  timecheck_latencies[messenger->get_myinst()] = 0.0;
+
+  for (set<int>::iterator it = quorum.begin(); it != quorum.end(); ++it) {
+    if (monmap->get_name(*it) == name)
+      continue;
+
+    entity_inst_t inst = monmap->get_inst(*it);
+    utime_t curr_time = ceph_clock_now(g_ceph_context);
+    timecheck_waiting[inst] = curr_time;
+    MTimeCheck *m = new MTimeCheck(MTimeCheck::OP_PING);
+    m->epoch = get_epoch();
+    m->round = timecheck_round;
+    dout(10) << __func__ << " send " << *m << " to " << inst << dendl;
+    messenger->send_message(m, inst);
+  }
+}
+
+health_status_t Monitor::timecheck_status(ostringstream &ss,
+                                          const double skew_bound,
+                                          const double latency)
+{
+  health_status_t status = HEALTH_OK;
+  double abs_skew = (skew_bound > 0 ? skew_bound : -skew_bound);
+  assert(latency >= 0);
+
+  if (abs_skew > g_conf->mon_clock_drift_allowed) {
+    status = HEALTH_WARN;
+    ss << "clock skew " << abs_skew << "s"
+       << " > max " << g_conf->mon_clock_drift_allowed << "s";
+  }
+
+  return status;
+}
+
+void Monitor::handle_timecheck_leader(MTimeCheck *m)
+{
+  dout(10) << __func__ << " " << *m << dendl;
+  /* handles PONG's */
+  assert(m->op == MTimeCheck::OP_PONG);
+
+  entity_inst_t other = m->get_source_inst();
+  if (m->epoch < get_epoch()) {
+    dout(1) << __func__ << " got old timecheck epoch " << m->epoch
+            << " from " << other
+            << " curr " << get_epoch()
+            << " -- severely lagged? discard" << dendl;
+    return;
+  }
+  assert(m->epoch == get_epoch());
+
+  if (m->round < timecheck_round) {
+    dout(1) << __func__ << " got old round " << m->round
+            << " from " << other
+            << " curr " << timecheck_round << " -- discard" << dendl;
+    return;
+  }
+
+  utime_t curr_time = ceph_clock_now(g_ceph_context);
+
+  assert(timecheck_waiting.count(other) > 0);
+  utime_t timecheck_sent = timecheck_waiting[other];
+  timecheck_waiting.erase(other);
+  if (curr_time < timecheck_sent) {
+    // our clock was readjusted -- drop everything until it all makes sense.
+    dout(1) << __func__ << " our clock was readjusted --"
+            << " bump round and drop current check"
+            << dendl;
+    timecheck_cancel_round();
+    return;
+  }
+
+  /* update peer latencies */
+  double latency = (double)(curr_time - timecheck_sent);
+
+  if (timecheck_latencies.count(other) == 0)
+    timecheck_latencies[other] = latency;
+  else {
+    double avg_latency = ((timecheck_latencies[other]*0.8)+(latency*0.2));
+    timecheck_latencies[other] = avg_latency;
+  }
+
+  /*
+   * update skews
+   *
+   * some nasty thing goes on if we were to do 'a - b' between two utime_t,
+   * and 'a' happens to be lower than 'b'; so we use double instead.
+   *
+   * latency is always expected to be >= 0.
+   *
+   * delta, the difference between their timestamp and ours, may either be
+   * lower or higher than 0; will hardly ever be 0.
+   *
+   * The absolute skew is the absolute delta minus the latency, which is
+   * taken as a whole instead of an rtt given that there is some queueing
+   * and dispatch times involved and it's hard to assess how long exactly
+   * it took for the message to travel to the other side and be handled. So
+   * we call it a bounded skew, the worst case scenario.
+   *
+   * Now, to math!
+   *
+   * Given that the latency is always positive, we can establish that the
+   * bounded skew will be:
+   *
+   *  1. positive if the absolute delta is higher than the latency and
+   *     delta is positive
+   *  2. negative if the absolute delta is higher than the latency and
+   *     delta is negative.
+   *  3. zero if the absolute delta is lower than the latency.
+   *
+   * On 3. we make a judgement call and treat the skew as non-existent.
+   * This is because, if the absolute delta is lower than the latency, then
+   * the apparently existing skew is nothing more than a side-effect of the
+   * high latency at work.
+   *
+   * This may not be entirely true though, as a severely skewed clock
+   * may be masked by an even higher latency, but with high latencies
+   * we probably have worse issues to deal with than just skewed clocks.
+   */
+  assert(latency >= 0);
+
+  double delta = ((double) m->timestamp) - ((double) curr_time);
+  double abs_delta = (delta > 0 ? delta : -delta);
+  double skew_bound = abs_delta - latency;
+  if (skew_bound < 0)
+    skew_bound = 0;
+  else if (delta < 0)
+    skew_bound = -skew_bound;
+
+  ostringstream ss;
+  health_status_t status = timecheck_status(ss, skew_bound, latency);
+  if (status == HEALTH_ERR)
+    clog.error() << other << " " << ss.str() << "\n";
+  else if (status == HEALTH_WARN)
+    clog.warn() << other << " " << ss.str() << "\n";
+
+  dout(10) << __func__ << " from " << other << " ts " << m->timestamp
+           << " delta " << delta << " skew_bound " << skew_bound
+           << " latency " << latency << dendl;
+
+  if (timecheck_skews.count(other) == 0) {
+    timecheck_skews[other] = skew_bound;
+  } else {
+    timecheck_skews[other] = (timecheck_skews[other]*0.8)+(skew_bound*0.2);
+  }
+
+  timecheck_acks++;
+  if (timecheck_acks == quorum.size()) {
+    dout(10) << __func__ << " got pongs from everybody ("
+             << timecheck_acks << " total)" << dendl;
+    assert(timecheck_skews.size() == timecheck_acks);
+    assert(timecheck_waiting.empty());
+    // everyone has acked, so bump the round to finish it.
+    timecheck_finish_round();
+  }
+}
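+
+// To make the skew bound concrete, a worked instance of the arithmetic
+// above (all values hypothetical, in seconds):
+//
+//   m->timestamp = 100.300, curr_time = 100.000, smoothed latency = 0.050
+//   delta      = 100.300 - 100.000   =  0.300
+//   abs_delta  = |0.300|             =  0.300
+//   skew_bound = 0.300 - 0.050       =  0.250   (delta > 0: peer is ahead)
+//
+// Had abs_delta been, say, 0.030 (below the latency), skew_bound would
+// have clamped to 0 and the apparent skew written off as latency noise.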
+
+void Monitor::handle_timecheck_peon(MTimeCheck *m)
+{
+  dout(10) << __func__ << " " << *m << dendl;
+
+  assert(is_peon());
+  assert(m->op == MTimeCheck::OP_PING || m->op == MTimeCheck::OP_REPORT);
+
+  if (m->epoch != get_epoch()) {
+    dout(1) << __func__ << " got wrong epoch "
+            << "(ours " << get_epoch()
+            << " theirs: " << m->epoch << ") -- discarding" << dendl;
+    return;
+  }
+
+  if (m->round < timecheck_round) {
+    dout(1) << __func__ << " got old round " << m->round
+            << " current " << timecheck_round
+            << " (epoch " << get_epoch() << ") -- discarding" << dendl;
+    return;
+  }
+
+  timecheck_round = m->round;
+
+  if (m->op == MTimeCheck::OP_REPORT) {
+    assert((timecheck_round % 2) == 0);
+    timecheck_latencies.swap(m->latencies);
+    timecheck_skews.swap(m->skews);
+    return;
+  }
+
+  assert((timecheck_round % 2) != 0);
+  MTimeCheck *reply = new MTimeCheck(MTimeCheck::OP_PONG);
+  utime_t curr_time = ceph_clock_now(g_ceph_context);
+  reply->timestamp = curr_time;
+  reply->epoch = m->epoch;
+  reply->round = m->round;
+  dout(10) << __func__ << " send " << *reply
+           << " to " << m->get_source_inst() << dendl;
+  messenger->send_message(reply, m->get_connection());
+}
+
+void Monitor::handle_timecheck(MTimeCheck *m)
+{
+  dout(10) << __func__ << " " << *m << dendl;
+
+  if (is_leader()) {
+    if (m->op != MTimeCheck::OP_PONG) {
+      dout(1) << __func__ << " drop unexpected msg (not pong)" << dendl;
+    } else {
+      handle_timecheck_leader(m);
+    }
+  } else if (is_peon()) {
+    if (m->op != MTimeCheck::OP_PING && m->op != MTimeCheck::OP_REPORT) {
+      dout(1) << __func__ << " drop unexpected msg (not ping or report)" << dendl;
+    } else {
+      handle_timecheck_peon(m);
+    }
+  } else {
+    dout(1) << __func__ << " drop unexpected msg" << dendl;
+  }
+  m->put();
+}
+
+void Monitor::handle_subscribe(MMonSubscribe *m)
+{
+  dout(10) << "handle_subscribe " << *m << dendl;
+
+  bool reply = false;
+
+  MonSession *s = static_cast<MonSession *>(m->get_connection()->get_priv());
+  if (!s) {
+    dout(10) << " no session, dropping" << dendl;
+    m->put();
+    return;
+  }
+
+  s->until = ceph_clock_now(g_ceph_context);
+  s->until += g_conf->mon_subscribe_interval;
+  for (map<string, ceph_mon_subscribe_item>::iterator p = m->what.begin();
+       p != m->what.end();
+       ++p) {
+    // if there are any non-onetime subscriptions, we need to reply to start the resubscribe timer
+    if ((p->second.flags & CEPH_SUBSCRIBE_ONETIME) == 0)
+      reply = true;
+
+    session_map.add_update_sub(s, p->first, p->second.start,
+                               p->second.flags & CEPH_SUBSCRIBE_ONETIME,
+                               m->get_connection()->has_feature(CEPH_FEATURE_INCSUBOSDMAP));
+
+    if (p->first == "mdsmap") {
+      if ((int)s->is_capable("mds", MON_CAP_R)) {
+        mdsmon()->check_sub(s->sub_map["mdsmap"]);
+      }
+    } else if (p->first == "osdmap") {
+      if ((int)s->is_capable("osd", MON_CAP_R)) {
+        osdmon()->check_sub(s->sub_map["osdmap"]);
+      }
+    } else if (p->first == "osd_pg_creates") {
+      if ((int)s->is_capable("osd", MON_CAP_W)) {
+        pgmon()->check_sub(s->sub_map["osd_pg_creates"]);
+      }
+    } else if (p->first == "monmap") {
+      check_sub(s->sub_map["monmap"]);
+    } else if (logmon()->sub_name_to_id(p->first) >= 0) {
+      logmon()->check_sub(s->sub_map[p->first]);
+    }
+  }
+
+  // ???
+
+  if (reply)
+    messenger->send_message(new MMonSubscribeAck(monmap->get_fsid(), (int)g_conf->mon_subscribe_interval),
+                            m->get_source_inst());
+
+  s->put();
+  m->put();
+}
+
+void Monitor::handle_get_version(MMonGetVersion *m)
+{
+  dout(10) << "handle_get_version " << *m << dendl;
+  PaxosService *svc = NULL;
+
+  MonSession *s = static_cast<MonSession *>(m->get_connection()->get_priv());
+  if (!s) {
+    dout(10) << " no session, dropping" << dendl;
+    m->put();
+    return;
+  }
+
+  if (!is_leader() && !is_peon()) {
+    dout(10) << " waiting for quorum" << dendl;
+    waitfor_quorum.push_back(new C_RetryMessage(this, m));
+    goto out;
+  }
+
+  if (m->what == "mdsmap") {
+    svc = mdsmon();
+  } else if (m->what == "osdmap") {
+    svc = osdmon();
+  } else if (m->what == "monmap") {
+    svc = monmon();
+  } else {
+    derr << "invalid map type " << m->what << dendl;
+  }
+
+  if (svc) {
+    if (!svc->is_readable()) {
+      svc->wait_for_readable(new C_RetryMessage(this, m));
+      goto out;
+    }
+    MMonGetVersionReply *reply = new MMonGetVersionReply();
+    reply->handle = m->handle;
+    reply->version = svc->get_last_committed();
+    reply->oldest_version = svc->get_first_committed();
+    messenger->send_message(reply, m->get_source_inst());
+  }
+
+  m->put();
+
+ out:
+  s->put();
+}
+
+bool Monitor::ms_handle_reset(Connection *con)
+{
+  dout(10) << "ms_handle_reset " << con << " " << con->get_peer_addr() << dendl;
+
+  // ignore lossless monitor sessions
+  if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON)
+    return false;
+
+  MonSession *s = static_cast<MonSession *>(con->get_priv());
+  if (!s)
+    return false;
+
+  // break any con <-> session ref cycle
+  s->con->set_priv(NULL);
+
+  if (is_shutdown())
+    return false;
+
+  Mutex::Locker l(lock);
+
+  dout(10) << "reset/close on session " << s->inst << dendl;
+  if (!s->closed)
+    remove_session(s);
+  s->put();
+  return true;
+}
+
+void Monitor::check_subs()
+{
+  string type = "monmap";
+  if (session_map.subs.count(type) == 0)
+    return;
+  xlist<Subscription*>::iterator p = session_map.subs[type]->begin();
+  while (!p.end()) {
+    Subscription *sub = *p;
+    ++p;
+    check_sub(sub);
+  }
+}
+
+void Monitor::check_sub(Subscription *sub)
+{
+  dout(10) << "check_sub monmap next " << sub->next << " have " << monmap->get_epoch() << dendl;
+  if (sub->next <= monmap->get_epoch()) {
+    send_latest_monmap(sub->session->con.get());
+    if (sub->onetime)
+      session_map.remove_sub(sub);
+    else
+      sub->next = monmap->get_epoch() + 1;
+  }
+}
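+
+// Tying handle_subscribe() and check_sub() together, a typical monmap
+// subscription plays out roughly like this (epochs are illustrative):
+//
+//   a client subscribes with what["monmap"] = { start: 5, flags: 0 };
+//   monmap->get_epoch() == 7 >= 5, so send_latest_monmap() fires at once,
+//   and, the sub not being onetime, sub->next becomes 8 and the client
+//   gets an MMonSubscribeAck carrying mon_subscribe_interval to restart
+//   its resubscribe timer; a CEPH_SUBSCRIBE_ONETIME sub would instead be
+//   removed right after the first send.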
+
+
+// -----
+
+void Monitor::send_latest_monmap(Connection *con)
+{
+  bufferlist bl;
+  monmap->encode(bl, con->get_features());
+  messenger->send_message(new MMonMap(bl), con);
+}
+
+void Monitor::handle_mon_get_map(MMonGetMap *m)
+{
+  dout(10) << "handle_mon_get_map" << dendl;
+  send_latest_monmap(m->get_connection().get());
+  m->put();
+}
+
+
+
+// ----------------------------------------------
+// scrub
+
+int Monitor::scrub()
+{
+  dout(10) << __func__ << dendl;
+  assert(is_leader());
+
+  if ((get_quorum_features() & CEPH_FEATURE_MON_SCRUB) == 0) {
+    clog.warn() << "scrub not supported by entire quorum\n";
+    return -EOPNOTSUPP;
+  }
+
+  if (!scrub_result.empty()) {
+    clog.info() << "scrub already in progress\n";
+    return -EBUSY;
+  }
+
+  scrub_result.clear();
+  scrub_version = paxos->get_version();
+
+  for (set<int>::iterator p = quorum.begin();
+       p != quorum.end();
+       ++p) {
+    if (*p == rank)
+      continue;
+    MMonScrub *r = new MMonScrub(MMonScrub::OP_SCRUB, scrub_version);
+    messenger->send_message(r, monmap->get_inst(*p));
+  }
+
+  // scrub my keys
+  _scrub(&scrub_result[rank]);
+
+  if (scrub_result.size() == quorum.size())
+    scrub_finish();
+
+  return 0;
+}
+
+void Monitor::handle_scrub(MMonScrub *m)
+{
+  dout(10) << __func__ << " " << *m << dendl;
+  switch (m->op) {
+  case MMonScrub::OP_SCRUB:
+    {
+      if (!is_peon())
+        break;
+      if (m->version != paxos->get_version())
+        break;
+      MMonScrub *reply = new MMonScrub(MMonScrub::OP_RESULT, m->version);
+      _scrub(&reply->result);
+      messenger->send_message(reply, m->get_connection());
+    }
+    break;
+
+  case MMonScrub::OP_RESULT:
+    {
+      if (!is_leader())
+        break;
+      if (m->version != scrub_version)
+        break;
+      int from = m->get_source().num();
+      assert(scrub_result.count(from) == 0);
+      scrub_result[from] = m->result;
+
+      if (scrub_result.size() == quorum.size())
+        scrub_finish();
+    }
+    break;
+  }
+  m->put();
+}
+
+void Monitor::_scrub(ScrubResult *r)
+{
+  set<string> prefixes = get_sync_targets_names();
+  prefixes.erase("paxos");  // exclude paxos, as this one may have extra states for proposals, etc.
+
+  dout(10) << __func__ << " prefixes " << prefixes << dendl;
+
+  pair<string,string> start;
+  MonitorDBStore::Synchronizer synchronizer = store->get_synchronizer(start, prefixes);
+
+  while (synchronizer->has_next_chunk()) {
+    pair<string,string> k = synchronizer->get_next_key();
+    bufferlist bl;
+    store->get(k.first, k.second, bl);
+    dout(30) << __func__ << " " << k << " bl " << bl.length() << " bytes crc " << bl.crc32c(0) << dendl;
+    r->prefix_keys[k.first]++;
+    if (r->prefix_crc.count(k.first) == 0)
+      r->prefix_crc[k.first] = 0;
+    r->prefix_crc[k.first] = bl.crc32c(r->prefix_crc[k.first]);
+  }
+}
+
+void Monitor::scrub_finish()
+{
+  dout(10) << __func__ << dendl;
+
+  // compare
+  int errors = 0;
+  ScrubResult& mine = scrub_result[rank];
+  for (map<int,ScrubResult>::iterator p = scrub_result.begin();
+       p != scrub_result.end();
+       ++p) {
+    if (p->first == rank)
+      continue;
+    if (p->second != mine) {
+      ++errors;
+      clog.error() << "scrub mismatch" << "\n";
+      clog.error() << " mon." << rank << " " << mine << "\n";
+      clog.error() << " mon." << p->first << " " << p->second << "\n";
+    }
+  }
+  if (!errors)
+    clog.info() << "scrub ok on " << quorum << ": " << mine << "\n";
+
+  scrub_reset();
+}
+
+void Monitor::scrub_reset()
+{
+  dout(10) << __func__ << dendl;
+  scrub_version = 0;
+  scrub_result.clear();
+}
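+
+// For intuition, what scrub_finish() actually compares: each ScrubResult
+// carries per-prefix key counts and running CRCs, e.g. (hypothetical):
+//
+//   mon.0  prefix_keys {"osdmap": 120}  prefix_crc {"osdmap": 0x1a2b3c4d}
+//   mon.1  prefix_keys {"osdmap": 120}  prefix_crc {"osdmap": 0x00ff00ff}
+//
+// operator!= on the two results spots the differing crc, and the leader
+// emits the 'scrub mismatch' cluster-log lines above for mon.1.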
+
+
+
+/************ TICK ***************/
+
+class C_Mon_Tick : public Context {
+  Monitor *mon;
+public:
+  C_Mon_Tick(Monitor *m) : mon(m) {}
+  void finish(int r) {
+    mon->tick();
+  }
+};
+
+void Monitor::new_tick()
+{
+  C_Mon_Tick *ctx = new C_Mon_Tick(this);
+  timer.add_event_after(g_conf->mon_tick_interval, ctx);
+}
+
+void Monitor::tick()
+{
+  // ok go.
+  dout(11) << "tick" << dendl;
+
+  for (vector<PaxosService*>::iterator p = paxos_service.begin(); p != paxos_service.end(); ++p) {
+    (*p)->tick();
+    (*p)->maybe_trim();
+  }
+
+  // trim sessions
+  utime_t now = ceph_clock_now(g_ceph_context);
+  xlist<MonSession*>::iterator p = session_map.sessions.begin();
+  while (!p.end()) {
+    MonSession *s = *p;
+    ++p;
+
+    // don't trim monitors
+    if (s->inst.name.is_mon())
+      continue;
+
+    if (!s->until.is_zero() && s->until < now) {
+      dout(10) << " trimming session " << s->con << " " << s->inst
+               << " (until " << s->until << " < now " << now << ")" << dendl;
+      messenger->mark_down(s->con);
+      remove_session(s);
+    } else if (!exited_quorum.is_zero()) {
+      if (now > (exited_quorum + 2 * g_conf->mon_lease)) {
+        // boot the client Session because we've taken too long getting back in
+        dout(10) << " trimming session " << s->con << " " << s->inst
+                 << " because we've been out of quorum too long" << dendl;
+        messenger->mark_down(s->con);
+        remove_session(s);
+      }
+    }
+  }
+
+  sync_trim_providers();
+
+  if (!maybe_wait_for_quorum.empty()) {
+    finish_contexts(g_ceph_context, maybe_wait_for_quorum);
+  }
+
+  if (is_leader() && paxos->is_active() && fingerprint.is_zero()) {
+    // this is only necessary on upgraded clusters.
+    MonitorDBStore::Transaction t;
+    prepare_new_fingerprint(&t);
+    bufferlist tbl;
+    t.encode(tbl);
+    paxos->propose_new_value(tbl, new C_NoopContext);
+  }
+
+  new_tick();
+}
+
+void Monitor::prepare_new_fingerprint(MonitorDBStore::Transaction *t)
+{
+  uuid_d nf;
+  nf.generate_random();
+  dout(10) << __func__ << " proposing cluster_fingerprint " << nf << dendl;
+
+  bufferlist bl;
+  ::encode(nf, bl);
+  t->put(MONITOR_NAME, "cluster_fingerprint", bl);
+}
+
+int Monitor::check_fsid()
+{
+  if (!store->exists(MONITOR_NAME, "cluster_uuid"))
+    return -ENOENT;
+
+  bufferlist ebl;
+  int r = store->get(MONITOR_NAME, "cluster_uuid", ebl);
+  assert(r == 0);
+
+  string es(ebl.c_str(), ebl.length());
+
+  // only keep the first line
+  size_t pos = es.find_first_of('\n');
+  if (pos != string::npos)
+    es.resize(pos);
+
+  dout(10) << "check_fsid cluster_uuid contains '" << es << "'" << dendl;
+  uuid_d ondisk;
+  if (!ondisk.parse(es.c_str())) {
+    derr << "error: unable to parse uuid" << dendl;
+    return -EINVAL;
+  }
+
+  if (monmap->get_fsid() != ondisk) {
+    derr << "error: cluster_uuid file exists with value " << ondisk
+         << ", != our uuid " << monmap->get_fsid() << dendl;
+    return -EEXIST;
+  }
+
+  return 0;
+}
+
+int Monitor::write_fsid()
+{
+  MonitorDBStore::Transaction t;
+  int r = write_fsid(t);
+  store->apply_transaction(t);
+  return r;
+}
+
+int Monitor::write_fsid(MonitorDBStore::Transaction &t)
+{
+  ostringstream ss;
+  ss << monmap->get_fsid() << "\n";
+  string us = ss.str();
+
+  bufferlist b;
+  b.append(us);
+
+  t.put(MONITOR_NAME, "cluster_uuid", b);
+  return 0;
+}
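+
+// check_fsid() and write_fsid() round-trip through the same on-disk
+// format: the stored value is the uuid followed by a newline, and the
+// reader trims at the first '\n' before parsing. With a hypothetical
+// fsid, roughly:
+//
+//   write_fsid: "cluster_uuid" <- "de305d54-75b4-431b-adb2-eb6b9e546014\n"
+//   check_fsid: read, truncate at '\n', uuid_d::parse(), then compare
+//               against monmap->get_fsid(); a mismatch yields -EEXIST,
+//               an unparsable value -EINVAL, a missing key -ENOENT.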
+
+/*
+ * this is the closest thing to a traditional 'mkfs' for ceph.
+ * initialize the monitor state machines to their initial values.
+ */
+int Monitor::mkfs(bufferlist& osdmapbl)
+{
+  MonitorDBStore::Transaction t;
+
+  // verify cluster fsid
+  int r = check_fsid();
+  if (r < 0 && r != -ENOENT)
+    return r;
+
+  bufferlist magicbl;
+  magicbl.append(CEPH_MON_ONDISK_MAGIC);
+  magicbl.append("\n");
+  t.put(MONITOR_NAME, "magic", magicbl);
+
+
+  features = get_supported_features();
+  write_features(t);
+
+  // save monmap, osdmap, keyring.
+  bufferlist monmapbl;
+  monmap->encode(monmapbl, CEPH_FEATURES_ALL);
+  monmap->set_epoch(0);     // must be 0 to avoid confusing first MonmapMonitor::update_from_paxos()
+  t.put("mkfs", "monmap", monmapbl);
+
+  if (osdmapbl.length()) {
+    // make sure it's a valid osdmap
+    try {
+      OSDMap om;
+      om.decode(osdmapbl);
+    }
+    catch (buffer::error& e) {
+      derr << "error decoding provided osdmap: " << e.what() << dendl;
+      return -EINVAL;
+    }
+    t.put("mkfs", "osdmap", osdmapbl);
+  }
+
+  if (is_keyring_required()) {
+    KeyRing keyring;
+    string keyring_filename;
+    if (!ceph_resolve_file_search(g_conf->keyring, keyring_filename)) {
+      derr << "unable to find a keyring file on " << g_conf->keyring << dendl;
+      if (g_conf->key != "") {
+        string keyring_plaintext = "[mon.]\n\tkey = " + g_conf->key +
+          "\n\tcaps mon = \"allow *\"\n";
+        bufferlist bl;
+        bl.append(keyring_plaintext);
+        try {
+          bufferlist::iterator i = bl.begin();
+          keyring.decode_plaintext(i);
+        }
+        catch (const buffer::error& e) {
+          derr << "error decoding keyring " << keyring_plaintext
+               << ": " << e.what() << dendl;
+          return -EINVAL;
+        }
+      } else {
+        return -ENOENT;
+      }
+    } else {
+      r = keyring.load(g_ceph_context, keyring_filename);
+      if (r < 0) {
+        derr << "unable to load initial keyring " << g_conf->keyring << dendl;
+        return r;
+      }
+    }
+
+    // put mon. key in external keyring; seed with everything else.
+    extract_save_mon_key(keyring);
+
+    bufferlist keyringbl;
+    keyring.encode_plaintext(keyringbl);
+    t.put("mkfs", "keyring", keyringbl);
+  }
+  write_fsid(t);
+  store->apply_transaction(t);
+
+  return 0;
+}
+
+int Monitor::write_default_keyring(bufferlist& bl)
+{
+  ostringstream os;
+  os << g_conf->mon_data << "/keyring";
+
+  int err = 0;
+  int fd = ::open(os.str().c_str(), O_WRONLY|O_CREAT, 0644);
+  if (fd < 0) {
+    err = -errno;
+    dout(0) << __func__ << " failed to open " << os.str()
+            << ": " << cpp_strerror(err) << dendl;
+    return err;
+  }
+
+  err = bl.write_fd(fd);
+  if (!err)
+    ::fsync(fd);
+  ::close(fd);
+
+  return err;
+}
+
+void Monitor::extract_save_mon_key(KeyRing& keyring)
+{
+  EntityName mon_name;
+  mon_name.set_type(CEPH_ENTITY_TYPE_MON);
+  EntityAuth mon_key;
+  if (keyring.get_auth(mon_name, mon_key)) {
+    dout(10) << "extract_save_mon_key moving mon. key to separate keyring" << dendl;
+    KeyRing pkey;
+    pkey.add(mon_name, mon_key);
+    bufferlist bl;
+    pkey.encode_plaintext(bl);
+    write_default_keyring(bl);
+    keyring.remove(mon_name);
+  }
+}
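+
+// When only g_conf->key is set, the plaintext seeded in mkfs() above
+// renders to a minimal keyring like the following (key value made up),
+// and extract_save_mon_key() then moves exactly this mon. section out
+// into $mon_data/keyring:
+//
+//   [mon.]
+//           key = AQBW8s9cAAAAABAAq5P1XkMH+5Ji6oSpXc7pwQ==
+//           caps mon = "allow *"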
+
+bool Monitor::ms_get_authorizer(int service_id, AuthAuthorizer **authorizer, bool force_new)
+{
+  dout(10) << "ms_get_authorizer for " << ceph_entity_type_name(service_id) << dendl;
+
+  if (is_shutdown())
+    return false;
+
+  // we only connect to other monitors; everyone else connects to us.
+  if (service_id != CEPH_ENTITY_TYPE_MON)
+    return false;
+
+  if (!auth_cluster_required.is_supported_auth(CEPH_AUTH_CEPHX))
+    return false;
+
+  CephXServiceTicketInfo auth_ticket_info;
+  CephXSessionAuthInfo info;
+  int ret;
+  EntityName name;
+  name.set_type(CEPH_ENTITY_TYPE_MON);
+
+  auth_ticket_info.ticket.name = name;
+  auth_ticket_info.ticket.global_id = 0;
+
+  CryptoKey secret;
+  if (!keyring.get_secret(name, secret) &&
+      !key_server.get_secret(name, secret)) {
+    dout(0) << " couldn't get secret for mon service from keyring or keyserver" << dendl;
+    stringstream ss, ds;
+    int err = key_server.list_secrets(ds);
+    if (err < 0)
+      ss << "no installed auth entries!";
+    else
+      ss << "installed auth entries:";
+    dout(0) << ss.str() << "\n" << ds.str() << dendl;
+    return false;
+  }
+
+  /* mon to mon authentication uses the private monitor shared key and not the
+     rotating key */
+  ret = key_server.build_session_auth_info(service_id, auth_ticket_info, info, secret, (uint64_t)-1);
+  if (ret < 0) {
+    dout(0) << "ms_get_authorizer failed to build session auth_info for use with mon ret " << ret << dendl;
+    return false;
+  }
+
+  CephXTicketBlob blob;
+  if (!cephx_build_service_ticket_blob(cct, info, blob)) {
+    dout(0) << "ms_get_authorizer failed to build service ticket for use with mon" << dendl;
+    return false;
+  }
+  bufferlist ticket_data;
+  ::encode(blob, ticket_data);
+
+  bufferlist::iterator iter = ticket_data.begin();
+  CephXTicketHandler handler(g_ceph_context, service_id);
+  ::decode(handler.ticket, iter);
+
+  handler.session_key = info.session_key;
+
+  *authorizer = handler.build_authorizer(0);
+
+  return true;
+}
+
+bool Monitor::ms_verify_authorizer(Connection *con, int peer_type,
+                                   int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
+                                   bool& isvalid, CryptoKey& session_key)
+{
+  dout(10) << "ms_verify_authorizer " << con->get_peer_addr()
+           << " " << ceph_entity_type_name(peer_type)
+           << " protocol " << protocol << dendl;
+
+  if (is_shutdown())
+    return false;
+
+  if (peer_type == CEPH_ENTITY_TYPE_MON &&
+      auth_cluster_required.is_supported_auth(CEPH_AUTH_CEPHX)) {
+    // monitor, and cephx is enabled
+    isvalid = false;
+    if (protocol == CEPH_AUTH_CEPHX) {
+      bufferlist::iterator iter = authorizer_data.begin();
+      CephXServiceTicketInfo auth_ticket_info;
+
+      if (authorizer_data.length()) {
+        int ret = cephx_verify_authorizer(g_ceph_context, &keyring, iter,
+                                          auth_ticket_info, authorizer_reply);
+        if (ret >= 0) {
+          session_key = auth_ticket_info.session_key;
+          isvalid = true;
+        } else {
+          dout(0) << "ms_verify_authorizer bad authorizer from mon " << con->get_peer_addr() << dendl;
+        }
+      }
+    } else {
+      dout(0) << "ms_verify_authorizer cephx enabled, but no authorizer (required for mon)" << dendl;
+    }
+  } else {
+    // who cares.
+    isvalid = true;
+  }
+  return true;
+}
+
+#undef dout_prefix
+#define dout_prefix *_dout
+
+void Monitor::StoreConverter::_convert_finish_features(
+    MonitorDBStore::Transaction &t)
+{
+  dout(20) << __func__ << dendl;
+
+  assert(db->exists(MONITOR_NAME, COMPAT_SET_LOC));
+  bufferlist features_bl;
+  db->get(MONITOR_NAME, COMPAT_SET_LOC, features_bl);
+  assert(features_bl.length());
+
+  CompatSet features;
+  bufferlist::iterator p = features_bl.begin();
+  features.decode(p);
+
+  assert(features.incompat.contains(CEPH_MON_FEATURE_INCOMPAT_GV));
+  features.incompat.remove(CEPH_MON_FEATURE_INCOMPAT_GV);
+  assert(!features.incompat.contains(CEPH_MON_FEATURE_INCOMPAT_GV));
+
+  features.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_SINGLE_PAXOS);
+  assert(features.incompat.contains(CEPH_MON_FEATURE_INCOMPAT_SINGLE_PAXOS));
+
+  features_bl.clear();
+  features.encode(features_bl);
+
+  dout(20) << __func__ << " new features " << features << dendl;
+  t.put(MONITOR_NAME, COMPAT_SET_LOC, features_bl);
+}
+
+
+bool Monitor::StoreConverter::_check_gv_store()
+{
+  dout(20) << __func__ << dendl;
+  if (!store->exists_bl_ss(COMPAT_SET_LOC, 0))
+    return false;
+
+  bufferlist features_bl;
+  store->get_bl_ss_safe(features_bl, COMPAT_SET_LOC, 0);
+  if (!features_bl.length()) {
+    dout(20) << __func__ << " on-disk features length is zero" << dendl;
+    return false;
+  }
+  CompatSet features;
+  bufferlist::iterator p = features_bl.begin();
+  features.decode(p);
+  return (features.incompat.contains(CEPH_MON_FEATURE_INCOMPAT_GV));
+}
+
+int Monitor::StoreConverter::needs_conversion()
+{
+  bufferlist magicbl;
+  int ret = 0;
+
+  dout(10) << "check if store needs conversion from legacy format" << dendl;
+  _init();
+
+  int err = store->mount();
+  if (err < 0) {
+    if (err == -ENOENT) {
+      derr << "unable to mount monitor store: "
+           << cpp_strerror(err) << dendl;
+    } else {
+      derr << "it appears that another monitor is running: "
+           << cpp_strerror(err) << dendl;
+    }
+    ret = err;
+    goto out;
+  }
+  assert(err == 0);
+
+  if (store->exists_bl_ss("magic", 0)) {
+    if (_check_gv_store()) {
+      dout(1) << "found old GV monitor store format "
+              << "-- should convert!" << dendl;
+      ret = 1;
+    } else {
+      dout(0) << "Existing monitor store has not been converted "
+              << "to 0.52 (bobtail) format" << dendl;
+      assert(0 == "Existing store has not been converted to 0.52 format");
+    }
+  }
+  assert(!store->umount());
+
+out:
+  _deinit();
+  return ret;
+}
+
+int Monitor::StoreConverter::convert()
+{
+  _init();
+  assert(!store->mount());
+  if (db->exists("mon_convert", "on_going")) {
+    dout(0) << __func__ << " found a mon store in mid-conversion; abort!"
+            << dendl;
+    return -EEXIST;
+  }
+
+  _mark_convert_start();
+  _convert_monitor();
+  _convert_machines();
+  _convert_paxos();
+  _mark_convert_finish();
+
+  store->umount();
+  _deinit();
+
+  dout(0) << __func__ << " finished conversion" << dendl;
+
+  return 0;
+}
+
+void Monitor::StoreConverter::_convert_monitor()
+{
+  dout(10) << __func__ << dendl;
+
+  assert(store->exists_bl_ss("magic"));
+  assert(store->exists_bl_ss("keyring"));
+  assert(store->exists_bl_ss("feature_set"));
+  assert(store->exists_bl_ss("election_epoch"));
+
+  MonitorDBStore::Transaction tx;
+
+  if (store->exists_bl_ss("joined")) {
+    version_t joined = store->get_int("joined");
+    tx.put(MONITOR_NAME, "joined", joined);
+  }
+
+  vector<string> keys;
+  keys.push_back("magic");
+  keys.push_back("feature_set");
+  keys.push_back("cluster_uuid");
+
+  vector<string>::iterator it;
+  for (it = keys.begin(); it != keys.end(); ++it) {
+    if (!store->exists_bl_ss((*it).c_str()))
+      continue;
+
+    bufferlist bl;
+    int r = store->get_bl_ss(bl, (*it).c_str(), 0);
+    assert(r > 0);
+    tx.put(MONITOR_NAME, *it, bl);
+  }
+  version_t election_epoch = store->get_int("election_epoch");
+  tx.put(MONITOR_NAME, "election_epoch", election_epoch);
+
+  assert(!tx.empty());
+  db->apply_transaction(tx);
+  dout(10) << __func__ << " finished" << dendl;
+}
+
+void Monitor::StoreConverter::_convert_machines(string machine)
+{
+  dout(10) << __func__ << " " << machine << dendl;
+
+  version_t first_committed =
+    store->get_int(machine.c_str(), "first_committed");
+  version_t last_committed =
+    store->get_int(machine.c_str(), "last_committed");
+
+  version_t accepted_pn = store->get_int(machine.c_str(), "accepted_pn");
+  version_t last_pn = store->get_int(machine.c_str(), "last_pn");
+
+  if (accepted_pn > highest_accepted_pn)
+    highest_accepted_pn = accepted_pn;
+  if (last_pn > highest_last_pn)
+    highest_last_pn = last_pn;
+
+  string machine_gv(machine);
+  machine_gv.append("_gv");
+  bool has_gv = true;
+
+  if (!store->exists_bl_ss(machine_gv.c_str())) {
+    dout(1) << __func__ << " " << machine
+            << " no gv dir '" << machine_gv << "'" << dendl;
+    has_gv = false;
+  }
+
+  for (version_t ver = first_committed; ver <= last_committed; ver++) {
+    if (!store->exists_bl_sn(machine.c_str(), ver)) {
+      dout(20) << __func__ << " " << machine
+               << " ver " << ver << " dne" << dendl;
+      continue;
+    }
+
+    bufferlist bl;
+    int r = store->get_bl_sn(bl, machine.c_str(), ver);
+    assert(r >= 0);
+    dout(20) << __func__ << " " << machine
+             << " ver " << ver << " bl " << bl.length() << dendl;
+
+    MonitorDBStore::Transaction tx;
+    tx.put(machine, ver, bl);
+    tx.put(machine, "last_committed", ver);
+
+    if (has_gv && store->exists_bl_sn(machine_gv.c_str(), ver)) {
+      stringstream s;
+      s << ver;
+      string ver_str = s.str();
+
+      version_t gv = store->get_int(machine_gv.c_str(), ver_str.c_str());
+      dout(20) << __func__ << " " << machine
+               << " ver " << ver << " -> " << gv << dendl;
+
+      MonitorDBStore::Transaction paxos_tx;
+
+      if (gvs.count(gv) == 0) {
+        gvs.insert(gv);
+      } else {
+        dout(0) << __func__ << " " << machine
+                << " gv " << gv << " already exists"
+                << dendl;
+
+        // Duplicates aren't supposed to happen, but an old bug introduced
+        // them and the mds state machine wasn't ever trimmed, so many users
+        // will see them.  So we'll just merge them all in one
+        // single paxos version.
+        // We know that they are either from another paxos machine or
+        // they are from the same paxos machine but their version is
+        // lower than ours -- given that we are iterating all versions
+        // from the lowest to the highest, duh!
+        // We'll just append our stuff to the existing paxos transaction
+        // as if nothing had happened.
+
+        // Just make sure we are correct. This shouldn't take long and
+        // should never be triggered!
+        set<pair<string,version_t> >& s = gv_map[gv];
+        for (set<pair<string,version_t> >::iterator it = s.begin();
+             it != s.end(); ++it) {
+          if (it->first == machine)
+            assert(it->second + 1 == ver);
+        }
+
+        bufferlist paxos_bl;
+        int r = db->get("paxos", gv, paxos_bl);
+        assert(r >= 0);
+        paxos_tx.append_from_encoded(paxos_bl);
+      }
+      gv_map[gv].insert(make_pair(machine,ver));
+
+      bufferlist tx_bl;
+      tx.encode(tx_bl);
+      paxos_tx.append_from_encoded(tx_bl);
+      bufferlist paxos_bl;
+      paxos_tx.encode(paxos_bl);
+      tx.put("paxos", gv, paxos_bl);
+    }
+    db->apply_transaction(tx);
+  }
+
+  version_t lc = db->get(machine, "last_committed");
+  dout(20) << __func__ << " lc " << lc << " last_committed " << last_committed << dendl;
+  assert(lc == last_committed);
+
+  MonitorDBStore::Transaction tx;
+  tx.put(machine, "first_committed", first_committed);
+  tx.put(machine, "last_committed", last_committed);
+  tx.put(machine, "conversion_first", first_committed);
+
+  if (store->exists_bl_ss(machine.c_str(), "latest")) {
+    bufferlist latest_bl_raw;
+    int r = store->get_bl_ss(latest_bl_raw, machine.c_str(), "latest");
+    assert(r >= 0);
+    if (!latest_bl_raw.length()) {
+      dout(20) << __func__ << " machine " << machine
+               << " skip latest with size 0" << dendl;
+      goto out;
+    }
+
+    tx.put(machine, "latest", latest_bl_raw);
+
+    bufferlist::iterator lbl_it = latest_bl_raw.begin();
+    bufferlist latest_bl;
+    version_t latest_ver;
+    ::decode(latest_ver, lbl_it);
+    ::decode(latest_bl, lbl_it);
+
+    dout(20) << __func__ << " machine " << machine
+             << " latest ver " << latest_ver << dendl;
+
+    tx.put(machine, "full_latest", latest_ver);
+    stringstream os;
+    os << "full_" << latest_ver;
+    tx.put(machine, os.str(), latest_bl);
+  }
+out:
+  db->apply_transaction(tx);
+  dout(10) << __func__ << " machine " << machine << " finished" << dendl;
+}
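+
+// A worked instance of the duplicate-gv merge above (versions made up):
+// while converting machine "osdmap", ver 7 was stored under paxos gv 42;
+// converting machine "pgmap", ver 10 then reports gv 42 as well (the old
+// bug). The encoded transaction already sitting at paxos/42 is fetched,
+// the pgmap transaction is appended to it via append_from_encoded(), and
+// paxos/42 is rewritten so both state machines replay from that single
+// paxos version.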
+
+void Monitor::StoreConverter::_convert_osdmap_full()
+{
+  dout(10) << __func__ << dendl;
+  version_t first_committed =
+    store->get_int("osdmap", "first_committed");
+  version_t last_committed =
+    store->get_int("osdmap", "last_committed");
+
+  int err = 0;
+  for (version_t ver = first_committed; ver <= last_committed; ver++) {
+    if (!store->exists_bl_sn("osdmap_full", ver)) {
+      dout(20) << __func__ << " osdmap_full ver " << ver << " dne" << dendl;
+      err++;
+      continue;
+    }
+
+    bufferlist bl;
+    int r = store->get_bl_sn(bl, "osdmap_full", ver);
+    assert(r >= 0);
+    dout(20) << __func__ << " osdmap_full ver " << ver
+             << " bl " << bl.length() << " bytes" << dendl;
+
+    string full_key = "full_" + stringify(ver);
+    MonitorDBStore::Transaction tx;
+    tx.put("osdmap", full_key, bl);
+    db->apply_transaction(tx);
+  }
+  dout(10) << __func__ << " found " << err << " conversion errors!" << dendl;
+  assert(err == 0);
+}
+
+void Monitor::StoreConverter::_convert_paxos()
+{
+  dout(10) << __func__ << dendl;
+  assert(!gvs.empty());
+
+  set<version_t>::reverse_iterator rit = gvs.rbegin();
+  version_t highest_gv = *rit;
+  version_t last_gv = highest_gv;
+
+  int n = 0;
+  int max_versions = (g_conf->paxos_max_join_drift*2);
+  for (; (rit != gvs.rend()) && (n < max_versions); ++rit, ++n) {
+    version_t gv = *rit;
+
+    if (last_gv == gv)
+      continue;
+    if ((last_gv - gv) > 1) {
+      // we are done; we found a gap and we are only interested in keeping
+      // contiguous paxos versions.
+      break;
+    }
+    last_gv = gv;
+  }
+
+  // erase all paxos versions in [first, last_gv), with first being the
+  // first gv in the map.
+  MonitorDBStore::Transaction tx;
+  set<version_t>::iterator it = gvs.begin();
+  dout(1) << __func__ << " first gv " << (*it)
+          << " last gv " << last_gv << dendl;
+  for (; it != gvs.end() && (*it < last_gv); ++it) {
+    tx.erase("paxos", *it);
+  }
+  tx.put("paxos", "first_committed", last_gv);
+  tx.put("paxos", "last_committed", highest_gv);
+  tx.put("paxos", "accepted_pn", highest_accepted_pn);
+  tx.put("paxos", "last_pn", highest_last_pn);
+  tx.put("paxos", "conversion_first", last_gv);
+  db->apply_transaction(tx);
+
+  dout(10) << __func__ << " finished" << dendl;
+}
+
+void Monitor::StoreConverter::_convert_machines()
+{
+  dout(10) << __func__ << dendl;
+  set<string> machine_names = _get_machines_names();
+  set<string>::iterator it = machine_names.begin();
+
+  for (; it != machine_names.end(); ++it) {
+    _convert_machines(*it);
+  }
+  // convert osdmap full versions
+  // this stays here as these aren't really an independent paxos
+  // machine, but rather machine-specific and don't fit on the
+  // _convert_machines(string) function.
+  _convert_osdmap_full();
+
+  dout(10) << __func__ << " finished" << dendl;
+}
diff --git a/src/rgw/rgw_acl_s3.cc b/src/rgw/rgw_acl_s3.cc
index b47c14df6e82d..1954317c4aaaf 100644
--- a/src/rgw/rgw_acl_s3.cc
+++ b/src/rgw/rgw_acl_s3.cc
@@ -537,7 +537,7 @@ bool RGWAccessControlPolicy_S3::compare_group_name(string& id, ACLGroupTypeEnum
 {
   switch (group) {
   case ACL_GROUP_ALL_USERS:
-    return (id.compare(rgw_uri_all_users) == 0);
+    return (id.compare(RGW_USER_ANON_ID) == 0);
   case ACL_GROUP_AUTHENTICATED_USERS:
     return (id.compare(rgw_uri_auth_users) == 0);
   default:
diff --git a/src/rgw/rgw_cors.cc b/src/rgw/rgw_cors.cc
index a120a6866455e..6391a77a007aa 100644
--- a/src/rgw/rgw_cors.cc
+++ b/src/rgw/rgw_cors.cc
@@ -134,11 +134,13 @@ bool RGWCORSRule::is_header_allowed(const char *h, size_t len) {
 void RGWCORSRule::format_exp_headers(string& s) {
   s = "";
-  for(list<string>::iterator it = exposable_hdrs.begin();
-      it != exposable_hdrs.end(); ++it) {
-    if (s.length() > 0)
-      s.append(",");
-    s.append((*it));
+  for (const auto& header : exposable_hdrs) {
+    if (s.length() > 0)
+      s.append(",");
+    // these values are sent to clients in a 'Access-Control-Expose-Headers'
+    // response header, so we escape '\n' and '\r' to avoid header injection
+    std::string tmp = boost::replace_all_copy(header, "\n", "\\n");
+    boost::replace_all_copy(std::back_inserter(s), tmp, "\r", "\\r");
   }
 }
diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc
index 76947480bb4f1..0c8624955e48b 100644
--- a/src/rgw/rgw_op.cc
+++ b/src/rgw/rgw_op.cc
@@ -15,6 +15,7 @@
 #include "rgw_rest.h"
 #include "rgw_acl.h"
 #include "rgw_acl_s3.h"
+#include "rgw_acl_swift.h"
 #include "rgw_user.h"
 #include "rgw_bucket.h"
 #include "rgw_log.h"
@@ -322,7 +323,13 @@ static int rgw_build_policies(RGWRados *store, struct req_state *s, bool only_bu
 
   s->bucket_instance_id = s->info.args.get(RGW_SYS_PARAM_PREFIX "bucket-instance");
 
-  s->bucket_acl = new RGWAccessControlPolicy(s->cct);
+  if(s->dialect.compare("s3") == 0) {
+    s->bucket_acl = new RGWAccessControlPolicy_S3(s->cct);
+  } else if(s->dialect.compare("swift") == 0) {
+    s->bucket_acl = new RGWAccessControlPolicy_SWIFT(s->cct);
+  } else {
+    s->bucket_acl = new RGWAccessControlPolicy(s->cct);
+  }
   if (s->copy_source) {
     /* check if copy source is within the current domain */
     const char *src = s->copy_source;
code */ + RGWAccessControlPolicy bucket_policy(s->cct); + string no_object; + rgw_obj no_obj(bucket, no_object); + ret = get_policy_from_attr(s->cct, store, s->obj_ctx, bucket_info, bucket_attrs, &bucket_policy, no_obj); + if (ret < 0) + return ret; + string& owner = bucket_policy.get_owner().get_id(); + if (!s->system_request && owner.compare(s->user.user_id) != 0 && + !bucket_policy.verify_permission(s->user.user_id, s->perm_mask, RGW_PERM_READ)) + ret = -EACCES; + else + ret = -ENOENT; + + } else if (ret == -ENOENT) { + ret = -ERR_NO_SUCH_BUCKET; + } + + return ret; +} + +/** + * Get the AccessControlPolicy for a bucket or object off of disk. + * s: The req_state to draw information from. + * only_bucket: If true, reads the bucket ACL rather than the object ACL. + * Returns: 0 on success, -ERR# otherwise. + */ +static int rgw_build_policies(RGWRados *store, struct req_state *s, bool only_bucket, bool prefetch_data) +{ + int ret = 0; + string obj_str; + RGWUserInfo bucket_owner_info; + + s->bucket_instance_id = s->info.args.get(RGW_SYS_PARAM_PREFIX "bucket-instance"); + + s->bucket_acl = new RGWAccessControlPolicy(s->cct); + + if (s->copy_source) { /* check if copy source is within the current domain */ + const char *src = s->copy_source; + if (*src == '/') + ++src; + string copy_source_str(src); + + int pos = copy_source_str.find('/'); + if (pos > 0) + copy_source_str = copy_source_str.substr(0, pos); + + RGWBucketInfo source_info; + + ret = store->get_bucket_info(s->obj_ctx, copy_source_str, source_info, NULL); + if (ret == 0) { + string& region = source_info.region; + s->local_source = store->region.equals(region); + } + } + + if (!s->bucket_name_str.empty()) { + s->bucket_exists = true; + if (s->bucket_instance_id.empty()) { + ret = store->get_bucket_info(s->obj_ctx, s->bucket_name_str, s->bucket_info, NULL, &s->bucket_attrs); + } else { + ret = store->get_bucket_instance_info(s->obj_ctx, s->bucket_instance_id, s->bucket_info, NULL, &s->bucket_attrs); + } + if (ret < 0) { + if (ret != -ENOENT) { + ldout(s->cct, 0) << "NOTICE: couldn't get bucket from bucket_name (name=" << s->bucket_name_str << ")" << dendl; + return ret; + } + s->bucket_exists = false; + } + s->bucket = s->bucket_info.bucket; + + if (s->bucket_exists) { + string no_obj; + ret = read_policy(store, s, s->bucket_info, s->bucket_attrs, s->bucket_acl, s->bucket, no_obj); + } else { + s->bucket_acl->create_default(s->user.user_id, s->user.display_name); + ret = -ERR_NO_SUCH_BUCKET; + } + + s->bucket_owner = s->bucket_acl->get_owner(); + + string& region = s->bucket_info.region; + if (s->bucket_exists && !store->region.equals(region)) { + ldout(s->cct, 0) << "NOTICE: request for data in a different region (" << region << " != " << store->region.name << ")" << dendl; + /* we now need to make sure that the operation actually requires copy source, that is + * it's a copy operation + */ + if (store->region.is_master && s->op == OP_DELETE && s->system_request) { + /*If the operation is delete and if this is the master, don't redirect*/ + } else if (!s->local_source || + (s->op != OP_PUT && s->op != OP_COPY) || + s->object_str.empty()) { + return -ERR_PERMANENT_REDIRECT; + } + } + } + + /* we're passed only_bucket = true when we specifically need the bucket's + acls, that happens on write operations */ + if (!only_bucket && !s->object_str.empty()) { + if (!s->bucket_exists) { + return -ERR_NO_SUCH_BUCKET; + } + s->object_acl = new RGWAccessControlPolicy(s->cct); + + obj_str = s->object_str; + rgw_obj obj(s->bucket, obj_str); + 
store->set_atomic(s->obj_ctx, obj); + if (prefetch_data) { + store->set_prefetch_data(s->obj_ctx, obj); + } + ret = read_policy(store, s, s->bucket_info, s->bucket_attrs, s->object_acl, s->bucket, obj_str); + } + + return ret; +} + +static void rgw_bucket_object_pre_exec(struct req_state *s) +{ + if (s->expect_cont) + dump_continue(s); + + dump_bucket_from_state(s); +} + +int RGWGetObj::verify_permission() +{ + obj.init(s->bucket, s->object_str); + store->set_atomic(s->obj_ctx, obj); + store->set_prefetch_data(s->obj_ctx, obj); + + if (!verify_object_permission(s, RGW_PERM_READ)) + return -EACCES; + + return 0; +} + + +int RGWOp::verify_op_mask() +{ + uint32_t required_mask = op_mask(); + + ldout(s->cct, 20) << "required_mask= " << required_mask << " user.op_mask=" << s->user.op_mask << dendl; + + if ((s->user.op_mask & required_mask) != required_mask) { + return -EPERM; + } + + if (!s->system_request && (required_mask & RGW_OP_TYPE_MODIFY) && !store->zone.is_master) { + ldout(s->cct, 5) << "NOTICE: modify request to a non-master zone by a non-system user, permission denied" << dendl; + return -EPERM; + } + + return 0; +} + +int RGWOp::init_quota() +{ + /* no quota enforcement for system requests */ + if (s->system_request) + return 0; + + /* init quota related stuff */ + if (!(s->user.op_mask & RGW_OP_TYPE_MODIFY)) { + return 0; + } + + /* only interested in object related ops */ + if (s->object_str.empty()) { + return 0; + } + + RGWUserInfo owner_info; + RGWUserInfo *uinfo; + + if (s->user.user_id == s->bucket_owner.get_id()) { + uinfo = &s->user; + } else { + int r = rgw_get_user_info_by_uid(store, s->bucket_info.owner, owner_info); + if (r < 0) + return r; + uinfo = &owner_info; + } + + if (s->bucket_info.quota.enabled) { + bucket_quota = s->bucket_info.quota; + } else if (uinfo->bucket_quota.enabled) { + bucket_quota = uinfo->bucket_quota; + } else { + bucket_quota = store->region_map.bucket_quota; + } + + if (uinfo->user_quota.enabled) { + user_quota = uinfo->user_quota; + } else { + user_quota = store->region_map.user_quota; + } + + return 0; +} + +static bool validate_cors_rule_method(RGWCORSRule *rule, const char *req_meth) { + uint8_t flags = 0; + if (strcmp(req_meth, "GET") == 0) flags = RGW_CORS_GET; + else if (strcmp(req_meth, "POST") == 0) flags = RGW_CORS_POST; + else if (strcmp(req_meth, "PUT") == 0) flags = RGW_CORS_PUT; + else if (strcmp(req_meth, "DELETE") == 0) flags = RGW_CORS_DELETE; + else if (strcmp(req_meth, "HEAD") == 0) flags = RGW_CORS_HEAD; + + if ((rule->get_allowed_methods() & flags) == flags) { + dout(10) << "Method " << req_meth << " is supported" << dendl; + } else { + dout(5) << "Method " << req_meth << " is not supported" << dendl; + return false; + } + + return true; +} + +int RGWOp::read_bucket_cors() +{ + bufferlist bl; + + map::iterator aiter = s->bucket_attrs.find(RGW_ATTR_CORS); + if (aiter == s->bucket_attrs.end()) { + ldout(s->cct, 20) << "no CORS configuration attr found" << dendl; + cors_exist = false; + return 0; /* no CORS configuration found */ + } + + cors_exist = true; + + bl = aiter->second; + + bufferlist::iterator iter = bl.begin(); + try { + bucket_cors.decode(iter); + } catch (buffer::error& err) { + ldout(s->cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl; + return -EIO; + } + if (s->cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15)) { + RGWCORSConfiguration_S3 *s3cors = static_cast(&bucket_cors); + ldout(s->cct, 15) << "Read RGWCORSConfiguration"; + s3cors->to_xml(*_dout); + *_dout << dendl; + } + 
return 0; +} + +/** CORS 6.2.6. + * If any of the header field-names is not a ASCII case-insensitive match for + * any of the values in list of headers do not set any additional headers and + * terminate this set of steps. + * */ +static void get_cors_response_headers(RGWCORSRule *rule, const char *req_hdrs, string& hdrs, string& exp_hdrs, unsigned *max_age) { + if (req_hdrs) { + list hl; + get_str_list(req_hdrs, hl); + for(list::iterator it = hl.begin(); it != hl.end(); ++it) { + if (!rule->is_header_allowed((*it).c_str(), (*it).length())) { + dout(5) << "Header " << (*it) << " is not registered in this rule" << dendl; + } else { + if (hdrs.length() > 0) hdrs.append(","); + hdrs.append((*it)); + } + } + } + rule->format_exp_headers(exp_hdrs); + *max_age = rule->get_max_age(); +} + +/** + * Generate the CORS header response + * + * This is described in the CORS standard, section 6.2. + */ +bool RGWOp::generate_cors_headers(string& origin, string& method, string& headers, string& exp_headers, unsigned *max_age) +{ + /* CORS 6.2.1. */ + const char *orig = s->info.env->get("HTTP_ORIGIN"); + if (!orig) { + return false; + } + + /* Custom: */ + origin = orig; + int ret = read_bucket_cors(); + if (ret < 0) { + return false; + } + + if (!cors_exist) { + dout(2) << "No CORS configuration set yet for this bucket" << dendl; + return false; + } + + /* CORS 6.2.2. */ + RGWCORSRule *rule = bucket_cors.host_name_rule(orig); + if (!rule) + return false; + + /* CORS 6.2.3. */ + const char *req_meth = s->info.env->get("HTTP_ACCESS_CONTROL_REQUEST_METHOD"); + if (!req_meth) { + req_meth = s->info.method; + } + + if (req_meth) + method = req_meth; + /* CORS 6.2.5. */ + if (!validate_cors_rule_method(rule, req_meth)) { + return false; + } + + /* CORS 6.2.4. */ + const char *req_hdrs = s->info.env->get("HTTP_ACCESS_CONTROL_REQUEST_HEADERS"); + + /* CORS 6.2.6. */ + get_cors_response_headers(rule, req_hdrs, headers, exp_headers, max_age); + + return true; +} + +int RGWGetObj::read_user_manifest_part(rgw_bucket& bucket, RGWObjEnt& ent, RGWAccessControlPolicy *bucket_policy, off_t start_ofs, off_t end_ofs) +{ + ldout(s->cct, 0) << "user manifest obj=" << ent.name << dendl; + + void *handle = NULL; + off_t cur_ofs = start_ofs; + off_t cur_end = end_ofs; + utime_t start_time = s->time; + + rgw_obj part(bucket, ent.name); + + map attrs; + + uint64_t obj_size; + void *obj_ctx = store->create_context(s); + RGWAccessControlPolicy obj_policy(s->cct); + + ldout(s->cct, 20) << "reading obj=" << part << " ofs=" << cur_ofs << " end=" << cur_end << dendl; + + store->set_atomic(obj_ctx, part); + store->set_prefetch_data(obj_ctx, part); + ret = store->prepare_get_obj(obj_ctx, part, &cur_ofs, &cur_end, &attrs, NULL, + NULL, NULL, NULL, NULL, NULL, &obj_size, NULL, &handle, &s->err); + if (ret < 0) + goto done_err; + + if (obj_size != ent.size) { + // hmm.. something wrong, object not as expected, abort! 
+ ldout(s->cct, 0) << "ERROR: expected obj_size=" << obj_size << ", actual read size=" << ent.size << dendl; + ret = -EIO; + goto done_err; + } + + ret = rgw_policy_from_attrset(s->cct, attrs, &obj_policy); + if (ret < 0) + goto done_err; + + if (!verify_object_permission(s, bucket_policy, &obj_policy, RGW_PERM_READ)) { + ret = -EPERM; + goto done_err; + } + + perfcounter->inc(l_rgw_get_b, cur_end - cur_ofs); + while (cur_ofs <= cur_end) { + bufferlist bl; + ret = store->get_obj(obj_ctx, NULL, &handle, part, bl, cur_ofs, cur_end); + if (ret < 0) + goto done_err; + + off_t len = bl.length(); + cur_ofs += len; + ofs += len; + ret = 0; + perfcounter->tinc(l_rgw_get_lat, + (ceph_clock_now(s->cct) - start_time)); + send_response_data(bl, 0, len); + + start_time = ceph_clock_now(s->cct); + } + + store->destroy_context(obj_ctx); + obj_ctx = NULL; + + store->finish_get_obj(&handle); + + return 0; + +done_err: + if (obj_ctx) + store->destroy_context(obj_ctx); + return ret; +} + +static int iterate_user_manifest_parts(CephContext *cct, RGWRados *store, off_t ofs, off_t end, + rgw_bucket& bucket, string& obj_prefix, RGWAccessControlPolicy *bucket_policy, + uint64_t *ptotal_len, + int (*cb)(rgw_bucket& bucket, RGWObjEnt& ent, RGWAccessControlPolicy *bucket_policy, + off_t start_ofs, off_t end_ofs, void *param), void *cb_param) +{ + uint64_t obj_ofs = 0, len_count = 0; + bool found_start = false, found_end = false; + string delim; + string marker; + bool is_truncated; + string no_ns; + map common_prefixes; + vector objs; + + utime_t start_time = ceph_clock_now(cct); + + do { +#define MAX_LIST_OBJS 100 + int r = store->list_objects(bucket, MAX_LIST_OBJS, obj_prefix, delim, marker, NULL, + objs, common_prefixes, + true, no_ns, true, &is_truncated, NULL); + if (r < 0) + return r; + + vector::iterator viter; + + for (viter = objs.begin(); viter != objs.end() && !found_end; ++viter) { + RGWObjEnt& ent = *viter; + uint64_t cur_total_len = obj_ofs; + uint64_t start_ofs = 0, end_ofs = ent.size; + + if (!found_start && cur_total_len + ent.size > (uint64_t)ofs) { + start_ofs = ofs - obj_ofs; + found_start = true; + } + + obj_ofs += ent.size; + + if (!found_end && obj_ofs > (uint64_t)end) { + end_ofs = end - cur_total_len + 1; + found_end = true; + } + + perfcounter->tinc(l_rgw_get_lat, + (ceph_clock_now(cct) - start_time)); + + if (found_start) { + len_count += end_ofs - start_ofs; + + if (cb) { + r = cb(bucket, ent, bucket_policy, start_ofs, end_ofs, cb_param); + if (r < 0) + return r; + } + } + marker = ent.name; + + start_time = ceph_clock_now(cct); + } + } while (is_truncated && !found_end); + + if (ptotal_len) + *ptotal_len = len_count; + + return 0; +} + +static int get_obj_user_manifest_iterate_cb(rgw_bucket& bucket, RGWObjEnt& ent, RGWAccessControlPolicy *bucket_policy, off_t start_ofs, off_t end_ofs, + void *param) +{ + RGWGetObj *op = (RGWGetObj *)param; + return op->read_user_manifest_part(bucket, ent, bucket_policy, start_ofs, end_ofs); +} + +int RGWGetObj::handle_user_manifest(const char *prefix) +{ + ldout(s->cct, 2) << "RGWGetObj::handle_user_manifest() prefix=" << prefix << dendl; + + string prefix_str = prefix; + int pos = prefix_str.find('/'); + if (pos < 0) + return -EINVAL; + + string bucket_name = prefix_str.substr(0, pos); + string obj_prefix = prefix_str.substr(pos + 1); + + rgw_bucket bucket; + + RGWAccessControlPolicy _bucket_policy(s->cct); + RGWAccessControlPolicy *bucket_policy; + + if (bucket_name.compare(s->bucket.name) != 0) { + RGWBucketInfo bucket_info; + map bucket_attrs; + int 
r = store->get_bucket_info(NULL, bucket_name, bucket_info, NULL, &bucket_attrs); + if (r < 0) { + ldout(s->cct, 0) << "could not get bucket info for bucket=" << bucket_name << dendl; + return r; + } + bucket = bucket_info.bucket; + string no_obj; + bucket_policy = &_bucket_policy; + r = read_policy(store, s, bucket_info, bucket_attrs, bucket_policy, bucket, no_obj); + if (r < 0) { + ldout(s->cct, 0) << "failed to read bucket policy" << dendl; + return r; + } + } else { + bucket = s->bucket; + bucket_policy = s->bucket_acl; + } + + /* dry run to find out total length */ + int r = iterate_user_manifest_parts(s->cct, store, ofs, end, bucket, obj_prefix, bucket_policy, &total_len, NULL, NULL); + if (r < 0) + return r; + + s->obj_size = total_len; + + r = iterate_user_manifest_parts(s->cct, store, ofs, end, bucket, obj_prefix, bucket_policy, NULL, get_obj_user_manifest_iterate_cb, (void *)this); + if (r < 0) + return r; + + return 0; +} + +class RGWGetObj_CB : public RGWGetDataCB +{ + RGWGetObj *op; +public: + RGWGetObj_CB(RGWGetObj *_op) : op(_op) {} + virtual ~RGWGetObj_CB() {} + + int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) { + return op->get_data_cb(bl, bl_ofs, bl_len); + } +}; + +int RGWGetObj::get_data_cb(bufferlist& bl, off_t bl_ofs, off_t bl_len) +{ + /* garbage collection related handling */ + utime_t start_time = ceph_clock_now(s->cct); + if (start_time > gc_invalidate_time) { + int r = store->defer_gc(s->obj_ctx, obj); + if (r < 0) { + dout(0) << "WARNING: could not defer gc entry for obj" << dendl; + } + gc_invalidate_time = start_time; + gc_invalidate_time += (s->cct->_conf->rgw_gc_obj_min_wait / 2); + } + return send_response_data(bl, bl_ofs, bl_len); +} + +void RGWGetObj::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWGetObj::execute() +{ + void *handle = NULL; + utime_t start_time = s->time; + bufferlist bl; + gc_invalidate_time = ceph_clock_now(s->cct); + gc_invalidate_time += (s->cct->_conf->rgw_gc_obj_min_wait / 2); + + RGWGetObj_CB cb(this); + + map<string, bufferlist>::iterator attr_iter; + + perfcounter->inc(l_rgw_get); + off_t new_ofs, new_end; + + ret = get_params(); + if (ret < 0) + goto done_err; + + ret = init_common(); + if (ret < 0) + goto done_err; + + new_ofs = ofs; + new_end = end; + + ret = store->prepare_get_obj(s->obj_ctx, obj, &new_ofs, &new_end, &attrs, mod_ptr, + unmod_ptr, &lastmod, if_match, if_nomatch, &total_len, &s->obj_size, NULL, &handle, &s->err); + if (ret < 0) + goto done_err; + + attr_iter = attrs.find(RGW_ATTR_USER_MANIFEST); + if (attr_iter != attrs.end()) { + ret = handle_user_manifest(attr_iter->second.c_str()); + if (ret < 0) { + ldout(s->cct, 0) << "ERROR: failed to handle user manifest ret=" << ret << dendl; + } + return; + } + + ofs = new_ofs; + end = new_end; + + start = ofs; + + if (!get_data || ofs > end) + goto done_err; + + perfcounter->inc(l_rgw_get_b, end - ofs); + + ret = store->get_obj_iterate(s->obj_ctx, &handle, obj, ofs, end, &cb); + + perfcounter->tinc(l_rgw_get_lat, + (ceph_clock_now(s->cct) - start_time)); + if (ret < 0) { + goto done_err; + } + + store->finish_get_obj(&handle); + +done_err: + send_response_data(bl, 0, 0); + store->finish_get_obj(&handle); +} + +int RGWGetObj::init_common() +{ + if (range_str) { + int r = parse_range(range_str, ofs, end, &partial_content); + if (r < 0) + return r; + } + if (if_mod) { + if (parse_time(if_mod, &mod_time) < 0) + return -EINVAL; + mod_ptr = &mod_time; + } + + if (if_unmod) { + if (parse_time(if_unmod, &unmod_time) < 0) + return -EINVAL; + unmod_ptr = &unmod_time; + } + + return 0; +}
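init_common() above delegates the Range header to parse_range() (defined earlier in this file). The convention it relies on is easy to miss: a suffix range is encoded as a negative offset with end = -1 and only resolved against the object size later. A simplified re-implementation for illustration only — it skips the whitespace and case tolerance of the real parser, and is not the function above:

#include <cassert>
#include <cstdlib>
#include <string>

// Mirrors parse_range()'s output convention: "bytes=-N" (last N bytes)
// comes back as ofs = -N, end = -1; "bytes=N-" as ofs = N, end = -1.
static bool parse_simple_range(const std::string& spec, long long& ofs, long long& end)
{
  if (spec.compare(0, 6, "bytes=") != 0)
    return false;
  std::string s = spec.substr(6);
  std::string::size_type dash = s.find('-');
  if (dash == std::string::npos)
    return false;
  std::string ofs_str = s.substr(0, dash);
  std::string end_str = s.substr(dash + 1);
  end = end_str.empty() ? -1 : std::atoll(end_str.c_str());
  if (!ofs_str.empty()) {
    ofs = std::atoll(ofs_str.c_str());
  } else {          // RFC 2616 suffix-byte-range-spec
    ofs = -end;
    end = -1;
  }
  return !(end >= 0 && end < ofs);  // reject inverted ranges like bytes=9-2
}

int main()
{
  long long ofs, end;
  assert(parse_simple_range("bytes=0-499", ofs, end) && ofs == 0 && end == 499);
  assert(parse_simple_range("bytes=500-", ofs, end) && ofs == 500 && end == -1);
  assert(parse_simple_range("bytes=-500", ofs, end) && ofs == -500 && end == -1);
  assert(!parse_simple_range("bytes=9-2", ofs, end));
  return 0;
}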
+ +int RGWListBuckets::verify_permission() +{ + return 0; +} + +void RGWListBuckets::execute() +{ + bool done; + bool started = false; + uint64_t total_count = 0; + + uint64_t max_buckets = s->cct->_conf->rgw_list_buckets_max_chunk; + + ret = get_params(); + if (ret < 0) + goto send_end; + + do { + RGWUserBuckets buckets; + uint64_t read_count; + if (limit > 0) + read_count = min(limit - total_count, (uint64_t)max_buckets); + else + read_count = max_buckets; + + ret = rgw_read_user_buckets(store, s->user.user_id, buckets, + marker, read_count, should_get_stats()); + + if (!started) { + send_response_begin(buckets.count() > 0); + started = true; + } + + if (ret < 0) { + /* hmm.. something wrong here.. the user was authenticated, so it + should exist */ + ldout(s->cct, 10) << "WARNING: failed on rgw_get_user_buckets uid=" << s->user.user_id << dendl; + break; + } + map<string, RGWBucketEnt>& m = buckets.get_buckets(); + + total_count += m.size(); + + done = (m.size() < read_count || (limit > 0 && total_count == limit)); + + if (!m.empty()) { + send_response_data(buckets); + + map<string, RGWBucketEnt>::reverse_iterator riter = m.rbegin(); + marker = riter->first; + } + } while (!done); + +send_end: + if (!started) { + send_response_begin(false); + } + send_response_end(); +} + +int RGWStatAccount::verify_permission() +{ + return 0; +} + +void RGWStatAccount::execute() +{ + string marker; + bool done; + uint64_t max_buckets = s->cct->_conf->rgw_list_buckets_max_chunk; + + do { + RGWUserBuckets buckets; + + ret = rgw_read_user_buckets(store, s->user.user_id, buckets, marker, max_buckets, true); + if (ret < 0) { + /* hmm.. something wrong here.. the user was authenticated, so it + should exist */ + ldout(s->cct, 10) << "WARNING: failed on rgw_get_user_buckets uid=" << s->user.user_id << dendl; + break; + } else { + map<string, RGWBucketEnt>& m = buckets.get_buckets(); + map<string, RGWBucketEnt>::iterator iter; + for (iter = m.begin(); iter != m.end(); ++iter) { + RGWBucketEnt& bucket = iter->second; + buckets_size += bucket.size; + buckets_size_rounded += bucket.size_rounded; + buckets_objcount += bucket.count; + + marker = iter->first; + } + buckets_count += m.size(); + + done = (m.size() < max_buckets); + } + } while (!done); +} + +int RGWStatBucket::verify_permission() +{ + if (!verify_bucket_permission(s, RGW_PERM_READ)) + return -EACCES; + + return 0; +} + +void RGWStatBucket::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWStatBucket::execute() +{ + RGWUserBuckets buckets; + bucket.bucket = s->bucket; + buckets.add(bucket); + map<string, RGWBucketEnt>& m = buckets.get_buckets(); + ret = store->update_containers_stats(m); + if (!ret) + ret = -EEXIST; + if (ret > 0) { + ret = 0; + map<string, RGWBucketEnt>::iterator iter = m.find(bucket.bucket.name); + if (iter != m.end()) { + bucket = iter->second; + } else { + ret = -EINVAL; + } + } +} + +int RGWListBucket::verify_permission() +{ + if (!verify_bucket_permission(s, RGW_PERM_READ)) + return -EACCES; + + return 0; +} + +int RGWListBucket::parse_max_keys() +{ + if (!max_keys.empty()) { + char *endptr; + max = strtol(max_keys.c_str(), &endptr, 10); + if (endptr) { + while (*endptr && isspace(*endptr)) // ignore white space + endptr++; + if (*endptr) { + return -EINVAL; + } + } + } else { + max = default_max; + } + + return 0; +} + +void RGWListBucket::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWListBucket::execute() +{ + string no_ns; + + ret = get_params(); + if (ret < 0) + return; + + string *pnext_marker = (delimiter.empty() ?
NULL : &next_marker); + + ret = store->list_objects(s->bucket, max, prefix, delimiter, marker, pnext_marker, objs, common_prefixes, + !!(s->prot_flags & RGW_REST_SWIFT), no_ns, true, &is_truncated, NULL); +} + +int RGWGetBucketLogging::verify_permission() +{ + if (s->user.user_id.compare(s->bucket_owner.get_id()) != 0) + return -EACCES; + + return 0; +} + +int RGWCreateBucket::verify_permission() +{ + if (!rgw_user_is_authenticated(s->user)) + return -EACCES; + + if (s->user.max_buckets) { + RGWUserBuckets buckets; + string marker; + int ret = rgw_read_user_buckets(store, s->user.user_id, buckets, marker, s->user.max_buckets, false); + if (ret < 0) + return ret; + + map& m = buckets.get_buckets(); + if (m.size() >= s->user.max_buckets) { + return -ERR_TOO_MANY_BUCKETS; + } + } + + return 0; +} + +static int forward_request_to_master(struct req_state *s, obj_version *objv, RGWRados *store, bufferlist& in_data, JSONParser *jp) +{ + if (!store->rest_master_conn) { + ldout(s->cct, 0) << "rest connection is invalid" << dendl; + return -EINVAL; + } + ldout(s->cct, 0) << "sending create_bucket request to master region" << dendl; + bufferlist response; +#define MAX_REST_RESPONSE (128 * 1024) // we expect a very small response + int ret = store->rest_master_conn->forward(s->user.user_id, s->info, objv, MAX_REST_RESPONSE, &in_data, &response); + if (ret < 0) + return ret; + + ldout(s->cct, 20) << "response: " << response.c_str() << dendl; + ret = jp->parse(response.c_str(), response.length()); + if (ret < 0) { + ldout(s->cct, 0) << "failed parsing response from master region" << dendl; + return ret; + } + + return 0; +} + +void RGWCreateBucket::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWCreateBucket::execute() +{ + RGWAccessControlPolicy old_policy(s->cct); + map attrs; + bufferlist aclbl; + bufferlist corsbl; + bool existed; + int r; + rgw_obj obj(store->zone.domain_root, s->bucket_name_str); + obj_version objv, *pobjv = NULL; + + ret = get_params(); + if (ret < 0) + return; + + if (!store->region.is_master && + store->region.api_name != location_constraint) { + ldout(s->cct, 0) << "location constraint (" << location_constraint << ") doesn't match region" << " (" << store->region.api_name << ")" << dendl; + ret = -EINVAL; + return; + } + + /* we need to make sure we read bucket info, it's not read before for this specific request */ + ret = store->get_bucket_info(s->obj_ctx, s->bucket_name_str, s->bucket_info, NULL, &s->bucket_attrs); + if (ret < 0 && ret != -ENOENT) + return; + s->bucket_exists = (ret != -ENOENT); + + s->bucket_owner.set_id(s->user.user_id); + s->bucket_owner.set_name(s->user.display_name); + if (s->bucket_exists) { + r = get_policy_from_attr(s->cct, store, s->obj_ctx, s->bucket_info, s->bucket_attrs, + &old_policy, obj); + if (r >= 0) { + if (old_policy.get_owner().get_id().compare(s->user.user_id) != 0) { + ret = -EEXIST; + return; + } + } + } + + RGWBucketInfo master_info; + rgw_bucket *pmaster_bucket; + time_t creation_time; + + if (!store->region.is_master) { + JSONParser jp; + ret = forward_request_to_master(s, NULL, store, in_data, &jp); + if (ret < 0) + return; + + JSONDecoder::decode_json("entry_point_object_ver", ep_objv, &jp); + JSONDecoder::decode_json("object_ver", objv, &jp); + JSONDecoder::decode_json("bucket_info", master_info, &jp); + ldout(s->cct, 20) << "parsed: objv.tag=" << objv.tag << " objv.ver=" << objv.ver << dendl; + ldout(s->cct, 20) << "got creation time: << " << master_info.creation_time << dendl; + pmaster_bucket= 
&master_info.bucket; + creation_time = master_info.creation_time; + pobjv = &objv; + } else { + pmaster_bucket = NULL; + creation_time = 0; + } + + string region_name; + + if (s->system_request) { + region_name = s->info.args.get(RGW_SYS_PARAM_PREFIX "region"); + if (region_name.empty()) { + region_name = store->region.name; + } + } else { + region_name = store->region.name; + } + + policy.encode(aclbl); + + attrs[RGW_ATTR_ACL] = aclbl; + + if (has_cors) { + cors_config.encode(corsbl); + attrs[RGW_ATTR_CORS] = corsbl; + } + s->bucket.name = s->bucket_name_str; + ret = store->create_bucket(s->user, s->bucket, region_name, placement_rule, attrs, info, pobjv, + &ep_objv, creation_time, pmaster_bucket, true); + /* continue if EEXIST and create_bucket will fail below. this way we can recover + * from a partial create by retrying it. */ + ldout(s->cct, 20) << "rgw_create_bucket returned ret=" << ret << " bucket=" << s->bucket << dendl; + + if (ret && ret != -EEXIST) + return; + + existed = (ret == -EEXIST); + + if (existed) { + /* bucket already existed, might have raced with another bucket creation, or + * might be partial bucket creation that never completed. Read existing bucket + * info, verify that the reported bucket owner is the current user. + * If all is ok then update the user's list of buckets + */ + if (info.owner.compare(s->user.user_id) != 0) { + ret = -ERR_BUCKET_EXISTS; + return; + } + s->bucket = info.bucket; + } + + ret = rgw_link_bucket(store, s->user.user_id, s->bucket, info.creation_time, false); + if (ret && !existed && ret != -EEXIST) { /* if it exists (or previously existed), don't remove it! */ + ret = rgw_unlink_bucket(store, s->user.user_id, s->bucket.name); + if (ret < 0) { + ldout(s->cct, 0) << "WARNING: failed to unlink bucket: ret=" << ret << dendl; + } + } + + if (ret == -EEXIST) + ret = -ERR_BUCKET_EXISTS; +} + +int RGWDeleteBucket::verify_permission() +{ + if (!verify_bucket_permission(s, RGW_PERM_WRITE)) + return -EACCES; + + return 0; +} + +void RGWDeleteBucket::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWDeleteBucket::execute() +{ + ret = -EINVAL; + + if (s->bucket_name_str.empty()) + return; + + RGWObjVersionTracker ot; + ot.read_version = s->bucket_info.ep_objv; + + if (s->system_request) { + string tag = s->info.args.get(RGW_SYS_PARAM_PREFIX "tag"); + string ver_str = s->info.args.get(RGW_SYS_PARAM_PREFIX "ver"); + if (!tag.empty()) { + ot.read_version.tag = tag; + uint64_t ver; + string err; + ver = strict_strtol(ver_str.c_str(), 10, &err); + if (!err.empty()) { + ldout(s->cct, 0) << "failed to parse ver param" << dendl; + ret = -EINVAL; + return; + } + ot.read_version.ver = ver; + } + } + + ret = store->delete_bucket(s->bucket, ot); + + if (ret == 0) { + ret = rgw_unlink_bucket(store, s->user.user_id, s->bucket.name, false); + if (ret < 0) { + ldout(s->cct, 0) << "WARNING: failed to unlink bucket: ret=" << ret << dendl; + } + } + + if (ret < 0) { + return; + } + + if (!store->region.is_master) { + bufferlist in_data; + JSONParser jp; + ret = forward_request_to_master(s, &ot.read_version, store, in_data, &jp); + if (ret < 0) { + if (ret == -ENOENT) { /* adjust error, + we want to return with NoSuchBucket and not NoSuchKey */ + ret = -ERR_NO_SUCH_BUCKET; + } + return; + } + } + +} + +int RGWPutObj::verify_permission() +{ + if (!verify_bucket_permission(s, RGW_PERM_WRITE)) + return -EACCES; + + return 0; +} + +class RGWPutObjProcessor_Multipart : public RGWPutObjProcessor_Atomic +{ + string part_num; + RGWMPObj mp; + req_state *s; + string 
upload_id; + +protected: + int prepare(RGWRados *store, void *obj_ctx, string *oid_rand); + int do_complete(string& etag, time_t *mtime, time_t set_mtime, map<string, bufferlist>& attrs); + +public: + bool immutable_head() { return true; } + RGWPutObjProcessor_Multipart(const string& bucket_owner, uint64_t _p, req_state *_s) : + RGWPutObjProcessor_Atomic(bucket_owner, _s->bucket, _s->object_str, _p, _s->req_id), s(_s) {} +}; + +int RGWPutObjProcessor_Multipart::prepare(RGWRados *store, void *obj_ctx, string *oid_rand) +{ + int r = prepare_init(store, obj_ctx, NULL); + if (r < 0) { + return r; + } + + string oid = obj_str; + upload_id = s->info.args.get("uploadId"); + if (!oid_rand) { + mp.init(oid, upload_id); + } else { + mp.init(oid, upload_id, *oid_rand); + } + + part_num = s->info.args.get("partNumber"); + if (part_num.empty()) { + ldout(s->cct, 10) << "part number is empty" << dendl; + return -EINVAL; + } + + string err; + uint64_t num = (uint64_t)strict_strtol(part_num.c_str(), 10, &err); + + if (!err.empty()) { + ldout(s->cct, 10) << "bad part number: " << part_num << ": " << err << dendl; + return -EINVAL; + } + + string upload_prefix = oid + "."; + + if (!oid_rand) { + upload_prefix.append(upload_id); + } else { + upload_prefix.append(*oid_rand); + } + + rgw_obj target_obj; + target_obj.init(bucket, oid); + + manifest.set_prefix(upload_prefix); + + manifest.set_multipart_part_rule(store->ctx()->_conf->rgw_obj_stripe_size, num); + + r = manifest_gen.create_begin(store->ctx(), &manifest, bucket, target_obj); + if (r < 0) { + return r; + } + + head_obj = manifest_gen.get_cur_obj(); + cur_obj = head_obj; + add_obj(cur_obj); + + return 0; +} + +static bool is_v2_upload_id(const string& upload_id) +{ + const char *uid = upload_id.c_str(); + + return (strncmp(uid, MULTIPART_UPLOAD_ID_PREFIX, sizeof(MULTIPART_UPLOAD_ID_PREFIX) - 1) == 0); +} + +int RGWPutObjProcessor_Multipart::do_complete(string& etag, time_t *mtime, time_t set_mtime, map<string, bufferlist>& attrs) +{ + complete_writing_data(); + + RGWRados::PutObjMetaExtraParams params; + params.set_mtime = set_mtime; + params.mtime = mtime; + params.owner = s->owner.get_id(); + + int r = store->put_obj_meta(obj_ctx, head_obj, s->obj_size, attrs, RGW_OBJ_CATEGORY_MAIN, 0, params); + if (r < 0) + return r; + + bufferlist bl; + RGWUploadPartInfo info; + string p = "part."; + bool sorted_omap = is_v2_upload_id(upload_id); + + if (sorted_omap) { + string err; + int part_num_int = strict_strtol(part_num.c_str(), 10, &err); + if (!err.empty()) { + dout(10) << "bad part number specified: " << part_num << dendl; + return -EINVAL; + } + char buf[32]; + snprintf(buf, sizeof(buf), "%08d", part_num_int); + p.append(buf); + } else { + p.append(part_num); + } + info.num = atoi(part_num.c_str()); + info.etag = etag; + info.size = s->obj_size; + info.modified = ceph_clock_now(store->ctx()); + info.manifest = manifest; + ::encode(info, bl); + + string multipart_meta_obj = mp.get_meta(); + + rgw_obj meta_obj; + meta_obj.init_ns(bucket, multipart_meta_obj, mp_ns); + meta_obj.set_in_extra_data(true); + + r = store->omap_set(meta_obj, p, bl); + + return r; +}
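The "%08d" zero-padding in do_complete() is what makes a v2 upload id yield a "sorted" omap: omap keys compare as strings, so without the fixed width "part.10" would sort before "part.2". A small sketch of the point (illustration only; std::map stands in for the lexicographic ordering of omap keys):

#include <cstdio>
#include <iostream>
#include <map>
#include <string>

int main()
{
  std::map<std::string, int> parts;  // string-ordered, like omap keys
  const int nums[] = {1, 2, 10};
  for (size_t i = 0; i < sizeof(nums) / sizeof(nums[0]); ++i) {
    char buf[32];
    std::snprintf(buf, sizeof(buf), "part.%08d", nums[i]);  // fixed-width key
    parts[buf] = nums[i];
  }
  // Iterates in numeric order because the keys are fixed-width:
  // part.00000001, part.00000002, part.00000010
  for (std::map<std::string, int>::iterator it = parts.begin(); it != parts.end(); ++it)
    std::cout << it->first << std::endl;
  return 0;
}

list_multipart_parts() below exploits this: for sorted (v2) uploads it can page through parts with omap_get_vals() from a computed marker instead of fetching the whole map with omap_get_all().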
+ + +RGWPutObjProcessor *RGWPutObj::select_processor(bool *is_multipart) +{ + RGWPutObjProcessor *processor; + + bool multipart = s->info.args.exists("uploadId"); + + uint64_t part_size = s->cct->_conf->rgw_obj_stripe_size; + + const string& bucket_owner = s->bucket_owner.get_id(); + + if (!multipart) { + processor = new RGWPutObjProcessor_Atomic(bucket_owner, s->bucket, s->object_str, part_size, s->req_id); + } else { + processor = new RGWPutObjProcessor_Multipart(bucket_owner, part_size, s); + } + + if (is_multipart) { + *is_multipart = multipart; + } + + return processor; +} + +void RGWPutObj::dispose_processor(RGWPutObjProcessor *processor) +{ + delete processor; +} + +void RGWPutObj::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +static int put_obj_user_manifest_iterate_cb(rgw_bucket& bucket, RGWObjEnt& ent, RGWAccessControlPolicy *bucket_policy, off_t start_ofs, off_t end_ofs, + void *param) +{ + RGWPutObj *op = (RGWPutObj *)param; + return op->user_manifest_iterate_cb(bucket, ent, bucket_policy, start_ofs, end_ofs); +} + +int RGWPutObj::user_manifest_iterate_cb(rgw_bucket& bucket, RGWObjEnt& ent, RGWAccessControlPolicy *bucket_policy, off_t start_ofs, off_t end_ofs) +{ + rgw_obj part(bucket, ent.name); + + map<string, bufferlist> attrs; + + int ret = get_obj_attrs(store, s, part, attrs, NULL, NULL); + if (ret < 0) { + return ret; + } + map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_ETAG); + if (iter == attrs.end()) { + return 0; + } + bufferlist& bl = iter->second; + const char *buf = bl.c_str(); + int len = bl.length(); + while (len > 0 && buf[len - 1] == '\0') { + len--; + } + if (len > 0) { + user_manifest_parts_hash->Update((const byte *)bl.c_str(), len); + } + + if (s->cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) { + string e(bl.c_str(), bl.length()); + ldout(s->cct, 20) << __func__ << ": appending user manifest etag: " << e << dendl; + } + + return 0; +} + +static int put_data_and_throttle(RGWPutObjProcessor *processor, bufferlist& data, off_t ofs, + MD5 *hash, bool need_to_wait) +{ + const unsigned char *data_ptr = (hash ? (const unsigned char *)data.c_str() : NULL); + bool again; + uint64_t len = data.length(); + + do { + void *handle; + + int ret = processor->handle_data(data, ofs, &handle, &again); + if (ret < 0) + return ret; + + if (hash) { + hash->Update(data_ptr, len); + hash = NULL; /* only calculate hash once */ + } + + ret = processor->throttle_data(handle, need_to_wait); + if (ret < 0) + return ret; + + need_to_wait = false; /* the need to wait only applies to the first iteration */ + } while (again); + + return 0; +} + + +void RGWPutObj::execute() +{ + RGWPutObjProcessor *processor = NULL; + char supplied_md5_bin[CEPH_CRYPTO_MD5_DIGESTSIZE + 1]; + char supplied_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; + char calc_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; + unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE]; + MD5 hash; + bufferlist bl, aclbl; + map<string, bufferlist> attrs; + int len; + map<string, bufferlist>::iterator iter; + bool multipart; + + bool need_calc_md5 = (obj_manifest == NULL); + + + perfcounter->inc(l_rgw_put); + ret = -EINVAL; + if (!s->object) { + goto done; + } + + ret = get_params(); + if (ret < 0) + goto done; + + if (supplied_md5_b64) { + need_calc_md5 = true; + + ldout(s->cct, 15) << "supplied_md5_b64=" << supplied_md5_b64 << dendl; + ret = ceph_unarmor(supplied_md5_bin, &supplied_md5_bin[CEPH_CRYPTO_MD5_DIGESTSIZE + 1], + supplied_md5_b64, supplied_md5_b64 + strlen(supplied_md5_b64)); + ldout(s->cct, 15) << "ceph_armor ret=" << ret << dendl; + if (ret != CEPH_CRYPTO_MD5_DIGESTSIZE) { + ret = -ERR_INVALID_DIGEST; + goto done; + } + + buf_to_hex((const unsigned char *)supplied_md5_bin, CEPH_CRYPTO_MD5_DIGESTSIZE, supplied_md5); + ldout(s->cct, 15) << "supplied_md5=" << supplied_md5 << dendl; + } + + if (!chunked_upload) { /* with chunked upload we don't know how big the upload is.
+ we also check sizes at the end anyway */ + ret = store->check_quota(s->bucket_owner.get_id(), s->bucket, + user_quota, bucket_quota, s->content_length); + if (ret < 0) { + goto done; + } + } + + if (supplied_etag) { + strncpy(supplied_md5, supplied_etag, sizeof(supplied_md5) - 1); + supplied_md5[sizeof(supplied_md5) - 1] = '\0'; + } + + processor = select_processor(&multipart); + + ret = processor->prepare(store, s->obj_ctx, NULL); + if (ret < 0) + goto done; + + do { + bufferlist data; + len = get_data(data); + if (len < 0) { + ret = len; + goto done; + } + if (!len) + break; + + /* do we need this operation to be synchronous? if we're dealing with an object with immutable + * head, e.g., multipart object we need to make sure we're the first one writing to this object + */ + bool need_to_wait = (ofs == 0) && multipart; + + ret = put_data_and_throttle(processor, data, ofs, (need_calc_md5 ? &hash : NULL), need_to_wait); + if (ret < 0) { + if (!need_to_wait || ret != -EEXIST) { + ldout(s->cct, 20) << "processor->thottle_data() returned ret=" << ret << dendl; + goto done; + } + + ldout(s->cct, 5) << "NOTICE: processor->throttle_data() returned -EEXIST, need to restart write" << dendl; + + /* restart processing with different oid suffix */ + + dispose_processor(processor); + processor = select_processor(&multipart); + + string oid_rand; + char buf[33]; + gen_rand_alphanumeric(store->ctx(), buf, sizeof(buf) - 1); + oid_rand.append(buf); + + ret = processor->prepare(store, s->obj_ctx, &oid_rand); + if (ret < 0) { + ldout(s->cct, 0) << "ERROR: processor->prepare() returned " << ret << dendl; + goto done; + } + + ret = put_data_and_throttle(processor, data, ofs, NULL, false); + if (ret < 0) { + goto done; + } + } + + ofs += len; + } while (len > 0); + + if (!chunked_upload && (uint64_t)ofs != s->content_length) { + ret = -ERR_REQUEST_TIMEOUT; + goto done; + } + s->obj_size = ofs; + perfcounter->inc(l_rgw_put_b, s->obj_size); + + ret = store->check_quota(s->bucket_owner.get_id(), s->bucket, + user_quota, bucket_quota, s->obj_size); + if (ret < 0) { + goto done; + } + + if (need_calc_md5) { + hash.Final(m); + + buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5); + etag = calc_md5; + + if (supplied_md5_b64 && strcmp(calc_md5, supplied_md5)) { + ret = -ERR_BAD_DIGEST; + goto done; + } + } + + policy.encode(aclbl); + + attrs[RGW_ATTR_ACL] = aclbl; + if (obj_manifest) { + bufferlist manifest_bl; + string manifest_obj_prefix; + string manifest_bucket; + RGWBucketInfo bucket_info; + + char etag_buf[CEPH_CRYPTO_MD5_DIGESTSIZE]; + char etag_buf_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16]; + + manifest_bl.append(obj_manifest, strlen(obj_manifest) + 1); + attrs[RGW_ATTR_USER_MANIFEST] = manifest_bl; + user_manifest_parts_hash = &hash; + string prefix_str = obj_manifest; + int pos = prefix_str.find('/'); + if (pos < 0) { + ldout(s->cct, 0) << "bad user manifest, missing slash separator: " << obj_manifest << dendl; + goto done; + } + + manifest_bucket = prefix_str.substr(0, pos); + manifest_obj_prefix = prefix_str.substr(pos + 1); + + ret = store->get_bucket_info(NULL, manifest_bucket, bucket_info, NULL, NULL); + if (ret < 0) { + ldout(s->cct, 0) << "could not get bucket info for bucket=" << manifest_bucket << dendl; + } + ret = iterate_user_manifest_parts(s->cct, store, 0, -1, bucket_info.bucket, manifest_obj_prefix, + NULL, NULL, put_obj_user_manifest_iterate_cb, (void *)this); + if (ret < 0) { + goto done; + } + + hash.Final((byte *)etag_buf); + buf_to_hex((const unsigned char *)etag_buf, 
CEPH_CRYPTO_MD5_DIGESTSIZE, etag_buf_str); + + ldout(s->cct, 0) << __func__ << ": calculated md5 for user manifest: " << etag_buf_str << dendl; + + etag = etag_buf_str; + } + if (supplied_etag && etag.compare(supplied_etag) != 0) { + ret = -ERR_UNPROCESSABLE_ENTITY; + goto done; + } + bl.append(etag.c_str(), etag.size() + 1); + attrs[RGW_ATTR_ETAG] = bl; + + for (iter = s->generic_attrs.begin(); iter != s->generic_attrs.end(); ++iter) { + bufferlist& attrbl = attrs[iter->first]; + const string& val = iter->second; + attrbl.append(val.c_str(), val.size() + 1); + } + + rgw_get_request_metadata(s->cct, s->info, attrs); + + ret = processor->complete(etag, &mtime, 0, attrs); +done: + dispose_processor(processor); + perfcounter->tinc(l_rgw_put_lat, + (ceph_clock_now(s->cct) - s->time)); +} + +int RGWPostObj::verify_permission() +{ + return 0; +} + +RGWPutObjProcessor *RGWPostObj::select_processor() +{ + RGWPutObjProcessor *processor; + + uint64_t part_size = s->cct->_conf->rgw_obj_stripe_size; + + processor = new RGWPutObjProcessor_Atomic(s->bucket_owner.get_id(), s->bucket, s->object_str, part_size, s->req_id); + + return processor; +} + +void RGWPostObj::dispose_processor(RGWPutObjProcessor *processor) +{ + delete processor; +} + +void RGWPostObj::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWPostObj::execute() +{ + RGWPutObjProcessor *processor = NULL; + char calc_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; + unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE]; + MD5 hash; + bufferlist bl, aclbl; + int len = 0; + + // read in the data from the POST form + ret = get_params(); + if (ret < 0) + goto done; + + ret = verify_params(); + if (ret < 0) + goto done; + + if (!verify_bucket_permission(s, RGW_PERM_WRITE)) { + ret = -EACCES; + goto done; + } + + processor = select_processor(); + + ret = processor->prepare(store, s->obj_ctx, NULL); + if (ret < 0) + goto done; + + while (data_pending) { + bufferlist data; + len = get_data(data); + + if (len < 0) { + ret = len; + goto done; + } + + if (!len) + break; + + ret = put_data_and_throttle(processor, data, ofs, &hash, false); + + ofs += len; + + if (ofs > max_len) { + ret = -ERR_TOO_LARGE; + goto done; + } + } + + if (len < min_len) { + ret = -ERR_TOO_SMALL; + goto done; + } + + s->obj_size = ofs; + + hash.Final(m); + buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5); + + policy.encode(aclbl); + etag = calc_md5; + + bl.append(etag.c_str(), etag.size() + 1); + attrs[RGW_ATTR_ETAG] = bl; + attrs[RGW_ATTR_ACL] = aclbl; + + if (content_type.size()) { + bufferlist ct_bl; + ct_bl.append(content_type.c_str(), content_type.size() + 1); + attrs[RGW_ATTR_CONTENT_TYPE] = ct_bl; + } + + ret = processor->complete(etag, NULL, 0, attrs); + +done: + dispose_processor(processor); +} + + +int RGWPutMetadata::verify_permission() +{ + if (s->object) { + if (!verify_object_permission(s, RGW_PERM_WRITE)) + return -EACCES; + } else { + if (!verify_bucket_permission(s, RGW_PERM_WRITE)) + return -EACCES; + } + + return 0; +} + +void RGWPutMetadata::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWPutMetadata::execute() +{ + const char *meta_prefix = RGW_ATTR_META_PREFIX; + int meta_prefix_len = sizeof(RGW_ATTR_META_PREFIX) - 1; + map attrs, orig_attrs, rmattrs; + map::iterator iter; + bufferlist bl, cors_bl; + + rgw_obj obj(s->bucket, s->object_str); + + store->set_atomic(s->obj_ctx, obj); + + ret = get_params(); + if (ret < 0) + return; + + rgw_get_request_metadata(s->cct, s->info, attrs); + + /* no need to track object versioning, need it for bucket's 
data only */ + RGWObjVersionTracker *ptracker = (s->object ? NULL : &s->bucket_info.objv_tracker); + + /* check if obj exists, read orig attrs */ + ret = get_obj_attrs(store, s, obj, orig_attrs, NULL, ptracker); + if (ret < 0) + return; + + /* only remove meta attrs */ + for (iter = orig_attrs.begin(); iter != orig_attrs.end(); ++iter) { + const string& name = iter->first; + if (name.compare(0, meta_prefix_len, meta_prefix) == 0) { + rmattrs[name] = iter->second; + } else if (attrs.find(name) == attrs.end()) { + attrs[name] = iter->second; + } + } + + map::iterator giter; + for (giter = s->generic_attrs.begin(); giter != s->generic_attrs.end(); ++giter) { + bufferlist& attrbl = attrs[giter->first]; + const string& val = giter->second; + attrbl.append(val.c_str(), val.size() + 1); + } + + if (has_policy) { + policy.encode(bl); + attrs[RGW_ATTR_ACL] = bl; + } + if (has_cors) { + cors_config.encode(cors_bl); + attrs[RGW_ATTR_CORS] = cors_bl; + } + if (s->object) { + ret = store->set_attrs(s->obj_ctx, obj, attrs, &rmattrs, ptracker); + } else { + ret = rgw_bucket_set_attrs(store, s->bucket_info, attrs, &rmattrs, ptracker); + } +} + +int RGWSetTempUrl::verify_permission() +{ + if (s->perm_mask != RGW_PERM_FULL_CONTROL) + return -EACCES; + + return 0; +} + +void RGWSetTempUrl::execute() +{ + ret = get_params(); + if (ret < 0) + return; + + RGWUserAdminOpState user_op; + user_op.set_user_id(s->user.user_id); + map::iterator iter; + for (iter = temp_url_keys.begin(); iter != temp_url_keys.end(); ++iter) { + user_op.set_temp_url_key(iter->second, iter->first); + } + + RGWUser user; + ret = user.init(store, user_op); + if (ret < 0) { + ldout(store->ctx(), 0) << "ERROR: could not init user ret=" << ret << dendl; + return; + } + string err_msg; + ret = user.modify(user_op, &err_msg); + if (ret < 0) { + ldout(store->ctx(), 10) << "user.modify() returned " << ret << ": " << err_msg << dendl; + return; + } +} + + +int RGWDeleteObj::verify_permission() +{ + if (!verify_bucket_permission(s, RGW_PERM_WRITE)) + return -EACCES; + + return 0; +} + +void RGWDeleteObj::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWDeleteObj::execute() +{ + ret = -EINVAL; + rgw_obj obj(s->bucket, s->object_str); + if (s->object) { + store->set_atomic(s->obj_ctx, obj); + ret = store->delete_obj(s->obj_ctx, s->bucket_owner.get_id(), obj); + } +} + +bool RGWCopyObj::parse_copy_location(const char *src, string& bucket_name, string& object) +{ + string url_src(src); + string dec_src; + + url_decode(url_src, dec_src); + src = dec_src.c_str(); + + if (*src == '/') ++src; + + string str(src); + + int pos = str.find("/"); + if (pos <= 0) + return false; + + bucket_name = str.substr(0, pos); + object = str.substr(pos + 1); + + if (object.size() == 0) + return false; + + return true; +} + +int RGWCopyObj::verify_permission() +{ + string empty_str; + RGWAccessControlPolicy src_policy(s->cct); + ret = get_params(); + if (ret < 0) + return ret; + + map src_attrs; + + ret = store->get_bucket_info(s->obj_ctx, src_bucket_name, src_bucket_info, NULL, &src_attrs); + if (ret < 0) + return ret; + + src_bucket = src_bucket_info.bucket; + + /* get buckets info (source and dest) */ + if (s->local_source && source_zone.empty()) { + rgw_obj src_obj(src_bucket, src_object); + store->set_atomic(s->obj_ctx, src_obj); + store->set_prefetch_data(s->obj_ctx, src_obj); + + /* check source object permissions */ + ret = read_policy(store, s, src_bucket_info, src_attrs, &src_policy, src_bucket, src_object); + if (ret < 0) + return ret; + + if 
(!s->system_request && /* system request overrides permission checks */ + !src_policy.verify_permission(s->user.user_id, s->perm_mask, RGW_PERM_READ)) + return -EACCES; + } + + RGWAccessControlPolicy dest_bucket_policy(s->cct); + map dest_attrs; + + if (src_bucket_name.compare(dest_bucket_name) == 0) { /* will only happen if s->local_source */ + dest_bucket_info = src_bucket_info; + } else { + ret = store->get_bucket_info(s->obj_ctx, dest_bucket_name, dest_bucket_info, NULL, &dest_attrs); + if (ret < 0) + return ret; + } + + dest_bucket = dest_bucket_info.bucket; + + rgw_obj dest_obj(dest_bucket, dest_object); + store->set_atomic(s->obj_ctx, dest_obj); + + /* check dest bucket permissions */ + ret = read_policy(store, s, dest_bucket_info, dest_attrs, &dest_bucket_policy, dest_bucket, empty_str); + if (ret < 0) + return ret; + + if (!s->system_request && /* system request overrides permission checks */ + !dest_bucket_policy.verify_permission(s->user.user_id, s->perm_mask, RGW_PERM_WRITE)) + return -EACCES; + + ret = init_dest_policy(); + if (ret < 0) + return ret; + + return 0; +} + + +int RGWCopyObj::init_common() +{ + if (if_mod) { + if (parse_time(if_mod, &mod_time) < 0) { + ret = -EINVAL; + return ret; + } + mod_ptr = &mod_time; + } + + if (if_unmod) { + if (parse_time(if_unmod, &unmod_time) < 0) { + ret = -EINVAL; + return ret; + } + unmod_ptr = &unmod_time; + } + + bufferlist aclbl; + dest_policy.encode(aclbl); + + attrs[RGW_ATTR_ACL] = aclbl; + rgw_get_request_metadata(s->cct, s->info, attrs); + + map::iterator iter; + for (iter = s->generic_attrs.begin(); iter != s->generic_attrs.end(); ++iter) { + bufferlist& attrbl = attrs[iter->first]; + const string& val = iter->second; + attrbl.append(val.c_str(), val.size() + 1); + } + return 0; +} + +static void copy_obj_progress_cb(off_t ofs, void *param) +{ + RGWCopyObj *op = static_cast(param); + op->progress_cb(ofs); +} + +void RGWCopyObj::progress_cb(off_t ofs) +{ + if (!s->cct->_conf->rgw_copy_obj_progress) + return; + + if (ofs - last_ofs < s->cct->_conf->rgw_copy_obj_progress_every_bytes) + return; + + send_partial_response(ofs); + + last_ofs = ofs; +} + +void RGWCopyObj::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWCopyObj::execute() +{ + rgw_obj src_obj, dst_obj; + + if (init_common() < 0) + return; + + src_obj.init(src_bucket, src_object); + dst_obj.init(dest_bucket, dest_object); + store->set_atomic(s->obj_ctx, src_obj); + + store->set_atomic(s->obj_ctx, dst_obj); + + ret = store->copy_obj(s->obj_ctx, + s->user.user_id, + client_id, + op_id, + &s->info, + source_zone, + dst_obj, + src_obj, + dest_bucket_info, + src_bucket_info, + &mtime, + mod_ptr, + unmod_ptr, + if_match, + if_nomatch, + replace_attrs, + attrs, RGW_OBJ_CATEGORY_MAIN, + &s->req_id, /* use req_id as tag */ + &s->err, + copy_obj_progress_cb, (void *)this + ); +} + +int RGWGetACLs::verify_permission() +{ + bool perm; + if (s->object) { + perm = verify_object_permission(s, RGW_PERM_READ_ACP); + } else { + perm = verify_bucket_permission(s, RGW_PERM_READ_ACP); + } + if (!perm) + return -EACCES; + + return 0; +} + +void RGWGetACLs::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWGetACLs::execute() +{ + stringstream ss; + RGWAccessControlPolicy *acl = (s->object ? 
s->object_acl : s->bucket_acl); + RGWAccessControlPolicy_S3 *s3policy = static_cast(acl); + s3policy->to_xml(ss); + acls = ss.str(); +} + + + +int RGWPutACLs::verify_permission() +{ + bool perm; + if (s->object) { + perm = verify_object_permission(s, RGW_PERM_WRITE_ACP); + } else { + perm = verify_bucket_permission(s, RGW_PERM_WRITE_ACP); + } + if (!perm) + return -EACCES; + + return 0; +} + +void RGWPutACLs::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWPutACLs::execute() +{ + bufferlist bl; + + RGWAccessControlPolicy_S3 *policy = NULL; + RGWACLXMLParser_S3 parser(s->cct); + RGWAccessControlPolicy_S3 new_policy(s->cct); + stringstream ss; + char *new_data = NULL; + ACLOwner owner; + rgw_obj obj; + + ret = 0; + + if (!parser.init()) { + ret = -EINVAL; + return; + } + + owner.set_id(s->user.user_id); + owner.set_name(s->user.display_name); + + ret = get_params(); + if (ret < 0) + return; + + ldout(s->cct, 15) << "read len=" << len << " data=" << (data ? data : "") << dendl; + + if (!s->canned_acl.empty() && len) { + ret = -EINVAL; + return; + } + + if (!s->canned_acl.empty() || s->has_acl_header) { + ret = get_policy_from_state(store, s, ss); + if (ret < 0) + return; + + new_data = strdup(ss.str().c_str()); + free(data); + data = new_data; + len = ss.str().size(); + } + + if (!parser.parse(data, len, 1)) { + ret = -EACCES; + return; + } + policy = static_cast(parser.find_first("AccessControlPolicy")); + if (!policy) { + ret = -EINVAL; + return; + } + + if (s->cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15)) { + ldout(s->cct, 15) << "Old AccessControlPolicy"; + policy->to_xml(*_dout); + *_dout << dendl; + } + + ret = policy->rebuild(store, &owner, new_policy); + if (ret < 0) + return; + + if (s->cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15)) { + ldout(s->cct, 15) << "New AccessControlPolicy:"; + new_policy.to_xml(*_dout); + *_dout << dendl; + } + + RGWObjVersionTracker *ptracker = (s->object ? NULL : &s->bucket_info.objv_tracker); + + new_policy.encode(bl); + obj.init(s->bucket, s->object_str); + map attrs; + attrs[RGW_ATTR_ACL] = bl; + store->set_atomic(s->obj_ctx, obj); + if (s->object) { + ret = store->set_attrs(s->obj_ctx, obj, attrs, NULL, ptracker); + } else { + ret = rgw_bucket_set_attrs(store, s->bucket_info, attrs, NULL, ptracker); + } +} + +int RGWGetCORS::verify_permission() +{ + if (s->user.user_id.compare(s->bucket_owner.get_id()) != 0) + return -EACCES; + + return 0; +} + +void RGWGetCORS::execute() +{ + ret = read_bucket_cors(); + if (ret < 0) + return ; + + if (!cors_exist) { + dout(2) << "No CORS configuration set yet for this bucket" << dendl; + ret = -ENOENT; + return; + } +} + +int RGWPutCORS::verify_permission() +{ + if (s->user.user_id.compare(s->bucket_owner.get_id()) != 0) + return -EACCES; + + return 0; +} + +void RGWPutCORS::execute() +{ + rgw_obj obj; + + ret = get_params(); + if (ret < 0) + return; + + RGWObjVersionTracker *ptracker = (s->object ? 
NULL : &s->bucket_info.objv_tracker); + + store->get_bucket_instance_obj(s->bucket, obj); + store->set_atomic(s->obj_ctx, obj); + ret = store->set_attr(s->obj_ctx, obj, RGW_ATTR_CORS, cors_bl, ptracker); +} + +int RGWDeleteCORS::verify_permission() +{ + if (s->user.user_id.compare(s->bucket_owner.get_id()) != 0) + return -EACCES; + + return 0; +} + +void RGWDeleteCORS::execute() +{ + ret = read_bucket_cors(); + if (ret < 0) + return; + + bufferlist bl; + rgw_obj obj; + if (!cors_exist) { + dout(2) << "No CORS configuration set yet for this bucket" << dendl; + ret = -ENOENT; + return; + } + store->get_bucket_instance_obj(s->bucket, obj); + store->set_atomic(s->obj_ctx, obj); + map orig_attrs, attrs, rmattrs; + map::iterator iter; + + RGWObjVersionTracker *ptracker = (s->object ? NULL : &s->bucket_info.objv_tracker); + + /* check if obj exists, read orig attrs */ + ret = get_obj_attrs(store, s, obj, orig_attrs, NULL, ptracker); + if (ret < 0) + return; + + /* only remove meta attrs */ + for (iter = orig_attrs.begin(); iter != orig_attrs.end(); ++iter) { + const string& name = iter->first; + dout(10) << "DeleteCORS : attr: " << name << dendl; + if (name.compare(0, (sizeof(RGW_ATTR_CORS) - 1), RGW_ATTR_CORS) == 0) { + rmattrs[name] = iter->second; + } else if (attrs.find(name) == attrs.end()) { + attrs[name] = iter->second; + } + } + ret = store->set_attrs(s->obj_ctx, obj, attrs, &rmattrs, ptracker); +} + +void RGWOptionsCORS::get_response_params(string& hdrs, string& exp_hdrs, unsigned *max_age) { + get_cors_response_headers(rule, req_hdrs, hdrs, exp_hdrs, max_age); +} + +int RGWOptionsCORS::validate_cors_request(RGWCORSConfiguration *cc) { + rule = cc->host_name_rule(origin); + if (!rule) { + dout(10) << "There is no cors rule present for " << origin << dendl; + return -ENOENT; + } + + if (!validate_cors_rule_method(rule, req_meth)) { + return -ENOENT; + } + return 0; +} + +void RGWOptionsCORS::execute() +{ + ret = read_bucket_cors(); + if (ret < 0) + return; + + origin = s->info.env->get("HTTP_ORIGIN"); + if (!origin) { + dout(0) << + "Preflight request without mandatory Origin header" + << dendl; + ret = -EINVAL; + return; + } + req_meth = s->info.env->get("HTTP_ACCESS_CONTROL_REQUEST_METHOD"); + if (!req_meth) { + dout(0) << + "Preflight request without mandatory Access-control-request-method header" + << dendl; + ret = -EINVAL; + return; + } + if (!cors_exist) { + dout(2) << "No CORS configuration set yet for this bucket" << dendl; + ret = -ENOENT; + return; + } + req_hdrs = s->info.env->get("HTTP_ACCESS_CONTROL_REQUEST_HEADERS"); + ret = validate_cors_request(&bucket_cors); + if (!rule) { + origin = req_meth = NULL; + return; + } + return; +} + +int RGWInitMultipart::verify_permission() +{ + if (!verify_bucket_permission(s, RGW_PERM_WRITE)) + return -EACCES; + + return 0; +} + +void RGWInitMultipart::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWInitMultipart::execute() +{ + bufferlist aclbl; + map attrs; + rgw_obj obj; + map::iterator iter; + + if (get_params() < 0) + return; + ret = -EINVAL; + if (!s->object) + return; + + policy.encode(aclbl); + + attrs[RGW_ATTR_ACL] = aclbl; + + for (iter = s->generic_attrs.begin(); iter != s->generic_attrs.end(); ++iter) { + bufferlist& attrbl = attrs[iter->first]; + const string& val = iter->second; + attrbl.append(val.c_str(), val.size() + 1); + } + + rgw_get_request_metadata(s->cct, s->info, attrs); + + do { + char buf[33]; + gen_rand_alphanumeric(s->cct, buf, sizeof(buf) - 1); + upload_id = "2/"; /* v2 upload id */ + 
+
+void RGWInitMultipart::execute()
+{
+  bufferlist aclbl;
+  map<string, bufferlist> attrs;
+  rgw_obj obj;
+  map<string, string>::iterator iter;
+
+  if (get_params() < 0)
+    return;
+  ret = -EINVAL;
+  if (!s->object)
+    return;
+
+  policy.encode(aclbl);
+
+  attrs[RGW_ATTR_ACL] = aclbl;
+
+  for (iter = s->generic_attrs.begin(); iter != s->generic_attrs.end(); ++iter) {
+    bufferlist& attrbl = attrs[iter->first];
+    const string& val = iter->second;
+    attrbl.append(val.c_str(), val.size() + 1);
+  }
+
+  rgw_get_request_metadata(s->cct, s->info, attrs);
+
+  do {
+    char buf[33];
+    gen_rand_alphanumeric(s->cct, buf, sizeof(buf) - 1);
+    upload_id = "2/"; /* v2 upload id */
+    upload_id.append(buf);
+
+    string tmp_obj_name;
+    RGWMPObj mp(s->object_str, upload_id);
+    tmp_obj_name = mp.get_meta();
+
+    obj.init_ns(s->bucket, tmp_obj_name, mp_ns);
+    // the meta object will be indexed with 0 size, we c
+    obj.set_in_extra_data(true);
+    ret = store->put_obj_meta(s->obj_ctx, obj, 0, NULL, attrs, RGW_OBJ_CATEGORY_MULTIMETA, PUT_OBJ_CREATE_EXCL, s->owner.get_id());
+  } while (ret == -EEXIST);
+}
+
+static int get_multipart_info(RGWRados *store, struct req_state *s, string& meta_oid,
+                              RGWAccessControlPolicy *policy, map<string, bufferlist>& attrs)
+{
+  map<string, bufferlist> parts_map;
+  map<string, bufferlist>::iterator iter;
+  bufferlist header;
+
+  rgw_obj obj;
+  obj.init_ns(s->bucket, meta_oid, mp_ns);
+  obj.set_in_extra_data(true);
+
+  int ret = get_obj_attrs(store, s, obj, attrs, NULL, NULL);
+  if (ret < 0)
+    return ret;
+
+  if (policy) {
+    for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
+      string name = iter->first;
+      if (name.compare(RGW_ATTR_ACL) == 0) {
+        bufferlist& bl = iter->second;
+        bufferlist::iterator bli = bl.begin();
+        try {
+          ::decode(*policy, bli);
+        } catch (buffer::error& err) {
+          ldout(s->cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
+          return -EIO;
+        }
+        break;
+      }
+    }
+  }
+
+  return 0;
+}
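The do/while loop in RGWInitMultipart::execute() keeps generating ids until the exclusive create (PUT_OBJ_CREATE_EXCL) of the meta object succeeds, so an id collision simply retries. The "2/" prefix is what is_v2_upload_id() keys on later: it marks uploads whose parts are stored under sortable omap keys. A self-contained sketch of id generation under that convention (std::mt19937 here is purely illustrative; the real code uses gen_rand_alphanumeric):

    #include <random>
    #include <string>

    // Sketch (not the rgw implementation): a v2-style multipart upload id
    // is "2/" followed by a random alphanumeric string; the create-exclusive
    // retry loop above handles the (unlikely) collisions.
    std::string make_v2_upload_id(size_t len = 32)
    {
      static const char cs[] =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
      std::mt19937 gen{std::random_device{}()};
      std::uniform_int_distribution<size_t> pick(0, sizeof(cs) - 2);
      std::string id = "2/";
      for (size_t i = 0; i < len; ++i)
        id += cs[pick(gen)];
      return id;
    }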
+
+static int list_multipart_parts(RGWRados *store, struct req_state *s,
+                                const string& upload_id,
+                                string& meta_oid, int num_parts,
+                                int marker, map<uint32_t, RGWUploadPartInfo>& parts,
+                                int *next_marker, bool *truncated,
+                                bool assume_unsorted = false)
+{
+  map<string, bufferlist> parts_map;
+  map<string, bufferlist>::iterator iter;
+  bufferlist header;
+
+  rgw_obj obj;
+  obj.init_ns(s->bucket, meta_oid, mp_ns);
+  obj.set_in_extra_data(true);
+
+  bool sorted_omap = is_v2_upload_id(upload_id) && !assume_unsorted;
+
+  int ret;
+
+  parts.clear();
+
+  if (sorted_omap) {
+    string p;
+    p = "part.";
+    char buf[32];
+
+    snprintf(buf, sizeof(buf), "%08d", marker);
+    p.append(buf);
+
+    ret = store->omap_get_vals(obj, header, p, num_parts + 1, parts_map);
+  } else {
+    ret = store->omap_get_all(obj, header, parts_map);
+  }
+  if (ret < 0)
+    return ret;
+
+  int i;
+  int last_num = 0;
+
+  uint32_t expected_next = marker + 1;
+
+  for (i = 0, iter = parts_map.begin(); (i < num_parts || !sorted_omap) && iter != parts_map.end(); ++iter, ++i) {
+    bufferlist& bl = iter->second;
+    bufferlist::iterator bli = bl.begin();
+    RGWUploadPartInfo info;
+    try {
+      ::decode(info, bli);
+    } catch (buffer::error& err) {
+      ldout(s->cct, 0) << "ERROR: could not decode part info, caught buffer::error" << dendl;
+      return -EIO;
+    }
+    if (sorted_omap) {
+      if (info.num != expected_next) {
+        /* ouch, we expected a specific part num here, but we got a different one. Either
+         * a part is missing, or it could be a case of mixed rgw versions working on the same
+         * upload, where one gateway doesn't support correctly sorted omap keys for multipart
+         * upload just assume data is unsorted.
+         */
+        return list_multipart_parts(store, s, upload_id, meta_oid, num_parts, marker, parts, next_marker, truncated, true);
+      }
+      expected_next++;
+    }
+    if (sorted_omap ||
+        (int)info.num > marker) {
+      parts[info.num] = info;
+      last_num = info.num;
+    }
+  }
+
+  if (sorted_omap) {
+    if (truncated)
+      *truncated = (iter != parts_map.end());
+  } else {
+    /* rebuild a map with only num_parts entries */
+
+    map<uint32_t, RGWUploadPartInfo> new_parts;
+    map<uint32_t, RGWUploadPartInfo>::iterator piter;
+
+    for (i = 0, piter = parts.begin(); i < num_parts && piter != parts.end(); ++i, ++piter) {
+      new_parts[piter->first] = piter->second;
+      last_num = piter->first;
+    }
+
+    if (truncated)
+      *truncated = (piter != parts.end());
+
+    parts.swap(new_parts);
+  }
+
+  if (next_marker) {
+    *next_marker = last_num;
+  }
+
+  return 0;
+}
+
+int RGWCompleteMultipart::verify_permission()
+{
+  if (!verify_bucket_permission(s, RGW_PERM_WRITE))
+    return -EACCES;
+
+  return 0;
+}
+
+void RGWCompleteMultipart::pre_exec()
+{
+  rgw_bucket_object_pre_exec(s);
+}
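list_multipart_parts() above relies on the v2 key convention: each part is recorded under "part." plus an eight-digit, zero-padded part number, so plain lexicographic omap order is numeric part order and a listing can resume from a marker without scanning everything. When a decoded entry breaks the expected sequence (mixed gateway versions writing unsorted keys), the function re-invokes itself with assume_unsorted=true and falls back to the full-scan path. A sketch of the key construction (hypothetical helper, mirroring the snprintf above):

    #include <cstdio>
    #include <string>

    // Hypothetical helper mirroring the marker handling above: the key for
    // part N is "part." + N zero-padded to 8 digits, so "part.00000002"
    // sorts after "part.00000001" and resume-from-marker listings work.
    std::string part_omap_key(int part_num)
    {
      char buf[32];
      std::snprintf(buf, sizeof(buf), "part.%08d", part_num);
      return std::string(buf);
    }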
+
+void RGWCompleteMultipart::execute()
+{
+  RGWMultiCompleteUpload *parts;
+  map<int, string>::iterator iter;
+  RGWMultiXMLParser parser;
+  string meta_oid;
+  map<uint32_t, RGWUploadPartInfo> obj_parts;
+  map<uint32_t, RGWUploadPartInfo>::iterator obj_iter;
+  map<string, bufferlist> attrs;
+  off_t ofs = 0;
+  MD5 hash;
+  char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE];
+  char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16];
+  bufferlist etag_bl;
+  rgw_obj meta_obj;
+  rgw_obj target_obj;
+  RGWMPObj mp;
+  RGWObjManifest manifest;
+
+  ret = get_params();
+  if (ret < 0)
+    return;
+
+  if (!data) {
+    ret = -EINVAL;
+    return;
+  }
+
+  if (!parser.init()) {
+    ret = -EINVAL;
+    return;
+  }
+
+  if (!parser.parse(data, len, 1)) {
+    ret = -EINVAL;
+    return;
+  }
+
+  parts = static_cast<RGWMultiCompleteUpload *>(parser.find_first("CompleteMultipartUpload"));
+  if (!parts) {
+    ret = -EINVAL;
+    return;
+  }
+
+  mp.init(s->object_str, upload_id);
+  meta_oid = mp.get_meta();
+
+  int total_parts = 0;
+  int handled_parts = 0;
+  int max_parts = 1000;
+  int marker = 0;
+  bool truncated;
+
+  uint64_t min_part_size = s->cct->_conf->rgw_multipart_min_part_size;
+
+  list<string> remove_objs; /* objects to be removed from index listing */
+
+  iter = parts->parts.begin();
+
+  meta_obj.init_ns(s->bucket, meta_oid, mp_ns);
+  meta_obj.set_in_extra_data(true);
+
+  ret = get_obj_attrs(store, s, meta_obj, attrs, NULL, NULL);
+  if (ret < 0) {
+    ldout(s->cct, 0) << "ERROR: failed to get obj attrs, obj=" << meta_obj << " ret=" << ret << dendl;
+    return;
+  }
+
+  do {
+    ret = list_multipart_parts(store, s, upload_id, meta_oid, max_parts, marker, obj_parts, &marker, &truncated);
+    if (ret == -ENOENT) {
+      ret = -ERR_NO_SUCH_UPLOAD;
+    }
+    if (ret < 0)
+      return;
+
+    total_parts += obj_parts.size();
+    if (!truncated && total_parts != (int)parts->parts.size()) {
+      ret = -ERR_INVALID_PART;
+      return;
+    }
+
+    for (obj_iter = obj_parts.begin(); iter != parts->parts.end() && obj_iter != obj_parts.end(); ++iter, ++obj_iter, ++handled_parts) {
+      uint64_t part_size = obj_iter->second.size;
+      if (handled_parts < (int)parts->parts.size() - 1 &&
+          part_size < min_part_size) {
+        ret = -ERR_TOO_SMALL;
+        return;
+      }
+
+      char petag[CEPH_CRYPTO_MD5_DIGESTSIZE];
+      if (iter->first != (int)obj_iter->first) {
+        ldout(s->cct, 0) << "NOTICE: parts num mismatch: next requested: " << iter->first << " next uploaded: " << obj_iter->first << dendl;
+        ret = -ERR_INVALID_PART;
+        return;
+      }
+      string part_etag = rgw_string_unquote(iter->second);
+      if (part_etag.compare(obj_iter->second.etag) != 0) {
+        ldout(s->cct, 0) << "NOTICE: etag mismatch: part: " << iter->first << " etag: " << iter->second << dendl;
+        ret = -ERR_INVALID_PART;
+        return;
+      }
+
+      hex_to_buf(obj_iter->second.etag.c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE);
+      hash.Update((const byte *)petag, sizeof(petag));
+
+      RGWUploadPartInfo& obj_part = obj_iter->second;
+
+      /* update manifest for part */
+      string oid = mp.get_part(obj_iter->second.num);
+      rgw_obj src_obj;
+      src_obj.init_ns(s->bucket, oid, mp_ns);
+
+      if (obj_part.manifest.empty()) {
+        ldout(s->cct, 0) << "ERROR: empty manifest for object part: obj=" << src_obj << dendl;
+        ret = -ERR_INVALID_PART;
+        return;
+      } else {
+        manifest.append(obj_part.manifest);
+      }
+
+      remove_objs.push_back(src_obj.object);
+
+      ofs += obj_part.size;
+    }
+  } while (truncated);
+  hash.Final((byte *)final_etag);
+
+  buf_to_hex((unsigned char *)final_etag, sizeof(final_etag), final_etag_str);
+  snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2], sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2,
+           "-%lld", (long long)parts->parts.size());
+  etag = final_etag_str;
+  ldout(s->cct, 10) << "calculated etag: " << final_etag_str << dendl;
+
+  etag_bl.append(final_etag_str, strlen(final_etag_str) + 1);
+
+  attrs[RGW_ATTR_ETAG] = etag_bl;
+
+  target_obj.init(s->bucket, s->object_str);
+
+  store->set_atomic(s->obj_ctx, target_obj);
+
+  RGWRados::PutObjMetaExtraParams extra_params;
+
+  extra_params.manifest = &manifest;
+  extra_params.remove_objs = &remove_objs;
+
+  extra_params.ptag = &s->req_id; /* use req_id as operation tag */
+  extra_params.owner = s->owner.get_id();
+
+  ret = store->put_obj_meta(s->obj_ctx, target_obj, ofs, attrs,
+                            RGW_OBJ_CATEGORY_MAIN, PUT_OBJ_CREATE,
+                            extra_params);
+  if (ret < 0)
+    return;
+
+  // remove the upload obj
+  store->delete_obj(s->obj_ctx, s->bucket_owner.get_id(), meta_obj);
+}
+
+int RGWAbortMultipart::verify_permission()
+{
+  if (!verify_bucket_permission(s, RGW_PERM_WRITE))
+    return -EACCES;
+
+  return 0;
+}
+
+void RGWAbortMultipart::pre_exec()
+{
+  rgw_bucket_object_pre_exec(s);
+}
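The hash juggling in RGWCompleteMultipart::execute() implements the S3 multipart ETag convention: each part's hex ETag is converted back to its 16 raw MD5 bytes, the concatenation of those digests is MD5-hashed again, and "-<number of parts>" is appended. A standalone sketch (OpenSSL's MD5() is used here purely for illustration; rgw goes through ceph::crypto):

    #include <openssl/md5.h>  // illustration only; rgw uses ceph::crypto::MD5
    #include <cstdio>
    #include <cstdlib>
    #include <string>
    #include <vector>

    // Sketch of the S3 multipart ETag rule: MD5 over the concatenated raw
    // per-part digests, rendered as hex, then "-<part count>" appended.
    std::string multipart_etag(const std::vector<std::string>& part_etags_hex)
    {
      std::vector<unsigned char> concat;
      for (const std::string& hex : part_etags_hex)
        for (size_t i = 0; i + 1 < hex.size(); i += 2)  // hex -> raw bytes
          concat.push_back((unsigned char)
              std::strtoul(hex.substr(i, 2).c_str(), nullptr, 16));

      unsigned char digest[MD5_DIGEST_LENGTH];
      MD5(concat.data(), concat.size(), digest);

      char out[MD5_DIGEST_LENGTH * 2 + 16];
      int n = 0;
      for (unsigned char b : digest)
        n += std::snprintf(out + n, sizeof(out) - n, "%02x", b);
      std::snprintf(out + n, sizeof(out) - n, "-%zu", part_etags_hex.size());
      return std::string(out);
    }

Note the validation order above: every part except the last must meet rgw_multipart_min_part_size, and both the part-number sequence and the per-part ETags must match what the client sent in the CompleteMultipartUpload body before anything is hashed.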
+
+void RGWAbortMultipart::execute()
+{
+  ret = -EINVAL;
+  string upload_id;
+  string meta_oid;
+  upload_id = s->info.args.get("uploadId");
+  map<uint32_t, RGWUploadPartInfo> obj_parts;
+  map<uint32_t, RGWUploadPartInfo>::iterator obj_iter;
+  map<string, bufferlist> attrs;
+  rgw_obj meta_obj;
+  RGWMPObj mp;
+  const string& owner = s->bucket_owner.get_id();
+
+  if (upload_id.empty() || s->object_str.empty())
+    return;
+
+  mp.init(s->object_str, upload_id);
+  meta_oid = mp.get_meta();
+
+  ret = get_multipart_info(store, s, meta_oid, NULL, attrs);
+  if (ret < 0)
+    return;
+
+  bool truncated;
+  int marker = 0;
+  int max_parts = 1000;
+
+  do {
+    ret = list_multipart_parts(store, s, upload_id, meta_oid, max_parts, marker, obj_parts, &marker, &truncated);
+    if (ret < 0)
+      return;
+
+    for (obj_iter = obj_parts.begin(); obj_iter != obj_parts.end(); ++obj_iter) {
+      RGWUploadPartInfo& obj_part = obj_iter->second;
+
+      if (obj_part.manifest.empty()) {
+        string oid = mp.get_part(obj_iter->second.num);
+        rgw_obj obj;
+        obj.init_ns(s->bucket, oid, mp_ns);
+        ret = store->delete_obj(s->obj_ctx, owner, obj);
+        if (ret < 0 && ret != -ENOENT)
+          return;
+      } else {
+        RGWObjManifest& manifest = obj_part.manifest;
+        RGWObjManifest::obj_iterator oiter;
+        for (oiter = manifest.obj_begin(); oiter != manifest.obj_end(); ++oiter) {
+          rgw_obj loc = oiter.get_location();
+          ret = store->delete_obj(s->obj_ctx, owner, loc);
+          if (ret < 0 && ret != -ENOENT)
+            return;
+        }
+      }
+    }
+  } while (truncated);
+
+  // and also remove the metadata obj
+  meta_obj.init_ns(s->bucket, meta_oid, mp_ns);
+  meta_obj.set_in_extra_data(true);
+  ret = store->delete_obj(s->obj_ctx, owner, meta_obj);
+  if (ret == -ENOENT) {
+    ret = -ERR_NO_SUCH_BUCKET;
+  }
+}
+
+int RGWListMultipart::verify_permission()
+{
+  if (!verify_object_permission(s, RGW_PERM_READ))
+    return -EACCES;
+
+  return 0;
+}
+
+void RGWListMultipart::pre_exec()
+{
+  rgw_bucket_object_pre_exec(s);
+}
+
+void RGWListMultipart::execute()
+{
+  map<string, bufferlist> xattrs;
+  string meta_oid;
+  RGWMPObj mp;
+
+  ret = get_params();
+  if (ret < 0)
+    return;
+
+  mp.init(s->object_str, upload_id);
+  meta_oid = mp.get_meta();
+
+  ret = get_multipart_info(store, s, meta_oid, &policy, xattrs);
+  if (ret < 0)
+    return;
+
+  ret = list_multipart_parts(store, s, upload_id, meta_oid, max_parts, marker, parts, NULL, &truncated);
+}
+
+int RGWListBucketMultiparts::verify_permission()
+{
+  if (!verify_bucket_permission(s, RGW_PERM_READ))
+    return -EACCES;
+
+  return 0;
+}
+
+void RGWListBucketMultiparts::pre_exec()
+{
+  rgw_bucket_object_pre_exec(s);
+}
+
+void RGWListBucketMultiparts::execute()
+{
+  vector<RGWObjEnt> objs;
+  string marker_meta;
+
+  ret = get_params();
+  if (ret < 0)
+    return;
+
+  if (s->prot_flags & RGW_REST_SWIFT) {
+    string path_args;
+    path_args = s->info.args.get("path");
+    if (!path_args.empty()) {
+      if (!delimiter.empty() || !prefix.empty()) {
+        ret = -EINVAL;
+        return;
+      }
+      prefix = path_args;
+      delimiter="/";
+    }
+  }
+  marker_meta = marker.get_meta();
+  ret = store->list_objects(s->bucket, max_uploads, prefix, delimiter, marker_meta, NULL, objs, common_prefixes,
+                            !!(s->prot_flags & RGW_REST_SWIFT), mp_ns, true, &is_truncated, &mp_filter);
+  if (!objs.empty()) {
+    vector<RGWObjEnt>::iterator iter;
+    RGWMultipartUploadEntry entry;
+    for (iter = objs.begin(); iter != objs.end(); ++iter) {
+      string name = iter->name;
+      if (!entry.mp.from_meta(name))
+        continue;
+      entry.obj = *iter;
+      uploads.push_back(entry);
+    }
+    next_marker = entry;
+  }
+}
+
+int RGWDeleteMultiObj::verify_permission()
+{
+  if (!verify_bucket_permission(s, RGW_PERM_WRITE))
+    return -EACCES;
+
+  return 0;
+}
+
+void RGWDeleteMultiObj::pre_exec()
+{
+  rgw_bucket_object_pre_exec(s);
+}
+
+void RGWDeleteMultiObj::execute()
+{
+  RGWMultiDelDelete *multi_delete;
+  vector<string>::iterator iter;
+  RGWMultiDelXMLParser parser;
+  pair<string, int> result;
+  int num_processed = 0;
+
+  ret = get_params();
+  if (ret < 0) {
+    goto error;
+  }
+
+  if (!data) {
+    ret = -EINVAL;
+    goto error;
+  }
+
+  if (!parser.init()) {
+    ret = -EINVAL;
+    goto error;
+  }
+
+  if (!parser.parse(data, len, 1)) {
+    ret = -EINVAL;
+    goto error;
+  }
+
+  multi_delete = static_cast<RGWMultiDelDelete *>(parser.find_first("Delete"));
+  if (!multi_delete) {
+    ret = -EINVAL;
+    goto error;
+  }
+
+  if (multi_delete->is_quiet())
+    quiet = true;
+
+  begin_response();
+  if (multi_delete->objects.empty()) {
+    goto done;
+  }
+
+  for (iter = multi_delete->objects.begin();
+       iter != multi_delete->objects.end() && num_processed < max_to_delete;
+       ++iter, num_processed++) {
+
+    rgw_obj obj(bucket, (*iter));
+    store->set_atomic(s->obj_ctx, obj);
+    ret = store->delete_obj(s->obj_ctx, s->bucket_owner.get_id(), obj);
+    if (ret == -ENOENT) {
+      ret = 0;
+    }
+    result = make_pair(*iter, ret);
+
+    send_partial_response(result);
+  }
+
+  /* set the return code to zero, errors at this point will be
+     dumped to the response */
+  ret = 0;
+
+done:
+  // will likely segfault if begin_response() has not been called
+  end_response();
+  free(data);
+  return;
+
+error:
+  send_status();
+  free(data);
+  return;
+
+}
+
+RGWHandler::~RGWHandler()
+{
+}
+
+int RGWHandler::init(RGWRados *_store, struct req_state *_s, RGWClientIO *cio)
+{
+  store = _store;
+  s = _s;
+
+  return 0;
+}
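RGWDeleteMultiObj::execute() above streams one result pair per key and deliberately collapses ENOENT to success, which is what makes S3 multi-delete idempotent per key; once begin_response() has run, later failures are reported inside the body rather than via the HTTP status. A reduced, standalone sketch of that loop (delete_one is a stand-in for the store call):

    #include <cerrno>
    #include <string>
    #include <utility>
    #include <vector>

    // Sketch of the multi-delete loop: each key is deleted independently,
    // ENOENT is collapsed to success, and a (key, result) pair is emitted
    // per entry, capped at max_to_delete.
    std::vector<std::pair<std::string, int>>
    multi_delete(const std::vector<std::string>& keys,
                 int (*delete_one)(const std::string&), size_t max_to_delete)
    {
      std::vector<std::pair<std::string, int>> results;
      for (size_t i = 0; i < keys.size() && i < max_to_delete; ++i) {
        int r = delete_one(keys[i]);
        if (r == -ENOENT)
          r = 0;              // deleting a missing key is not an error
        results.emplace_back(keys[i], r);
      }
      return results;
    }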
+
+int RGWHandler::do_read_permissions(RGWOp *op, bool only_bucket)
+{
+  int ret = rgw_build_policies(store, s, only_bucket, op->prefetch_data());
+
+  if (ret < 0) {
+    ldout(s->cct, 10) << "read_permissions on " << s->bucket << ":" << s->object_str << " only_bucket=" << only_bucket << " ret=" << ret << dendl;
+    if (ret == -ENODATA)
+      ret = -EACCES;
+  }
+
+  return ret;
+}
+
+
+RGWOp *RGWHandler::get_op(RGWRados *store)
+{
+  RGWOp *op;
+  switch (s->op) {
+   case OP_GET:
+     op = op_get();
+     break;
+   case OP_PUT:
+     op = op_put();
+     break;
+   case OP_DELETE:
+     op = op_delete();
+     break;
+   case OP_HEAD:
+     op = op_head();
+     break;
+   case OP_POST:
+     op = op_post();
+     break;
+   case OP_COPY:
+     op = op_copy();
+     break;
+   case OP_OPTIONS:
+     op = op_options();
+     break;
+   default:
+     return NULL;
+  }
+
+  if (op) {
+    op->init(store, s, this);
+  }
+  return op;
+}
+
+void RGWHandler::put_op(RGWOp *op)
+{
+  delete op;
+}
+
diff --git a/src/rgw/rgw_policy_s3.cc b/src/rgw/rgw_policy_s3.cc
index fac05f18884b6..9bf20f2c57649 100644
--- a/src/rgw/rgw_policy_s3.cc
+++ b/src/rgw/rgw_policy_s3.cc
@@ -284,11 +284,13 @@ int RGWPolicy::from_json(bufferlist& bl, string& err_msg)
       int r = add_condition(v[0], v[1], v[2], err_msg);
       if (r < 0)
         return r;
-    } else {
+    } else if (!citer.end()) {
       JSONObj *c = *citer;
       dout(0) << "adding simple_check: " << c->get_name() << " : " << c->get_data() << dendl;
 
       add_simple_check(c->get_name(), c->get_data());
+    } else {
+      return -EINVAL;
     }
   }
   return 0;
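The rgw_policy_s3.cc hunk above is the substantive fix in that file: a "conditions" entry in a POST policy that is neither a well-formed three-element array nor a non-empty object used to fall through to the else branch and dereference citer at end(). A reduced sketch of the hardened control flow (Iter is a stand-in for rgw's JSONObjIter, which exposes end() as a method rather than a sentinel):

    #include <cerrno>

    // Reduced control flow of the fix; Iter stands in for JSONObjIter.
    template <typename Iter>
    int handle_condition_entry(bool is_array, Iter citer, Iter end)
    {
      if (is_array) {
        return 0;        // ["op", "$var", "value"]: handled by add_condition()
      } else if (citer != end) {
        return 0;        // {"name": "value"}: a simple equality check
      } else {
        return -EINVAL;  // empty entry: previously dereferenced an end iterator
      }
    }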
diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc
index c7961f4a9c534..204e5ee0aba96 100644
--- a/src/rgw/rgw_rest_s3.cc
+++ b/src/rgw/rgw_rest_s3.cc
@@ -124,6 +124,11 @@ int RGWGetObj_ObjStore_S3::send_response_data(bufferlist& bl, off_t bl_ofs, off_
       bool exists;
       string val = s->info.args.get(p->param, &exists);
       if (exists) {
+        /* reject unauthenticated response header manipulation, see
+         * https://docs.aws.amazon.com/AmazonS3/latest/API/API_GetObject.html */
+        if (s->auth.identity->is_anonymous()) {
+          return -ERR_INVALID_REQUEST;
+        }
         if (strcmp(p->param, "response-content-type") != 0) {
           response_attrs[p->http_attr] = val;
         } else {
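For reference, the response-* query parameters this hunk now denies to anonymous callers are the standard S3 GetObject response overrides (Content-Type, Content-Language, Expires, Cache-Control, Content-Disposition, Content-Encoding). A small sketch of the gate, with that table spelled out (hypothetical standalone helper, not the rgw table itself):

    #include <cstring>

    // The response-override params covered by the check above; a request
    // from an anonymous identity carrying any of them is now rejected.
    static const char *response_overrides[] = {
      "response-content-type", "response-content-language",
      "response-expires", "response-cache-control",
      "response-content-disposition", "response-content-encoding",
      nullptr,
    };

    bool is_response_override(const char *param)
    {
      for (const char **p = response_overrides; *p; ++p)
        if (std::strcmp(param, *p) == 0)
          return true;
      return false;
    }

Returning ERR_INVALID_REQUEST keeps the rejection in the request-validation class (a 400-style InvalidRequest) rather than presenting it as an authorization failure.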
s->formatter->open_object_section("Bucket"); + s->formatter->dump_string("Name", obj.bucket.name); + dump_time(s, "CreationDate", &obj.creation_time); + s->formatter->close_section(); +} + +void rgw_get_errno_s3(rgw_http_errors *e , int err_no) +{ + const struct rgw_http_errors *r; + r = search_err(err_no, RGW_HTTP_ERRORS, ARRAY_LEN(RGW_HTTP_ERRORS)); + + if (r) { + e->http_ret = r->http_ret; + e->s3_code = r->s3_code; + } else { + e->http_ret = 500; + e->s3_code = "UnknownError"; + } +} + +struct response_attr_param { + const char *param; + const char *http_attr; +}; + +static struct response_attr_param resp_attr_params[] = { + {"response-content-type", "Content-Type"}, + {"response-content-language", "Content-Language"}, + {"response-expires", "Expires"}, + {"response-cache-control", "Cache-Control"}, + {"response-content-disposition", "Content-Disposition"}, + {"response-content-encoding", "Content-Encoding"}, + {NULL, NULL}, +}; + +int RGWGetObj_ObjStore_S3::send_response_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) +{ + const char *content_type = NULL; + string content_type_str; + map response_attrs; + map::iterator riter; + bufferlist metadata_bl; + + if (ret) + goto done; + + if (sent_header) + goto send_data; + + if (range_str) + dump_range(s, start, end, s->obj_size); + + if (s->system_request && + s->info.args.exists(RGW_SYS_PARAM_PREFIX "prepend-metadata")) { + + /* JSON encode object metadata */ + JSONFormatter jf; + jf.open_object_section("obj_metadata"); + encode_json("attrs", attrs, &jf); + encode_json("mtime", lastmod, &jf); + jf.close_section(); + stringstream ss; + jf.flush(ss); + metadata_bl.append(ss.str()); + s->cio->print("Rgwx-Embedded-Metadata-Len: %lld\r\n", (long long)metadata_bl.length()); + total_len += metadata_bl.length(); + } + + if (s->system_request && lastmod) { + /* we end up dumping mtime in two different methods, a bit redundant */ + dump_epoch_header(s, "Rgwx-Mtime", lastmod); + } + + dump_content_length(s, total_len); + dump_last_modified(s, lastmod); + + if (!ret) { + map::iterator iter = attrs.find(RGW_ATTR_ETAG); + if (iter != attrs.end()) { + bufferlist& bl = iter->second; + if (bl.length()) { + char *etag = bl.c_str(); + dump_etag(s, etag); + } + } + + for (struct response_attr_param *p = resp_attr_params; p->param; p++) { + bool exists; + string val = s->info.args.get(p->param, &exists); + if (exists) { + /* reject unauthenticated response header manipulation, see + * https://docs.aws.amazon.com/AmazonS3/latest/API/API_GetObject.html */ + if (s->auth.identity->is_anonymous()) { + return -EPERM; + } + if (strcmp(p->param, "response-content-type") != 0) { + response_attrs[p->http_attr] = val; + } else { + content_type_str = val; + content_type = content_type_str.c_str(); + } + } + } + + for (iter = attrs.begin(); iter != attrs.end(); ++iter) { + const char *name = iter->first.c_str(); + map::iterator aiter = rgw_to_http_attrs.find(name); + if (aiter != rgw_to_http_attrs.end()) { + if (response_attrs.count(aiter->second) > 0) // was already overridden by a response param + continue; + + if (aiter->first.compare(RGW_ATTR_CONTENT_TYPE) == 0) { // special handling for content_type + if (!content_type) + content_type = iter->second.c_str(); + continue; + } + response_attrs[aiter->second] = iter->second.c_str(); + } else { + if (strncmp(name, RGW_ATTR_META_PREFIX, sizeof(RGW_ATTR_META_PREFIX)-1) == 0) { + name += sizeof(RGW_ATTR_PREFIX) - 1; + s->cio->print("%s: %s\r\n", name, iter->second.c_str()); + } + } + } + } + +done: + set_req_state_err(s, 
(partial_content && !ret) ? STATUS_PARTIAL_CONTENT : ret); + + dump_errno(s); + + for (riter = response_attrs.begin(); riter != response_attrs.end(); ++riter) { + s->cio->print("%s: %s\n", riter->first.c_str(), riter->second.c_str()); + } + + if (!content_type) + content_type = "binary/octet-stream"; + + end_header(s, this, content_type); + + if (metadata_bl.length()) { + s->cio->write(metadata_bl.c_str(), metadata_bl.length()); + } + sent_header = true; + +send_data: + if (get_data && !ret) { + int r = s->cio->write(bl.c_str() + bl_ofs, bl_len); + if (r < 0) + return r; + } + + return 0; +} + +void RGWListBuckets_ObjStore_S3::send_response_begin(bool has_buckets) +{ + if (ret) + set_req_state_err(s, ret); + dump_errno(s); + dump_start(s); + end_header(s, NULL, "application/xml"); + + if (!ret) { + list_all_buckets_start(s); + dump_owner(s, s->user.user_id, s->user.display_name); + s->formatter->open_array_section("Buckets"); + sent_data = true; + } +} + +void RGWListBuckets_ObjStore_S3::send_response_data(RGWUserBuckets& buckets) +{ + if (!sent_data) + return; + + map& m = buckets.get_buckets(); + map::iterator iter; + + for (iter = m.begin(); iter != m.end(); ++iter) { + RGWBucketEnt obj = iter->second; + dump_bucket(s, obj); + } + rgw_flush_formatter(s, s->formatter); +} + +void RGWListBuckets_ObjStore_S3::send_response_end() +{ + if (sent_data) { + s->formatter->close_section(); + list_all_buckets_end(s); + rgw_flush_formatter_and_reset(s, s->formatter); + } +} + +int RGWListBucket_ObjStore_S3::get_params() +{ + prefix = s->info.args.get("prefix"); + marker = s->info.args.get("marker"); + max_keys = s->info.args.get("max-keys"); + ret = parse_max_keys(); + if (ret < 0) { + return ret; + } + delimiter = s->info.args.get("delimiter"); + return 0; +} + +void RGWListBucket_ObjStore_S3::send_response() +{ + if (ret < 0) + set_req_state_err(s, ret); + dump_errno(s); + + end_header(s, this, "application/xml"); + dump_start(s); + if (ret < 0) + return; + + s->formatter->open_object_section_in_ns("ListBucketResult", + "http://s3.amazonaws.com/doc/2006-03-01/"); + s->formatter->dump_string("Name", s->bucket_name_str); + s->formatter->dump_string("Prefix", prefix); + s->formatter->dump_string("Marker", marker); + if (is_truncated && !next_marker.empty()) + s->formatter->dump_string("NextMarker", next_marker); + s->formatter->dump_int("MaxKeys", max); + if (!delimiter.empty()) + s->formatter->dump_string("Delimiter", delimiter); + + s->formatter->dump_string("IsTruncated", (max && is_truncated ? 
"true" : "false")); + + if (ret >= 0) { + vector::iterator iter; + for (iter = objs.begin(); iter != objs.end(); ++iter) { + s->formatter->open_array_section("Contents"); + s->formatter->dump_string("Key", iter->name); + time_t mtime = iter->mtime.sec(); + dump_time(s, "LastModified", &mtime); + s->formatter->dump_format("ETag", "\"%s\"", iter->etag.c_str()); + s->formatter->dump_int("Size", iter->size); + s->formatter->dump_string("StorageClass", "STANDARD"); + dump_owner(s, iter->owner, iter->owner_display_name); + s->formatter->close_section(); + } + if (common_prefixes.size() > 0) { + map::iterator pref_iter; + for (pref_iter = common_prefixes.begin(); pref_iter != common_prefixes.end(); ++pref_iter) { + s->formatter->open_array_section("CommonPrefixes"); + s->formatter->dump_string("Prefix", pref_iter->first); + s->formatter->close_section(); + } + } + } + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void RGWGetBucketLogging_ObjStore_S3::send_response() +{ + dump_errno(s); + end_header(s, this, "application/xml"); + dump_start(s); + + s->formatter->open_object_section_in_ns("BucketLoggingStatus", + "http://doc.s3.amazonaws.com/doc/2006-03-01/"); + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +static void dump_bucket_metadata(struct req_state *s, RGWBucketEnt& bucket) +{ + char buf[32]; + snprintf(buf, sizeof(buf), "%lld", (long long)bucket.count); + s->cio->print("X-RGW-Object-Count: %s\n", buf); + snprintf(buf, sizeof(buf), "%lld", (long long)bucket.size); + s->cio->print("X-RGW-Bytes-Used: %s\n", buf); +} + +void RGWStatBucket_ObjStore_S3::send_response() +{ + if (ret >= 0) { + dump_bucket_metadata(s, bucket); + } + + set_req_state_err(s, ret); + dump_errno(s); + + end_header(s, this); + dump_start(s); +} + +static int create_s3_policy(struct req_state *s, RGWRados *store, RGWAccessControlPolicy_S3& s3policy) +{ + if (s->has_acl_header) { + if (!s->canned_acl.empty()) + return -ERR_INVALID_REQUEST; + + return s3policy.create_from_headers(store, s->info.env, s->owner); + } + + return s3policy.create_canned(s->owner, s->bucket_owner, s->canned_acl); +} + +class RGWLocationConstraint : public XMLObj +{ +public: + RGWLocationConstraint() {} + ~RGWLocationConstraint() {} + bool xml_end(const char *el) { + if (!el) + return false; + + location_constraint = get_data(); + + return true; + } + + string location_constraint; +}; + +class RGWCreateBucketConfig : public XMLObj +{ +public: + RGWCreateBucketConfig() {} + ~RGWCreateBucketConfig() {} +}; + +class RGWCreateBucketParser : public RGWXMLParser +{ + XMLObj *alloc_obj(const char *el) { + return new XMLObj; + } + +public: + RGWCreateBucketParser() {} + ~RGWCreateBucketParser() {} + + bool get_location_constraint(string& region) { + XMLObj *config = find_first("CreateBucketConfiguration"); + if (!config) + return false; + + XMLObj *constraint = config->find_first("LocationConstraint"); + if (!constraint) + return false; + + region = constraint->get_data(); + + return true; + } +}; + +int RGWCreateBucket_ObjStore_S3::get_params() +{ + RGWAccessControlPolicy_S3 s3policy(s->cct); + + int r = create_s3_policy(s, store, s3policy); + if (r < 0) + return r; + + policy = s3policy; + + int len = 0; + char *data; +#define CREATE_BUCKET_MAX_REQ_LEN (512 * 1024) /* this is way more than enough */ + ret = rgw_rest_read_all_input(s, &data, &len, CREATE_BUCKET_MAX_REQ_LEN); + if ((ret < 0) && (ret != -ERR_LENGTH_REQUIRED)) + return ret; + + bufferptr in_ptr(data, len); + 
in_data.append(in_ptr); + + if (len) { + RGWCreateBucketParser parser; + + if (!parser.init()) { + ldout(s->cct, 0) << "ERROR: failed to initialize parser" << dendl; + return -EIO; + } + + bool success = parser.parse(data, len, 1); + ldout(s->cct, 20) << "create bucket input data=" << data << dendl; + + if (!success) { + ldout(s->cct, 0) << "failed to parse input: " << data << dendl; + free(data); + return -EINVAL; + } + free(data); + + if (!parser.get_location_constraint(location_constraint)) { + ldout(s->cct, 0) << "provided input did not specify location constraint correctly" << dendl; + return -EINVAL; + } + + ldout(s->cct, 10) << "create bucket location constraint: " << location_constraint << dendl; + } + + int pos = location_constraint.find(':'); + if (pos >= 0) { + placement_rule = location_constraint.substr(pos + 1); + location_constraint = location_constraint.substr(0, pos); + } + + return 0; +} + +void RGWCreateBucket_ObjStore_S3::send_response() +{ + if (ret == -ERR_BUCKET_EXISTS) + ret = 0; + if (ret) + set_req_state_err(s, ret); + dump_errno(s); + end_header(s); + + if (ret < 0) + return; + + if (s->system_request) { + JSONFormatter f; /* use json formatter for system requests output */ + + f.open_object_section("info"); + encode_json("entry_point_object_ver", ep_objv, &f); + encode_json("object_ver", info.objv_tracker.read_version, &f); + encode_json("bucket_info", info, &f); + f.close_section(); + rgw_flush_formatter_and_reset(s, &f); + } +} + +void RGWDeleteBucket_ObjStore_S3::send_response() +{ + int r = ret; + if (!r) + r = STATUS_NO_CONTENT; + + set_req_state_err(s, r); + dump_errno(s); + end_header(s, this); + + if (s->system_request) { + JSONFormatter f; /* use json formatter for system requests output */ + + f.open_object_section("info"); + encode_json("object_ver", objv_tracker.read_version, &f); + f.close_section(); + rgw_flush_formatter_and_reset(s, &f); + } +} + +int RGWPutObj_ObjStore_S3::get_params() +{ + RGWAccessControlPolicy_S3 s3policy(s->cct); + if (!s->length) + return -ERR_LENGTH_REQUIRED; + + int r = create_s3_policy(s, store, s3policy); + if (r < 0) + return r; + + policy = s3policy; + + return RGWPutObj_ObjStore::get_params(); +} + +static int get_success_retcode(int code) +{ + switch (code) { + case 201: + return STATUS_CREATED; + case 204: + return STATUS_NO_CONTENT; + } + return 0; +} + +void RGWPutObj_ObjStore_S3::send_response() +{ + if (ret) { + set_req_state_err(s, ret); + } else { + if (s->cct->_conf->rgw_s3_success_create_obj_status) { + ret = get_success_retcode(s->cct->_conf->rgw_s3_success_create_obj_status); + set_req_state_err(s, ret); + } + dump_etag(s, etag.c_str()); + dump_content_length(s, 0); + } + if (s->system_request && mtime) { + dump_epoch_header(s, "Rgwx-Mtime", mtime); + } + dump_errno(s); + end_header(s, this); +} + +/* + * parses params in the format: 'first; param1=foo; param2=bar' + */ +static void parse_params(const string& params_str, string& first, map& params) +{ + int pos = params_str.find(';'); + if (pos < 0) { + first = rgw_trim_whitespace(params_str); + return; + } + + first = rgw_trim_whitespace(params_str.substr(0, pos)); + + pos++; + + while (pos < (int)params_str.size()) { + ssize_t end = params_str.find(';', pos); + if (end < 0) + end = params_str.size(); + + string param = params_str.substr(pos, end - pos); + + int eqpos = param.find('='); + if (eqpos > 0) { + string param_name = rgw_trim_whitespace(param.substr(0, eqpos)); + string val = rgw_trim_quotes(param.substr(eqpos + 1)); + params[param_name] = val; + 
} else { + params[rgw_trim_whitespace(param)] = ""; + } + + pos = end + 1; + } +} + +static int parse_part_field(const string& line, string& field_name, struct post_part_field& field) +{ + int pos = line.find(':'); + if (pos < 0) + return -EINVAL; + + field_name = line.substr(0, pos); + if (pos >= (int)line.size() - 1) + return 0; + + parse_params(line.substr(pos + 1), field.val, field.params); + + return 0; +} + +bool is_crlf(const char *s) +{ + return (*s == '\r' && *(s + 1) == '\n'); +} + +/* + * find the index of the boundary, if exists, or optionally the next end of line + * also returns how many bytes to skip + */ +static int index_of(bufferlist& bl, int max_len, const string& str, bool check_crlf, + bool *reached_boundary, int *skip) +{ + *reached_boundary = false; + *skip = 0; + + if (str.size() < 2) // we assume boundary is at least 2 chars (makes it easier with crlf checks) + return -EINVAL; + + if (bl.length() < str.size()) + return -1; + + const char *buf = bl.c_str(); + const char *s = str.c_str(); + + if (max_len > (int)bl.length()) + max_len = bl.length(); + + int i; + for (i = 0; i < max_len; i++, buf++) { + if (check_crlf && + i >= 1 && + is_crlf(buf - 1)) { + return i + 1; // skip the crlf + } + if ((i < max_len - (int)str.size() + 1) && + (buf[0] == s[0] && buf[1] == s[1]) && + (strncmp(buf, s, str.size()) == 0)) { + *reached_boundary = true; + *skip = str.size(); + + /* oh, great, now we need to swallow the preceding crlf + * if exists + */ + if ((i >= 2) && + is_crlf(buf - 2)) { + i -= 2; + *skip += 2; + } + return i; + } + } + + return -1; +} + +int RGWPostObj_ObjStore_S3::read_with_boundary(bufferlist& bl, uint64_t max, bool check_crlf, + bool *reached_boundary, bool *done) +{ + uint64_t cl = max + 2 + boundary.size(); + + if (max > in_data.length()) { + uint64_t need_to_read = cl - in_data.length(); + + bufferptr bp(need_to_read); + + int read_len; + s->cio->read(bp.c_str(), need_to_read, &read_len); + + in_data.append(bp, 0, read_len); + } + + *done = false; + int skip; + int index = index_of(in_data, cl, boundary, check_crlf, reached_boundary, &skip); + if (index >= 0) + max = index; + + if (max > in_data.length()) + max = in_data.length(); + + bl.substr_of(in_data, 0, max); + + bufferlist new_read_data; + + /* + * now we need to skip boundary for next time, also skip any crlf, or + * check to see if it's the last final boundary (marked with "--" at the end + */ + if (*reached_boundary) { + int left = in_data.length() - max; + if (left < skip + 2) { + int need = skip + 2 - left; + bufferptr boundary_bp(need); + int actual; + s->cio->read(boundary_bp.c_str(), need, &actual); + in_data.append(boundary_bp); + } + max += skip; // skip boundary for next time + if (in_data.length() >= max + 2) { + const char *data = in_data.c_str(); + if (is_crlf(data + max)) { + max += 2; + } else { + if (*(data + max) == '-' && + *(data + max + 1) == '-') { + *done = true; + max += 2; + } + } + } + } + + new_read_data.substr_of(in_data, max, in_data.length() - max); + in_data = new_read_data; + + return 0; +} + +int RGWPostObj_ObjStore_S3::read_line(bufferlist& bl, uint64_t max, + bool *reached_boundary, bool *done) +{ + return read_with_boundary(bl, max, true, reached_boundary, done); +} + +int RGWPostObj_ObjStore_S3::read_data(bufferlist& bl, uint64_t max, + bool *reached_boundary, bool *done) +{ + return read_with_boundary(bl, max, false, reached_boundary, done); +} + + +int RGWPostObj_ObjStore_S3::read_form_part_header(struct post_form_part *part, + bool *done) +{ + bufferlist bl; 
+ bool reached_boundary; + uint64_t chunk_size = s->cct->_conf->rgw_max_chunk_size; + int r = read_line(bl, chunk_size, &reached_boundary, done); + if (r < 0) + return r; + + if (*done) { + return 0; + } + + if (reached_boundary) { // skip the first boundary + r = read_line(bl, chunk_size, &reached_boundary, done); + if (r < 0) + return r; + if (*done) + return 0; + } + + while (true) { + /* + * iterate through fields + */ + string line = rgw_trim_whitespace(string(bl.c_str(), bl.length())); + + if (line.empty()) + break; + + struct post_part_field field; + + string field_name; + r = parse_part_field(line, field_name, field); + if (r < 0) + return r; + + part->fields[field_name] = field; + + if (stringcasecmp(field_name, "Content-Disposition") == 0) { + part->name = field.params["name"]; + } + + if (reached_boundary) + break; + + r = read_line(bl, chunk_size, &reached_boundary, done); + } + + return 0; +} + +bool RGWPostObj_ObjStore_S3::part_str(const string& name, string *val) +{ + map::iterator iter = parts.find(name); + if (iter == parts.end()) + return false; + + bufferlist& data = iter->second.data; + string str = string(data.c_str(), data.length()); + *val = rgw_trim_whitespace(str); + return true; +} + +bool RGWPostObj_ObjStore_S3::part_bl(const string& name, bufferlist *pbl) +{ + map::iterator iter = parts.find(name); + if (iter == parts.end()) + return false; + + *pbl = iter->second.data; + return true; +} + +void RGWPostObj_ObjStore_S3::rebuild_key(string& key) +{ + static string var = "${filename}"; + int pos = key.find(var); + if (pos < 0) + return; + + string new_key = key.substr(0, pos); + new_key.append(filename); + new_key.append(key.substr(pos + var.size())); + + key = new_key; +} + +int RGWPostObj_ObjStore_S3::get_params() +{ + // get the part boundary + string req_content_type_str = s->info.env->get("CONTENT_TYPE", ""); + string req_content_type; + map params; + + if (s->expect_cont) { + /* ok, here it really gets ugly. With POST, the params are embedded in the + * request body, so we need to continue before being able to actually look + * at them. This diverts from the usual request flow. 
+ */ + dump_continue(s); + s->expect_cont = false; + } + + parse_params(req_content_type_str, req_content_type, params); + + if (req_content_type.compare("multipart/form-data") != 0) { + err_msg = "Request Content-Type is not multipart/form-data"; + return -EINVAL; + } + + if (s->cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) { + ldout(s->cct, 20) << "request content_type_str=" << req_content_type_str << dendl; + ldout(s->cct, 20) << "request content_type params:" << dendl; + map::iterator iter; + for (iter = params.begin(); iter != params.end(); ++iter) { + ldout(s->cct, 20) << " " << iter->first << " -> " << iter->second << dendl; + } + } + + ldout(s->cct, 20) << "adding bucket to policy env: " << s->bucket.name << dendl; + env.add_var("bucket", s->bucket.name); + + map::iterator iter = params.find("boundary"); + if (iter == params.end()) { + err_msg = "Missing multipart boundary specification"; + return -EINVAL; + } + + // create the boundary + boundary = "--"; + boundary.append(iter->second); + + bool done; + do { + struct post_form_part part; + int r = read_form_part_header(&part, &done); + if (r < 0) + return r; + + if (s->cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) { + map::iterator piter; + for (piter = part.fields.begin(); piter != part.fields.end(); ++piter) { + ldout(s->cct, 20) << "read part header: name=" << part.name << " content_type=" << part.content_type << dendl; + ldout(s->cct, 20) << "name=" << piter->first << dendl; + ldout(s->cct, 20) << "val=" << piter->second.val << dendl; + ldout(s->cct, 20) << "params:" << dendl; + map& params = piter->second.params; + for (iter = params.begin(); iter != params.end(); ++iter) { + ldout(s->cct, 20) << " " << iter->first << " -> " << iter->second << dendl; + } + } + } + + if (done) { /* unexpected here */ + err_msg = "Malformed request"; + return -EINVAL; + } + + if (stringcasecmp(part.name, "file") == 0) { /* beginning of data transfer */ + struct post_part_field& field = part.fields["Content-Disposition"]; + map::iterator iter = field.params.find("filename"); + if (iter != field.params.end()) { + filename = iter->second; + } + parts[part.name] = part; + data_pending = true; + break; + } + + bool boundary; + uint64_t chunk_size = s->cct->_conf->rgw_max_chunk_size; + r = read_data(part.data, chunk_size, &boundary, &done); + if (!boundary) { + err_msg = "Couldn't find boundary"; + return -EINVAL; + } + parts[part.name] = part; + string part_str(part.data.c_str(), part.data.length()); + env.add_var(part.name, part_str); + } while (!done); + + if (!part_str("key", &s->object_str)) { + err_msg = "Key not specified"; + return -EINVAL; + } + + rebuild_key(s->object_str); + + if (s->object_str.empty()) { + err_msg = "Empty object name"; + return -EINVAL; + } + + env.add_var("key", s->object_str); + + part_str("Content-Type", &content_type); + env.add_var("Content-Type", content_type); + + map::iterator piter = parts.upper_bound(RGW_AMZ_META_PREFIX); + for (; piter != parts.end(); ++piter) { + string n = piter->first; + if (strncasecmp(n.c_str(), RGW_AMZ_META_PREFIX, sizeof(RGW_AMZ_META_PREFIX) - 1) != 0) + break; + + string attr_name = RGW_ATTR_PREFIX; + attr_name.append(n); + + /* need to null terminate it */ + bufferlist& data = piter->second.data; + string str = string(data.c_str(), data.length()); + + bufferlist attr_bl; + attr_bl.append(str.c_str(), str.size() + 1); + + attrs[attr_name] = attr_bl; + } + + int r = get_policy(); + if (r < 0) + return r; + + min_len = post_policy.min_length; + max_len = 
post_policy.max_length; + + return 0; +} + +int RGWPostObj_ObjStore_S3::get_policy() +{ + bufferlist encoded_policy; + + if (part_bl("policy", &encoded_policy)) { + + // check that the signature matches the encoded policy + string s3_access_key; + if (!part_str("AWSAccessKeyId", &s3_access_key)) { + ldout(s->cct, 0) << "No S3 access key found!" << dendl; + err_msg = "Missing access key"; + return -EINVAL; + } + string received_signature_str; + if (!part_str("signature", &received_signature_str)) { + ldout(s->cct, 0) << "No signature found!" << dendl; + err_msg = "Missing signature"; + return -EINVAL; + } + + RGWUserInfo user_info; + + ret = rgw_get_user_info_by_access_key(store, s3_access_key, user_info); + if (ret < 0) { + ldout(s->cct, 0) << "User lookup failed!" << dendl; + err_msg = "Bad access key / signature"; + return -EACCES; + } + + map access_keys = user_info.access_keys; + + map::const_iterator iter = access_keys.find(s3_access_key); + // We know the key must exist, since the user was returned by + // rgw_get_user_info_by_access_key, but it doesn't hurt to check! + if (iter == access_keys.end()) { + ldout(s->cct, 0) << "Secret key lookup failed!" << dendl; + err_msg = "No secret key for matching access key"; + return -EACCES; + } + string s3_secret_key = (iter->second).key; + + char expected_signature_char[CEPH_CRYPTO_HMACSHA1_DIGESTSIZE]; + + calc_hmac_sha1(s3_secret_key.c_str(), s3_secret_key.size(), encoded_policy.c_str(), encoded_policy.length(), expected_signature_char); + bufferlist expected_signature_hmac_raw; + bufferlist expected_signature_hmac_encoded; + expected_signature_hmac_raw.append(expected_signature_char, CEPH_CRYPTO_HMACSHA1_DIGESTSIZE); + expected_signature_hmac_raw.encode_base64(expected_signature_hmac_encoded); + expected_signature_hmac_encoded.append((char)0); /* null terminate */ + + if (received_signature_str.compare(expected_signature_hmac_encoded.c_str()) != 0) { + ldout(s->cct, 0) << "Signature verification failed!" << dendl; + ldout(s->cct, 0) << "received: " << received_signature_str.c_str() << dendl; + ldout(s->cct, 0) << "expected: " << expected_signature_hmac_encoded.c_str() << dendl; + err_msg = "Bad access key / signature"; + return -EACCES; + } + ldout(s->cct, 0) << "Successful Signature Verification!" << dendl; + bufferlist decoded_policy; + try { + decoded_policy.decode_base64(encoded_policy); + } catch (buffer::error& err) { + ldout(s->cct, 0) << "failed to decode_base64 policy" << dendl; + err_msg = "Could not decode policy"; + return -EINVAL; + } + + decoded_policy.append('\0'); // NULL terminate + + ldout(s->cct, 0) << "POST policy: " << decoded_policy.c_str() << dendl; + + int r = post_policy.from_json(decoded_policy, err_msg); + if (r < 0) { + if (err_msg.empty()) { + err_msg = "Failed to parse policy"; + } + ldout(s->cct, 0) << "failed to parse policy" << dendl; + return -EINVAL; + } + + post_policy.set_var_checked("AWSAccessKeyId"); + post_policy.set_var_checked("policy"); + post_policy.set_var_checked("signature"); + + r = post_policy.check(&env, err_msg); + if (r < 0) { + if (err_msg.empty()) { + err_msg = "Policy check failed"; + } + ldout(s->cct, 0) << "policy check failed" << dendl; + return r; + } + + s->user = user_info; + s->owner.set_id(user_info.user_id); + s->owner.set_name(user_info.display_name); + } else { + ldout(s->cct, 0) << "No attached policy found!" 
<< dendl; + } + + string canned_acl; + part_str("acl", &canned_acl); + + RGWAccessControlPolicy_S3 s3policy(s->cct); + ldout(s->cct, 20) << "canned_acl=" << canned_acl << dendl; + if (s3policy.create_canned(s->owner, s->bucket_owner, canned_acl) < 0) { + err_msg = "Bad canned ACLs"; + return -EINVAL; + } + + policy = s3policy; + + return 0; +} + +int RGWPostObj_ObjStore_S3::complete_get_params() +{ + bool done; + do { + struct post_form_part part; + int r = read_form_part_header(&part, &done); + if (r < 0) + return r; + + bufferlist part_data; + bool boundary; + uint64_t chunk_size = s->cct->_conf->rgw_max_chunk_size; + r = read_data(part.data, chunk_size, &boundary, &done); + if (!boundary) { + return -EINVAL; + } + + parts[part.name] = part; + } while (!done); + + return 0; +} + +int RGWPostObj_ObjStore_S3::get_data(bufferlist& bl) +{ + bool boundary; + bool done; + + uint64_t chunk_size = s->cct->_conf->rgw_max_chunk_size; + int r = read_data(bl, chunk_size, &boundary, &done); + if (r < 0) + return r; + + if (boundary) { + data_pending = false; + + if (!done) { /* reached end of data, let's drain the rest of the params */ + r = complete_get_params(); + if (r < 0) + return r; + } + } + + return bl.length(); +} + +void RGWPostObj_ObjStore_S3::send_response() +{ + if (ret == 0 && parts.count("success_action_redirect")) { + string redirect; + + part_str("success_action_redirect", &redirect); + + string bucket; + string key; + string etag_str = "\""; + + etag_str.append(etag); + etag_str.append("\""); + + string etag_url; + + url_encode(s->bucket_name_str, bucket); + url_encode(s->object_str, key); + url_encode(etag_str, etag_url); + + redirect.append("?bucket="); + redirect.append(bucket); + redirect.append("&key="); + redirect.append(key); + redirect.append("&etag="); + redirect.append(etag_url); + + int r = check_utf8(redirect.c_str(), redirect.size()); + if (r < 0) { + ret = r; + goto done; + } + dump_redirect(s, redirect); + ret = STATUS_REDIRECT; + } else if (ret == 0 && parts.count("success_action_status")) { + string status_string; + uint32_t status_int; + + part_str("success_action_status", &status_string); + + int r = stringtoul(status_string, &status_int); + if (r < 0) { + ret = r; + goto done; + } + + switch (status_int) { + case 200: + break; + case 201: + ret = STATUS_CREATED; + break; + default: + ret = STATUS_NO_CONTENT; + break; + } + } else if (!ret) { + ret = STATUS_NO_CONTENT; + } + +done: + if (ret == STATUS_CREATED) { + s->formatter->open_object_section("PostResponse"); + if (g_conf->rgw_dns_name.length()) + s->formatter->dump_format("Location", "%s/%s", s->info.script_uri.c_str(), s->object_str.c_str()); + s->formatter->dump_string("Bucket", s->bucket_name_str); + s->formatter->dump_string("Key", s->object_str); + s->formatter->close_section(); + } + s->err.message = err_msg; + set_req_state_err(s, ret); + dump_errno(s); + if (ret >= 0) { + dump_content_length(s, s->formatter->get_len()); + } + end_header(s, this); + if (ret != STATUS_CREATED) + return; + + rgw_flush_formatter_and_reset(s, s->formatter); +} + + +void RGWDeleteObj_ObjStore_S3::send_response() +{ + int r = ret; + if (r == -ENOENT) + r = 0; + if (!r) + r = STATUS_NO_CONTENT; + + set_req_state_err(s, r); + dump_errno(s); + end_header(s, this); +} + +int RGWCopyObj_ObjStore_S3::init_dest_policy() +{ + RGWAccessControlPolicy_S3 s3policy(s->cct); + + /* build a policy for the target object */ + int r = create_s3_policy(s, store, s3policy); + if (r < 0) + return r; + + dest_policy = s3policy; + + return 0; +} 
+ +int RGWCopyObj_ObjStore_S3::get_params() +{ + if_mod = s->info.env->get("HTTP_X_AMZ_COPY_IF_MODIFIED_SINCE"); + if_unmod = s->info.env->get("HTTP_X_AMZ_COPY_IF_UNMODIFIED_SINCE"); + if_match = s->info.env->get("HTTP_X_AMZ_COPY_IF_MATCH"); + if_nomatch = s->info.env->get("HTTP_X_AMZ_COPY_IF_NONE_MATCH"); + + src_bucket_name = s->src_bucket_name; + src_object = s->src_object; + dest_bucket_name = s->bucket.name; + dest_object = s->object_str; + + if (s->system_request) { + source_zone = s->info.args.get(RGW_SYS_PARAM_PREFIX "source-zone"); + if (!source_zone.empty()) { + client_id = s->info.args.get(RGW_SYS_PARAM_PREFIX "client-id"); + op_id = s->info.args.get(RGW_SYS_PARAM_PREFIX "op-id"); + + if (client_id.empty() || op_id.empty()) { + ldout(s->cct, 0) << RGW_SYS_PARAM_PREFIX "client-id or " RGW_SYS_PARAM_PREFIX "op-id were not provided, required for intra-region copy" << dendl; + return -EINVAL; + } + } + } + + const char *md_directive = s->info.env->get("HTTP_X_AMZ_METADATA_DIRECTIVE"); + if (md_directive) { + if (strcasecmp(md_directive, "COPY") == 0) { + replace_attrs = false; + } else if (strcasecmp(md_directive, "REPLACE") == 0) { + replace_attrs = true; + } else if (!source_zone.empty()) { + replace_attrs = false; // default for intra-region copy + } else { + ldout(s->cct, 0) << "invalid metadata directive" << dendl; + return -EINVAL; + } + } + + if (source_zone.empty() && + (dest_bucket_name.compare(src_bucket_name) == 0) && + (dest_object.compare(src_object) == 0) && + !replace_attrs) { + /* can only copy object into itself if replacing attrs */ + ldout(s->cct, 0) << "can't copy object into itself if not replacing attrs" << dendl; + return -ERR_INVALID_REQUEST; + } + return 0; +} + +void RGWCopyObj_ObjStore_S3::send_partial_response(off_t ofs) +{ + if (!sent_header) { + if (ret) + set_req_state_err(s, ret); + dump_errno(s); + + end_header(s, this, "binary/octet-stream"); + if (ret == 0) { + s->formatter->open_object_section("CopyObjectResult"); + } + sent_header = true; + } else { + /* Send progress field. Note that this diverge from the original S3 + * spec. We do this in order to keep connection alive. 
+ */ + s->formatter->dump_int("Progress", (uint64_t)ofs); + } + rgw_flush_formatter(s, s->formatter); +} + +void RGWCopyObj_ObjStore_S3::send_response() +{ + if (!sent_header) + send_partial_response(0); + + if (ret == 0) { + dump_time(s, "LastModified", &mtime); + map::iterator iter = attrs.find(RGW_ATTR_ETAG); + if (iter != attrs.end()) { + bufferlist& bl = iter->second; + if (bl.length()) { + char *etag = bl.c_str(); + s->formatter->dump_string("ETag", etag); + } + } + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); + } +} + +void RGWGetACLs_ObjStore_S3::send_response() +{ + if (ret) + set_req_state_err(s, ret); + dump_errno(s); + end_header(s, this, "application/xml"); + dump_start(s); + s->cio->write(acls.c_str(), acls.size()); +} + +int RGWPutACLs_ObjStore_S3::get_policy_from_state(RGWRados *store, struct req_state *s, stringstream& ss) +{ + RGWAccessControlPolicy_S3 s3policy(s->cct); + + // bucket-* canned acls do not apply to bucket + if (s->object_str.empty()) { + if (s->canned_acl.find("bucket") != string::npos) + s->canned_acl.clear(); + } + + int r = create_s3_policy(s, store, s3policy); + if (r < 0) + return r; + + s3policy.to_xml(ss); + + return 0; +} + +void RGWPutACLs_ObjStore_S3::send_response() +{ + if (ret) + set_req_state_err(s, ret); + dump_errno(s); + end_header(s, this, "application/xml"); + dump_start(s); +} + +void RGWGetCORS_ObjStore_S3::send_response() +{ + if (ret) { + if (ret == -ENOENT) + set_req_state_err(s, ERR_NOT_FOUND); + else + set_req_state_err(s, ret); + } + dump_errno(s); + end_header(s, NULL, "application/xml"); + dump_start(s); + if (!ret) { + string cors; + RGWCORSConfiguration_S3 *s3cors = static_cast(&bucket_cors); + stringstream ss; + + s3cors->to_xml(ss); + cors = ss.str(); + s->cio->write(cors.c_str(), cors.size()); + } +} + +int RGWPutCORS_ObjStore_S3::get_params() +{ + int r; + char *data = NULL; + int len = 0; + size_t cl = 0; + RGWCORSXMLParser_S3 parser(s->cct); + RGWCORSConfiguration_S3 *cors_config; + + if (s->length) + cl = atoll(s->length); + if (cl) { + data = (char *)malloc(cl + 1); + if (!data) { + r = -ENOMEM; + goto done_err; + } + int read_len; + r = s->cio->read(data, cl, &read_len); + len = read_len; + if (r < 0) + goto done_err; + data[len] = '\0'; + } else { + len = 0; + } + + if (!parser.init()) { + r = -EINVAL; + goto done_err; + } + + if (!data || !parser.parse(data, len, 1)) { + r = -EINVAL; + goto done_err; + } + cors_config = static_cast(parser.find_first("CORSConfiguration")); + if (!cors_config) { + r = -EINVAL; + goto done_err; + } + + if (s->cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15)) { + ldout(s->cct, 15) << "CORSConfiguration"; + cors_config->to_xml(*_dout); + *_dout << dendl; + } + + cors_config->encode(cors_bl); + + free(data); + return 0; +done_err: + free(data); + return r; +} + +void RGWPutCORS_ObjStore_S3::send_response() +{ + if (ret) + set_req_state_err(s, ret); + dump_errno(s); + end_header(s, NULL, "application/xml"); + dump_start(s); +} + +void RGWDeleteCORS_ObjStore_S3::send_response() +{ + int r = ret; + if (!r || r == -ENOENT) + r = STATUS_NO_CONTENT; + + set_req_state_err(s, r); + dump_errno(s); + end_header(s, NULL); +} + +void RGWOptionsCORS_ObjStore_S3::send_response() +{ + string hdrs, exp_hdrs; + uint32_t max_age = CORS_MAX_AGE_INVALID; + /*EACCES means, there is no CORS registered yet for the bucket + *ENOENT means, there is no match of the Origin in the list of CORSRule + */ + if (ret == -ENOENT) + ret = -EACCES; + if (ret < 0) { + 
+    set_req_state_err(s, ret);
+    dump_errno(s);
+    end_header(s, NULL);
+    return;
+  }
+  get_response_params(hdrs, exp_hdrs, &max_age);
+
+  dump_errno(s);
+  dump_access_control(s, origin, req_meth, hdrs.c_str(), exp_hdrs.c_str(), max_age);
+  end_header(s, NULL);
+}
+
+int RGWInitMultipart_ObjStore_S3::get_params()
+{
+  RGWAccessControlPolicy_S3 s3policy(s->cct);
+  ret = create_s3_policy(s, store, s3policy);
+  if (ret < 0)
+    return ret;
+
+  policy = s3policy;
+
+  return 0;
+}
+
+void RGWInitMultipart_ObjStore_S3::send_response()
+{
+  if (ret)
+    set_req_state_err(s, ret);
+  dump_errno(s);
+  end_header(s, this, "application/xml");
+  if (ret == 0) {
+    dump_start(s);
+    s->formatter->open_object_section_in_ns("InitiateMultipartUploadResult",
+                                            "http://s3.amazonaws.com/doc/2006-03-01/");
+    s->formatter->dump_string("Bucket", s->bucket_name_str);
+    s->formatter->dump_string("Key", s->object);
+    s->formatter->dump_string("UploadId", upload_id);
+    s->formatter->close_section();
+    rgw_flush_formatter_and_reset(s, s->formatter);
+  }
+}
+
+void RGWCompleteMultipart_ObjStore_S3::send_response()
+{
+  if (ret)
+    set_req_state_err(s, ret);
+  dump_errno(s);
+  end_header(s, this, "application/xml");
+  if (ret == 0) {
+    dump_start(s);
+    s->formatter->open_object_section_in_ns("CompleteMultipartUploadResult",
+                                            "http://s3.amazonaws.com/doc/2006-03-01/");
+    if (g_conf->rgw_dns_name.length())
+      s->formatter->dump_format("Location", "%s.%s", s->bucket_name_str.c_str(), g_conf->rgw_dns_name.c_str());
+    s->formatter->dump_string("Bucket", s->bucket_name_str);
+    s->formatter->dump_string("Key", s->object);
+    s->formatter->dump_string("ETag", etag);
+    s->formatter->close_section();
+    rgw_flush_formatter_and_reset(s, s->formatter);
+  }
+}
+
+void RGWAbortMultipart_ObjStore_S3::send_response()
+{
+  int r = ret;
+  if (!r)
+    r = STATUS_NO_CONTENT;
+
+  set_req_state_err(s, r);
+  dump_errno(s);
+  end_header(s, this);
+}
+
+void RGWListMultipart_ObjStore_S3::send_response()
+{
+  if (ret)
+    set_req_state_err(s, ret);
+  dump_errno(s);
+  end_header(s, this, "application/xml");
+
+  if (ret == 0) {
+    dump_start(s);
+    s->formatter->open_object_section_in_ns("ListMultipartUploadResult",
+                                            "http://s3.amazonaws.com/doc/2006-03-01/");
+    map<uint32_t, RGWUploadPartInfo>::iterator iter;
+    map<uint32_t, RGWUploadPartInfo>::reverse_iterator test_iter;
+    int cur_max = 0;
+
+    iter = parts.begin();
+    test_iter = parts.rbegin();
+    if (test_iter != parts.rend()) {
+      cur_max = test_iter->first;
+    }
+    s->formatter->dump_string("Bucket", s->bucket_name_str);
+    s->formatter->dump_string("Key", s->object);
+    s->formatter->dump_string("UploadId", upload_id);
+    s->formatter->dump_string("StorageClass", "STANDARD");
+    s->formatter->dump_int("PartNumberMarker", marker);
+    s->formatter->dump_int("NextPartNumberMarker", cur_max);
+    s->formatter->dump_int("MaxParts", max_parts);
+    s->formatter->dump_string("IsTruncated", (truncated ? "true" : "false"));
"true" : "false")); + + ACLOwner& owner = policy.get_owner(); + dump_owner(s, owner.get_id(), owner.get_display_name()); + + for (; iter != parts.end(); ++iter) { + RGWUploadPartInfo& info = iter->second; + + time_t sec = info.modified.sec(); + struct tm tmp; + gmtime_r(&sec, &tmp); + char buf[TIME_BUF_SIZE]; + + s->formatter->open_object_section("Part"); + + if (strftime(buf, sizeof(buf), "%Y-%m-%dT%T.000Z", &tmp) > 0) { + s->formatter->dump_string("LastModified", buf); + } + + s->formatter->dump_unsigned("PartNumber", info.num); + s->formatter->dump_string("ETag", info.etag); + s->formatter->dump_unsigned("Size", info.size); + s->formatter->close_section(); + } + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); + } +} + +void RGWListBucketMultiparts_ObjStore_S3::send_response() +{ + if (ret < 0) + set_req_state_err(s, ret); + dump_errno(s); + + end_header(s, this, "application/xml"); + dump_start(s); + if (ret < 0) + return; + + s->formatter->open_object_section("ListMultipartUploadsResult"); + s->formatter->dump_string("Bucket", s->bucket_name_str); + if (!prefix.empty()) + s->formatter->dump_string("ListMultipartUploadsResult.Prefix", prefix); + string& key_marker = marker.get_key(); + if (!key_marker.empty()) + s->formatter->dump_string("KeyMarker", key_marker); + string& upload_id_marker = marker.get_upload_id(); + if (!upload_id_marker.empty()) + s->formatter->dump_string("UploadIdMarker", upload_id_marker); + string next_key = next_marker.mp.get_key(); + if (!next_key.empty()) + s->formatter->dump_string("NextKeyMarker", next_key); + string next_upload_id = next_marker.mp.get_upload_id(); + if (!next_upload_id.empty()) + s->formatter->dump_string("NextUploadIdMarker", next_upload_id); + s->formatter->dump_int("MaxUploads", max_uploads); + if (!delimiter.empty()) + s->formatter->dump_string("Delimiter", delimiter); + s->formatter->dump_string("IsTruncated", (is_truncated ? 
"true" : "false")); + + if (ret >= 0) { + vector::iterator iter; + for (iter = uploads.begin(); iter != uploads.end(); ++iter) { + RGWMPObj& mp = iter->mp; + s->formatter->open_array_section("Upload"); + s->formatter->dump_string("Key", mp.get_key()); + s->formatter->dump_string("UploadId", mp.get_upload_id()); + dump_owner(s, s->user.user_id, s->user.display_name, "Initiator"); + dump_owner(s, s->user.user_id, s->user.display_name); + s->formatter->dump_string("StorageClass", "STANDARD"); + time_t mtime = iter->obj.mtime.sec(); + dump_time(s, "Initiated", &mtime); + s->formatter->close_section(); + } + if (common_prefixes.size() > 0) { + s->formatter->open_array_section("CommonPrefixes"); + map::iterator pref_iter; + for (pref_iter = common_prefixes.begin(); pref_iter != common_prefixes.end(); ++pref_iter) { + s->formatter->dump_string("CommonPrefixes.Prefix", pref_iter->first); + } + s->formatter->close_section(); + } + } + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void RGWDeleteMultiObj_ObjStore_S3::send_status() +{ + if (!status_dumped) { + if (ret < 0) + set_req_state_err(s, ret); + dump_errno(s); + status_dumped = true; + } +} + +void RGWDeleteMultiObj_ObjStore_S3::begin_response() +{ + + if (!status_dumped) { + send_status(); + } + + dump_start(s); + end_header(s, this, "application/xml"); + s->formatter->open_object_section_in_ns("DeleteResult", + "http://s3.amazonaws.com/doc/2006-03-01/"); + + rgw_flush_formatter(s, s->formatter); +} + +void RGWDeleteMultiObj_ObjStore_S3::send_partial_response(pair& result) +{ + if (!result.first.empty()) { + if (result.second == 0 && !quiet) { + s->formatter->open_object_section("Deleted"); + s->formatter->dump_string("Key", result.first); + s->formatter->close_section(); + } else if (result.second < 0) { + struct rgw_http_errors r; + int err_no; + + s->formatter->open_object_section("Error"); + + err_no = -(result.second); + rgw_get_errno_s3(&r, err_no); + + s->formatter->dump_string("Key", result.first); + s->formatter->dump_int("Code", r.http_ret); + s->formatter->dump_string("Message", r.s3_code); + s->formatter->close_section(); + } + + rgw_flush_formatter(s, s->formatter); + } +} + +void RGWDeleteMultiObj_ObjStore_S3::end_response() +{ + + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +RGWOp *RGWHandler_ObjStore_Service_S3::op_get() +{ + return new RGWListBuckets_ObjStore_S3; +} + +RGWOp *RGWHandler_ObjStore_Service_S3::op_head() +{ + return new RGWListBuckets_ObjStore_S3; +} + +RGWOp *RGWHandler_ObjStore_Bucket_S3::get_obj_op(bool get_data) +{ + if (get_data) + return new RGWListBucket_ObjStore_S3; + else + return new RGWStatBucket_ObjStore_S3; +} + +RGWOp *RGWHandler_ObjStore_Bucket_S3::op_get() +{ + if (s->info.args.sub_resource_exists("logging")) + return new RGWGetBucketLogging_ObjStore_S3; + if (is_acl_op()) { + return new RGWGetACLs_ObjStore_S3; + } else if (is_cors_op()) { + return new RGWGetCORS_ObjStore_S3; + } else if (s->info.args.exists("uploads")) { + return new RGWListBucketMultiparts_ObjStore_S3; + } + return get_obj_op(true); +} + +RGWOp *RGWHandler_ObjStore_Bucket_S3::op_head() +{ + if (is_acl_op()) { + return new RGWGetACLs_ObjStore_S3; + } else if (s->info.args.exists("uploads")) { + return new RGWListBucketMultiparts_ObjStore_S3; + } + return get_obj_op(false); +} + +RGWOp *RGWHandler_ObjStore_Bucket_S3::op_put() +{ + if (s->info.args.sub_resource_exists("logging")) + return NULL; + if (is_acl_op()) { + return new 
+  } else if (is_cors_op()) {
+    return new RGWPutCORS_ObjStore_S3;
+  }
+  return new RGWCreateBucket_ObjStore_S3;
+}
+
+RGWOp *RGWHandler_ObjStore_Bucket_S3::op_delete()
+{
+  if (is_cors_op()) {
+    return new RGWDeleteCORS_ObjStore_S3;
+  }
+  return new RGWDeleteBucket_ObjStore_S3;
+}
+
+RGWOp *RGWHandler_ObjStore_Bucket_S3::op_post()
+{
+  if ( s->info.request_params == "delete" ) {
+    return new RGWDeleteMultiObj_ObjStore_S3;
+  }
+
+  return new RGWPostObj_ObjStore_S3;
+}
+
+RGWOp *RGWHandler_ObjStore_Bucket_S3::op_options()
+{
+  return new RGWOptionsCORS_ObjStore_S3;
+}
+
+RGWOp *RGWHandler_ObjStore_Obj_S3::get_obj_op(bool get_data)
+{
+  if (is_acl_op()) {
+    return new RGWGetACLs_ObjStore_S3;
+  }
+  RGWGetObj_ObjStore_S3 *get_obj_op = new RGWGetObj_ObjStore_S3;
+  get_obj_op->set_get_data(get_data);
+  return get_obj_op;
+}
+
+RGWOp *RGWHandler_ObjStore_Obj_S3::op_get()
+{
+  if (is_acl_op()) {
+    return new RGWGetACLs_ObjStore_S3;
+  } else if (s->info.args.exists("uploadId")) {
+    return new RGWListMultipart_ObjStore_S3;
+  }
+  return get_obj_op(true);
+}
+
+RGWOp *RGWHandler_ObjStore_Obj_S3::op_head()
+{
+  if (is_acl_op()) {
+    return new RGWGetACLs_ObjStore_S3;
+  } else if (s->info.args.exists("uploadId")) {
+    return new RGWListMultipart_ObjStore_S3;
+  }
+  return get_obj_op(false);
+}
+
+RGWOp *RGWHandler_ObjStore_Obj_S3::op_put()
+{
+  if (is_acl_op()) {
+    return new RGWPutACLs_ObjStore_S3;
+  }
+  if (!s->copy_source)
+    return new RGWPutObj_ObjStore_S3;
+  else
+    return new RGWCopyObj_ObjStore_S3;
+}
+
+RGWOp *RGWHandler_ObjStore_Obj_S3::op_delete()
+{
+  string upload_id = s->info.args.get("uploadId");
+
+  if (upload_id.empty())
+    return new RGWDeleteObj_ObjStore_S3;
+  else
+    return new RGWAbortMultipart_ObjStore_S3;
+}
+
+RGWOp *RGWHandler_ObjStore_Obj_S3::op_post()
+{
+  if (s->info.args.exists("uploadId"))
+    return new RGWCompleteMultipart_ObjStore_S3;
+
+  if (s->info.args.exists("uploads"))
+    return new RGWInitMultipart_ObjStore_S3;
+
+  return NULL;
+}
+
+RGWOp *RGWHandler_ObjStore_Obj_S3::op_options()
+{
+  return new RGWOptionsCORS_ObjStore_S3;
+}
+
+int RGWHandler_ObjStore_S3::init_from_header(struct req_state *s, int default_formatter, bool configurable_format)
+{
+  string req;
+  string first;
+
+  const char *req_name = s->relative_uri.c_str();
+  const char *p;
+
+  if (*req_name == '?') {
+    p = req_name;
+  } else {
+    p = s->info.request_params.c_str();
+  }
+
+  s->info.args.set(p);
+  s->info.args.parse();
+
+  /* must be called after the args parsing */
+  int ret = allocate_formatter(s, default_formatter, configurable_format);
+  if (ret < 0)
+    return ret;
+
+  if (*req_name != '/')
+    return 0;
+
+  req_name++;
+
+  if (!*req_name)
+    return 0;
+
+  req = req_name;
+  int pos = req.find('/');
+  if (pos >= 0) {
+    first = req.substr(0, pos);
+  } else {
+    first = req;
+  }
+
+  if (s->bucket_name_str.empty()) {
+    s->bucket_name_str = first;
+
+    if (pos >= 0) {
+      string encoded_obj_str = req.substr(pos+1);
+      s->object_str = encoded_obj_str;
+
+      if (s->object_str.size() > 0) {
+        s->object = strdup(s->object_str.c_str());
+      }
+    }
+  } else {
+    s->object_str = req_name;
+    s->object = strdup(s->object_str.c_str());
+  }
+  return 0;
+}
+
+static bool looks_like_ip_address(const char *bucket)
+{
+  int num_periods = 0;
+  bool expect_period = false;
+  for (const char *b = bucket; *b; ++b) {
+    if (*b == '.') {
+      if (!expect_period)
+        return false;
+      ++num_periods;
+      if (num_periods > 3)
+        return false;
+      expect_period = false;
+    }
+    else if (isdigit(*b)) {
+      expect_period = true;
+    }
+    else {
+      return false;
+    }
+  }
+  return (num_periods == 3);
+}
+
+int RGWHandler_ObjStore_S3::validate_bucket_name(const string& bucket, bool relaxed_names)
+{
+  int ret = RGWHandler_ObjStore::validate_bucket_name(bucket);
+  if (ret < 0)
+    return ret;
+
+  if (bucket.size() == 0)
+    return 0;
+
+  // bucket names must start with a letter or digit; relaxed naming also
+  // permits a leading underscore, period, or dash
+  if (!(isalpha(bucket[0]) || isdigit(bucket[0]))) {
+    if (!relaxed_names)
+      return -ERR_INVALID_BUCKET_NAME;
+    else if (!(bucket[0] == '_' || bucket[0] == '.' || bucket[0] == '-'))
+      return -ERR_INVALID_BUCKET_NAME;
+  }
+
+  for (const char *s = bucket.c_str(); *s; ++s) {
+    char c = *s;
+    if (isdigit(c) || (c == '.'))
+      continue;
+    if (isalpha(c))
+      continue;
+    if ((c == '-') || (c == '_'))
+      continue;
+    // Invalid character
+    return -ERR_INVALID_BUCKET_NAME;
+  }
+
+  if (looks_like_ip_address(bucket.c_str()))
+    return -ERR_INVALID_BUCKET_NAME;
+
+  return 0;
+}
+
+int RGWHandler_ObjStore_S3::init(RGWRados *store, struct req_state *s, RGWClientIO *cio)
+{
+  dout(10) << "s->object=" << (s->object ? s->object : "<NULL>") << " s->bucket=" << (!s->bucket_name_str.empty() ? s->bucket_name_str : "<NULL>") << dendl;
+
+  bool relaxed_names = s->cct->_conf->rgw_relaxed_s3_bucket_names;
+  int ret = validate_bucket_name(s->bucket_name_str, relaxed_names);
+  if (ret)
+    return ret;
+  ret = validate_object_name(s->object_str);
+  if (ret)
+    return ret;
+
+  const char *cacl = s->info.env->get("HTTP_X_AMZ_ACL");
+  if (cacl)
+    s->canned_acl = cacl;
+
+  s->has_acl_header = s->info.env->exists_prefix("HTTP_X_AMZ_GRANT");
+
+  s->copy_source = s->info.env->get("HTTP_X_AMZ_COPY_SOURCE");
+  if (s->copy_source) {
+    ret = RGWCopyObj::parse_copy_location(s->copy_source, s->src_bucket_name, s->src_object);
+    if (!ret) {
+      ldout(s->cct, 0) << "failed to parse copy location" << dendl;
+      return -EINVAL;
+    }
+  }
+
+  s->dialect = "s3";
+
+  return RGWHandler_ObjStore::init(store, s, cio);
+}
+
+
+/*
+ * Try to validate S3 auth against keystone s3token interface
+ */
+int RGW_Auth_S3_Keystone_ValidateToken::validate_s3token(const string& auth_id, const string& auth_token, const string& auth_sign) {
+  /* prepare keystone url */
+  string keystone_url = cct->_conf->rgw_keystone_url;
+  if (keystone_url[keystone_url.size() - 1] != '/')
+    keystone_url.append("/");
+  keystone_url.append("v2.0/s3tokens");
+
+  /* set required headers for keystone request */
+  append_header("X-Auth-Token", cct->_conf->rgw_keystone_admin_token);
+  append_header("Content-Type", "application/json");
+
+  /* encode token */
+  bufferlist token_buff;
+  bufferlist token_encoded;
+  token_buff.append(auth_token);
+  token_buff.encode_base64(token_encoded);
+  token_encoded.append((char)0);
+
+  /* create json credentials request body */
+  JSONFormatter credentials(false);
+  credentials.open_object_section("");
+  credentials.open_object_section("credentials");
+  credentials.dump_string("access", auth_id);
+  credentials.dump_string("token", token_encoded.c_str());
+  credentials.dump_string("signature", auth_sign);
+  credentials.close_section();
+  credentials.close_section();
+
+  std::stringstream os;
+  credentials.flush(os);
+  set_tx_buffer(os.str());
+
+  /* send request */
+  int ret = process("POST", keystone_url.c_str());
+  if (ret < 0) {
+    dout(2) << "s3 keystone: token validation ERROR: " << rx_buffer.c_str() << dendl;
+    return -EPERM;
+  }
+
+  /* now parse response */
+  if (response.parse(cct, rx_buffer) < 0) {
+    dout(2) << "s3 keystone: token parsing failed" << dendl;
+    return -EPERM;
+  }
+
+  /* check if we have a valid role */
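+  /* (the accepted list comes from rgw_keystone_accepted_roles; holding
+   * any one of the listed roles is enough) */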
+  bool found = false;
+  list<string>::iterator iter;
+  for (iter = roles_list.begin(); iter != roles_list.end(); ++iter) {
+    if ((found = response.user.has_role(*iter)) == true)
+      break;
+  }
+
+  if (!found) {
+    ldout(cct, 5) << "s3 keystone: user does not hold a matching role; required roles: " << cct->_conf->rgw_keystone_accepted_roles << dendl;
+    return -EPERM;
+  }
+
+  /* everything seems fine, continue with this user */
+  ldout(cct, 5) << "s3 keystone: validated token: " << response.token.tenant.name << ":" << response.user.name << " expires: " << response.token.expires << dendl;
+  return 0;
+}
+
+/*
+ * verify that a signed request comes from the keyholder
+ * by checking the signature against our locally-computed version
+ */
+int RGW_Auth_S3::authorize(RGWRados *store, struct req_state *s)
+{
+  bool qsr = false;
+  string auth_id;
+  string auth_sign;
+
+  time_t now;
+  time(&now);
+
+  /* neither keystone nor rados auth is enabled; warn and exit! */
+  if (!store->ctx()->_conf->rgw_s3_auth_use_rados
+      && !store->ctx()->_conf->rgw_s3_auth_use_keystone) {
+    dout(0) << "WARNING: no authorization backend enabled! Users will never authenticate." << dendl;
+    return -EPERM;
+  }
+
+  if (!s->http_auth || !(*s->http_auth)) {
+    auth_id = s->info.args.get("AWSAccessKeyId");
+    if (auth_id.size()) {
+      auth_sign = s->info.args.get("Signature");
+
+      string date = s->info.args.get("Expires");
+      time_t exp = atoll(date.c_str());
+      if (now >= exp)
+        return -EPERM;
+
+      qsr = true;
+    } else {
+      /* anonymous access */
+      rgw_get_anon_user(s->user);
+      s->perm_mask = RGW_PERM_FULL_CONTROL;
+      return 0;
+    }
+  } else {
+    if (strncmp(s->http_auth, "AWS ", 4))
+      return -EINVAL;
+    string auth_str(s->http_auth + 4);
+    int pos = auth_str.find(':');
+    if (pos < 0)
+      return -EINVAL;
+
+    auth_id = auth_str.substr(0, pos);
+    auth_sign = auth_str.substr(pos + 1);
+  }
+
+  /* try keystone auth first */
+  int keystone_result = -EINVAL;
+  if (store->ctx()->_conf->rgw_s3_auth_use_keystone
+      && !store->ctx()->_conf->rgw_keystone_url.empty()) {
+    dout(20) << "s3 keystone: trying keystone auth" << dendl;
+
+    RGW_Auth_S3_Keystone_ValidateToken keystone_validator(store->ctx());
+    string token;
+
+    if (!rgw_create_s3_canonical_header(s->info, &s->header_time, token, qsr)) {
+      dout(10) << "failed to create auth header\n" << token << dendl;
+    } else {
+      keystone_result = keystone_validator.validate_s3token(auth_id, token, auth_sign);
+      if (keystone_result == 0) {
+        s->user.user_id = keystone_validator.response.token.tenant.id;
+        s->user.display_name = keystone_validator.response.token.tenant.name; // wow.
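+        /* the keystone tenant becomes the effective rgw user for this request */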
+
+        /* try to store the user if it does not already exist */
+        if (rgw_get_user_info_by_uid(store, keystone_validator.response.token.tenant.id, s->user) < 0) {
+          int ret = rgw_store_user_info(store, s->user, NULL, NULL, 0, true);
+          if (ret < 0)
+            dout(10) << "NOTICE: failed to store new user's info: ret=" << ret << dendl;
+        }
+
+        s->perm_mask = RGW_PERM_FULL_CONTROL;
+      }
+    }
+  }
+
+  /* keystone failed (or not enabled); check if we want to use rados backend */
+  if (!store->ctx()->_conf->rgw_s3_auth_use_rados
+      && keystone_result < 0)
+    return keystone_result;
+
+  /* now try rados backend, but only if keystone did not succeed */
+  if (keystone_result < 0) {
+    /* get the user info */
+    if (rgw_get_user_info_by_access_key(store, auth_id, s->user) < 0) {
+      dout(5) << "error reading user info, uid=" << auth_id << " can't authenticate" << dendl;
+      return -EPERM;
+    }
+
+    /* now verify signature */
+
+    string auth_hdr;
+    if (!rgw_create_s3_canonical_header(s->info, &s->header_time, auth_hdr, qsr)) {
+      dout(10) << "failed to create auth header\n" << auth_hdr << dendl;
+      return -EPERM;
+    }
+    dout(10) << "auth_hdr:\n" << auth_hdr << dendl;
+
+    time_t req_sec = s->header_time.sec();
+    if ((req_sec < now - RGW_AUTH_GRACE_MINS * 60 ||
+         req_sec > now + RGW_AUTH_GRACE_MINS * 60) && !qsr) {
+      dout(10) << "req_sec=" << req_sec << " now=" << now << "; now - RGW_AUTH_GRACE_MINS=" << now - RGW_AUTH_GRACE_MINS * 60 << "; now + RGW_AUTH_GRACE_MINS=" << now + RGW_AUTH_GRACE_MINS * 60 << dendl;
+      dout(0) << "NOTICE: request time skew too big now=" << utime_t(now, 0) << " req_time=" << s->header_time << dendl;
+      return -ERR_REQUEST_TIME_SKEWED;
+    }
+
+    map<string, RGWAccessKey>::iterator iter = s->user.access_keys.find(auth_id);
+    if (iter == s->user.access_keys.end()) {
+      dout(0) << "ERROR: access key not encoded in user info" << dendl;
+      return -EPERM;
+    }
+    RGWAccessKey& k = iter->second;
+
+    if (!k.subuser.empty()) {
+      map<string, RGWSubUser>::iterator uiter = s->user.subusers.find(k.subuser);
+      if (uiter == s->user.subusers.end()) {
+        dout(0) << "NOTICE: could not find subuser: " << k.subuser << dendl;
+        return -EPERM;
+      }
+      RGWSubUser& subuser = uiter->second;
+      s->perm_mask = subuser.perm_mask;
+    } else
+      s->perm_mask = RGW_PERM_FULL_CONTROL;
+
+    string digest;
+    int ret = rgw_get_s3_header_digest(auth_hdr, k.key, digest);
+    if (ret < 0) {
+      return -EPERM;
+    }
+
+    dout(15) << "calculated digest=" << digest << dendl;
+    dout(15) << "auth_sign=" << auth_sign << dendl;
+    dout(15) << "compare=" << auth_sign.compare(digest) << dendl;
+
+    if (auth_sign != digest)
+      return -EPERM;
+
+    if (s->user.system) {
+      s->system_request = true;
+      dout(20) << "system request" << dendl;
+      s->info.args.set_system();
+      string effective_uid = s->info.args.get(RGW_SYS_PARAM_PREFIX "uid");
+      RGWUserInfo effective_user;
+      if (!effective_uid.empty()) {
+        ret = rgw_get_user_info_by_uid(store, effective_uid, effective_user);
+        if (ret < 0) {
+          ldout(s->cct, 0) << "User lookup failed!" << dendl;
+          return -ENOENT;
+        }
+        s->user = effective_user;
+      }
+    }
+
+  } /* if keystone_result < 0 */
+
+  // populate the owner info
+  s->owner.set_id(s->user.user_id);
+  s->owner.set_name(s->user.display_name);
+
+
+  return 0;
+}
+
+int RGWHandler_Auth_S3::init(RGWRados *store, struct req_state *state, RGWClientIO *cio)
+{
+  int ret = RGWHandler_ObjStore_S3::init_from_header(state, RGW_FORMAT_JSON, true);
+  if (ret < 0)
+    return ret;
+
+  return RGWHandler_ObjStore::init(store, state, cio);
+}
+
+RGWHandler *RGWRESTMgr_S3::get_handler(struct req_state *s)
+{
+  int ret = RGWHandler_ObjStore_S3::init_from_header(s, RGW_FORMAT_XML, false);
+  if (ret < 0)
+    return NULL;
+
+  if (s->bucket_name_str.empty())
+    return new RGWHandler_ObjStore_Service_S3;
+
+  if (!s->object)
+    return new RGWHandler_ObjStore_Bucket_S3;
+
+  return new RGWHandler_ObjStore_Obj_S3;
+}
diff --git a/src/test/librados/cmd.cc b/src/test/librados/cmd.cc
index 4f327a0e84b2b..0a7ed16a18010 100644
--- a/src/test/librados/cmd.cc
+++ b/src/test/librados/cmd.cc
@@ -49,6 +49,41 @@ TEST(LibRadosCmd, MonDescribe) {
   rados_buffer_free(buf);
   rados_buffer_free(st);
 
+  cmd[0] = (char *)"";
+  ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "{}", 2, &buf, &buflen, &st, &stlen));
+  rados_buffer_free(buf);
+  rados_buffer_free(st);
+
+  cmd[0] = (char *)"{}";
+  ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen));
+  rados_buffer_free(buf);
+  rados_buffer_free(st);
+
+  cmd[0] = (char *)"{\"abc\":\"something\"}";
+  ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen));
+  rados_buffer_free(buf);
+  rados_buffer_free(st);
+
+  cmd[0] = (char *)"{\"prefix\":\"\"}";
+  ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen));
+  rados_buffer_free(buf);
+  rados_buffer_free(st);
+
+  cmd[0] = (char *)"{\"prefix\":\" \"}";
+  ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen));
+  rados_buffer_free(buf);
+  rados_buffer_free(st);
+
+  cmd[0] = (char *)"{\"prefix\":\";;;,,,;;,,\"}";
+  ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen));
+  rados_buffer_free(buf);
+  rados_buffer_free(st);
+
+  cmd[0] = (char *)"{\"prefix\":\"extra command\"}";
+  ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen));
+  rados_buffer_free(buf);
+  rados_buffer_free(st);
+
   cmd[0] = (char *)"{\"prefix\":\"mon_status\"}";
   ASSERT_EQ(0, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen));
   ASSERT_LT(0u, buflen);
diff --git a/src/test/librados/cmd.cc.orig b/src/test/librados/cmd.cc.orig
new file mode 100644
index 0000000000000..4f327a0e84b2b
--- /dev/null
+++ b/src/test/librados/cmd.cc.orig
@@ -0,0 +1,210 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "mds/mdstypes.h"
+#include "include/buffer.h"
+#include "include/rbd_types.h"
+#include "include/rados/librados.h"
+#include "include/rados/librados.hpp"
+#include "include/stringify.h"
+#include "test/librados/test.h"
+
+#include "common/Cond.h"
+
+#include "gtest/gtest.h"
+#include <errno.h>
+#include <map>
+#include <sstream>
+#include <string>
+
+using namespace librados;
+using ceph::buffer;
+using std::map;
+using std::ostringstream;
+using std::string;
+
+TEST(LibRadosCmd, MonDescribe) {
+  rados_t cluster;
+  ASSERT_EQ("", connect_cluster(&cluster));
+
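+  // mon commands must be JSON objects carrying a recognized "prefix" field;
+  // malformed or unknown commands are expected to fail with -EINVAL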
+  char *buf, *st;
+  size_t buflen, stlen;
+  char *cmd[2];
+
+  cmd[1] = NULL;
+
+  cmd[0] = (char *)"{\"prefix\":\"get_command_descriptions\"}";
+  ASSERT_EQ(0, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen));
+  ASSERT_LT(0u, buflen);
+  rados_buffer_free(buf);
+  rados_buffer_free(st);
+
+  cmd[0] = (char *)"get_command_descriptions";
+  ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen));
+  rados_buffer_free(buf);
+  rados_buffer_free(st);
+
+  cmd[0] = (char *)"asdfqwer";
+  ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "{}", 2, &buf, &buflen, &st, &stlen));
+  rados_buffer_free(buf);
+  rados_buffer_free(st);
+
+  cmd[0] = (char *)"{\"prefix\":\"mon_status\"}";
+  ASSERT_EQ(0, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen));
+  ASSERT_LT(0u, buflen);
+  //ASSERT_LT(0u, stlen);
+  rados_buffer_free(buf);
+  rados_buffer_free(st);
+  rados_shutdown(cluster);
+}
+
+TEST(LibRadosCmd, MonDescribePP) {
+  Rados cluster;
+  ASSERT_EQ("", connect_cluster_pp(cluster));
+  bufferlist inbl, outbl;
+  string outs;
+  ASSERT_EQ(0, cluster.mon_command("{\"prefix\": \"get_command_descriptions\"}",
+                                   inbl, &outbl, &outs));
+  ASSERT_LT(0u, outbl.length());
+  ASSERT_LE(0u, outs.length());
+  cluster.shutdown();
+}
+
+TEST(LibRadosCmd, OSDCmd) {
+  rados_t cluster;
+  ASSERT_EQ("", connect_cluster(&cluster));
+  int r;
+  char *buf, *st;
+  size_t buflen, stlen;
+  char *cmd[2];
+  cmd[1] = NULL;
+
+  // note: tolerate NXIO here in case the cluster is thrashing out underneath us.
+  cmd[0] = (char *)"asdfasdf";
+  r = rados_osd_command(cluster, 0, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen);
+  ASSERT_TRUE(r == -22 || r == -ENXIO);
+  cmd[0] = (char *)"version";
+  r = rados_osd_command(cluster, 0, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen);
+  ASSERT_TRUE(r == -22 || r == -ENXIO);
+  cmd[0] = (char *)"{\"prefix\":\"version\"}";
+  r = rados_osd_command(cluster, 0, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen);
+  ASSERT_TRUE((r == 0 && buflen > 0) || (r == -ENXIO && buflen == 0));
+  rados_buffer_free(buf);
+  rados_buffer_free(st);
+  rados_shutdown(cluster);
+}
+
+TEST(LibRadosCmd, PGCmd) {
+  rados_t cluster;
+  std::string pool_name = get_temp_pool_name();
+  ASSERT_EQ("", create_one_pool(pool_name, &cluster));
+
+  char *buf, *st;
+  size_t buflen, stlen;
+  char *cmd[2];
+  cmd[1] = NULL;
+
+  int64_t poolid = rados_pool_lookup(cluster, pool_name.c_str());
+  ASSERT_LT(0, poolid);
+
+  string pgid = stringify(poolid) + ".0";
+
+  cmd[0] = (char *)"asdfasdf";
+  // note: tolerate NXIO here in case the cluster is thrashing out underneath us.
+  int r = rados_pg_command(cluster, pgid.c_str(), (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen);
+  ASSERT_TRUE(r == -22 || r == -ENXIO);
+
+  // make sure the pg exists on the osd before we query it
+  rados_ioctx_t io;
+  rados_ioctx_create(cluster, pool_name.c_str(), &io);
+  for (int i=0; i<100; i++) {
+    string oid = "obj" + stringify(i);
+    ASSERT_EQ(-ENOENT, rados_stat(io, oid.c_str(), NULL, NULL));
+  }
+  rados_ioctx_destroy(io);
+
+  string qstr = "{\"prefix\":\"pg\", \"cmd\":\"query\", \"pgid\":\"" + pgid + "\"}";
+  cmd[0] = (char *)qstr.c_str();
+  // note: tolerate ENOENT/ENXIO here if the osd is thrashing out underneath us
+  r = rados_pg_command(cluster, pgid.c_str(), (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen);
+  ASSERT_TRUE(r == 0 || r == -ENOENT || r == -ENXIO);
+
+  ASSERT_LT(0u, buflen);
+  rados_buffer_free(buf);
+  rados_buffer_free(st);
+
+  ASSERT_EQ(0, destroy_one_pool(pool_name, &cluster));
+}
+
+struct Log {
+  list<string> log;
+  Cond cond;
+  Mutex lock;
+
+  Log() : lock("l::lock") {}
+
+  bool contains(string str) {
+    Mutex::Locker l(lock);
+    for (list<string>::iterator p = log.begin(); p != log.end(); ++p) {
+      if (p->find(str) != std::string::npos)
+        return true;
+    }
+    return false;
+  }
+};
+
+void log_cb(void *arg,
+            const char *line,
+            const char *who, uint64_t stampsec, uint64_t stamp_nsec,
+            uint64_t seq, const char *level,
+            const char *msg) {
+  Log *l = static_cast<Log *>(arg);
+  Mutex::Locker locker(l->lock);
+  l->log.push_back(line);
+  l->cond.Signal();
+  cout << "got: " << line << std::endl;
+}
+
+TEST(LibRadosCmd, WatchLog) {
+  rados_t cluster;
+  ASSERT_EQ("", connect_cluster(&cluster));
+  char *buf, *st;
+  char *cmd[2];
+  cmd[1] = NULL;
+  size_t buflen, stlen;
+  Log l;
+
+  ASSERT_EQ(0, rados_monitor_log(cluster, "info", log_cb, &l));
+  cmd[0] = (char *)"{\"prefix\":\"log\", \"logtext\":[\"onexx\"]}";
+  ASSERT_EQ(0, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen));
+  for (int i=0; !l.contains("onexx"); i++) {
+    ASSERT_TRUE(i<100);
+    sleep(1);
+  }
+  ASSERT_TRUE(l.contains("onexx"));
+
+  /*
+    changing the subscribe level is currently broken.
+
+    cmd[0] = (char *)"{\"prefix\":\"log\", \"logtext\":[\"twoxx\"]}";
+    ASSERT_EQ(0, rados_monitor_log(cluster, "err", log_cb, &l));
+    ASSERT_EQ(0, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen));
+    sleep(2);
+    ASSERT_FALSE(l.contains("twoxx"));
+  */
+
+  ASSERT_EQ(0, rados_monitor_log(cluster, "info", log_cb, &l));
+  cmd[0] = (char *)"{\"prefix\":\"log\", \"logtext\":[\"threexx\"]}";
+  ASSERT_EQ(0, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen));
+  for (int i=0; !l.contains("threexx"); i++) {
+    ASSERT_TRUE(i<100);
+    sleep(1);
+  }
+
+  ASSERT_EQ(0, rados_monitor_log(cluster, "info", NULL, NULL));
+  cmd[0] = (char *)"{\"prefix\":\"log\", \"logtext\":[\"fourxx\"]}";
+  ASSERT_EQ(0, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen));
+  sleep(2);
+  ASSERT_FALSE(l.contains("fourxx"));
+  rados_shutdown(cluster);
+}