Commit horrifying db cleanup script

This commit is contained in:
Raphael 2011-01-14 10:49:45 -08:00
parent a0489112b7
commit 3f7b89ac86

177
script/sanitize_database.rb Normal file
View file

@ -0,0 +1,177 @@
@start_time = Time.now.to_i
@count = 0
def sanitize_user(user)
log "Sanitizing user #{@count += 1}: #{user.username}"
people = Person.all(:owner_id => user.id)
log "#{user.username} has #{people.count} person objects."
people.sort_by {|person| contact_count(person)}
keep_person = people.last
dumb_people = people[0..(people.count)]
d_p_ids = dumb_people.map{|p| "ObjectId('#{p.id.to_s}')"}
d_p_ids_json = "[#{d_p_ids.join(',')}]"
["posts", "comments", "contacts"].each do |table_name|
eval_string = <<-JS
db.#{table_name}.find({ "person_id" : {"$in" : #{d_p_ids_json}}}).forEach(function(document){
db.#{table_name}.update({"_id" : document["_id"]}, {"$set" : { "person_id" : ObjectId("#{keep_person.id.to_s}")}});
});
JS
MongoMapper.database.eval eval_string
end
['from_id', 'to_id'].each do |key|
eval_string = <<-JS
db.requests.find({ "#{key}" : {"$in" : #{d_p_ids_json}}}).forEach(function(document){
db.requests.update({"_id" : document["_id"]}, {"$set" : { "#{key}" : ObjectId("#{keep_person.id.to_s}")}});
});
JS
MongoMapper.database.eval eval_string
end
"Ids for user #{user.username} set to one person"
dumb_people.each{|dumb| dumb.delete}
if user.serialized_private_key
keep_person.serialized_public_key = OpenSSL::PKey::RSA.new(user.serialized_private_key).public_key
keep_person.save
else
log "#{user.username} HAS NO ENCRYPTION KEY"
end
end
def log string
time_diff = Time.now.to_i - @start_time
puts "#{time_diff}s; #{string}"
end
def contact_count person
@contact_counts ||= {}
return @contact_counts[person.id] if @contact_counts[person.id]
query_result = @contacts_for_people_collection.find("_id" => person.id).first
if query_result
@contact_counts[person.id] = query_result["value"]
else
@contact_counts[person.id] = 0
end
@contact_counts[person.id]
end
def get_user_ids
cmd = BSON::OrderedHash.new
cmd["mapreduce"] = "people"
cmd["map"] = 'function(){ emit(this["owner_id"], 1)};'
cmd["reduce"] = 'function(key, vals) {' +
'var sum=0;' +
'for(var i in vals) sum += vals[i];' +
'return sum;' +
'};'
result = MongoMapper.database.command(cmd)
collection = MongoMapper.database.collection(result["result"])
collection.find("value" => {"$gte" => 2}).map{|r| r["_id"]}
end
def contacts_for_people_collection
cmd = BSON::OrderedHash.new
cmd["mapreduce"] = "contacts"
cmd["map"] = 'function(){ emit(this["person_id"], 1)};'
cmd["reduce"] = 'function(key, vals) {' +
'var sum=0;' +
'for(var i in vals) sum += vals[i];' +
'return sum;' +
'};'
result = MongoMapper.database.command(cmd)
MongoMapper.database.collection(result["result"])
end
user_ids = get_user_ids
@contacts_for_people_collection = contacts_for_people_collection
users = User.where(:id.in => user_ids).all
log "#{users.size} Users retreived."
users.each{ |user| sanitize_user(user) }
log "Eliminating local people with no corresponding user."
MongoMapper.database.eval <<-MOREJS
db.people.find().forEach(
function(doc){
if(doc["owner_id"] != null && db.users.count({"_id" : doc["owner_id"]}) == 0){
db.people.remove({"_id" : doc["_id"]});
}
}
);
MOREJS
def dup_user_emails
cmd = BSON::OrderedHash.new
cmd["mapreduce"] = "users"
cmd["map"] = 'function(){ emit(this["email"], 1)};'
cmd["reduce"] = 'function(key, vals) {' +
'var sum=0;' +
'for(var i in vals) sum += vals[i];' +
'return sum;' +
'};'
result = MongoMapper.database.command(cmd)
coll = MongoMapper.database.collection(result["result"])
user_emails = coll.find("value" => {"$gte" => 2}).map{|r| r["_id"]}
end
emails = dup_user_emails
log "Eliminating #{emails.count} users with duplicate emails"
users_coll = MongoMapper.database.collection("users")
users_coll.remove("email" => {"$in" => emails})
def dup_requests
cmd = BSON::OrderedHash.new
cmd["mapreduce"] = "requests"
cmd["map"] = 'function(){ emit(this["from_id"].toString() + "," + this["to_id"].toString(), {"array" : [this["_id"]], "count" : 1 })};'
cmd["reduce"] = 'function(key, vals) {' +
'var result = {"array" : [], "count" : 0};' +
'for(var i in vals){' +
'result["array"] = result["array"].concat(vals[i]["array"]);' +
'result["count"] += vals[i]["count"];' +
'}' +
'return result;' +
'};'
result = MongoMapper.database.command(cmd)
coll = MongoMapper.database.collection(result["result"])
#FIND WHERE "array" size is greater than 1
coll.find({"value.count" => {"$gte" => 2}}).map{|r| r["value"]["array"]}
end
non_unique_requests = dup_requests
non_unique_requests.each{|request_id_array| request_id_array.pop}
non_unique_requests.flatten!
log "Eliminating #{non_unique_requests.length} duplicate requests"
req_coll = MongoMapper.database.collection("requests")
req_coll.remove("_id" => {"$in" => non_unique_requests})
def dup_contacts
cmd = BSON::OrderedHash.new
cmd["mapreduce"] = "contacts"
cmd["map"] = 'function(){ emit(this["person_id"].toString() + "," + this["user_id"].toString(), {"array" : [this["_id"]], "count" : 1 })};'
cmd["reduce"] = 'function(key, vals) {' +
'var result = {"array" : [], "count" : 0};' +
'for(var i in vals){' +
'result["array"] = result["array"].concat(vals[i]["array"]);' +
'result["count"] += vals[i]["count"];' +
'}' +
'return result;' +
'};'
result = MongoMapper.database.command(cmd)
coll = MongoMapper.database.collection(result["result"])
#FIND WHERE "array" size is greater than 1
coll.find({"value.count" => {"$gte" => 2}}).map{|r| r["value"]["array"]}
end
non_unique_contacts = dup_contacts
non_unique_contacts.each{|contact_id_array| contact_id_array.pop}
non_unique_contacts.flatten!
log "Eliminating #{non_unique_contacts.length} duplicate contacts"
req_coll = MongoMapper.database.collection("contacts")
req_coll.remove("_id" => {"$in" => non_unique_contacts})