HiveBrain v1.2.0
Get Started
← Back to all entries
patternjavascriptMinor

Retrieving StackExchange sites and tags

Submitted by: @import:stackexchange-codereview··
0
Viewed 0 times
sitestagsstackexchangeandretrieving

Problem

I have setup agenda tasks to periodically retrieve and possibly insert new SE sites and tags in my MongoDB database.

Some things that bother me in particular:

  • Lots of code duplication



  • No string formatting to inject variables in it



  • Many callbacks -- is this the preferred way to work?



  • Updating/inserting entries in the database feels as if it does too much work



```
var Agenda = require('agenda');
var gzip_get = require('gzip-get');
var db = require('./database.js');
var agenda = new Agenda({ db: { address: 'localhost:27017/agenda' } });

agenda.define('insert tags', function (job, done) {
console.log("start inserting new tags");
console.log("requesting tags from server");

db.collection('sites').find().toArray(function (err, sites) {
sites.forEach(function (site) {
retrieveTags(site.site, 1);
});
});

done();
});

agenda.define('insert sites', function (job, done) {
console.log("start inserting new sites");
console.log("requesting sites from server");

retrieveSites(1);

done();
});

var retrieveSites = function (page) {
gzip_get('http://api.stackexchange.com/2.2/sites?page=' + page, function (err, data) {
if (err) {
console.error(err);
} else {
var json = JSON.parse(data);
if(json.error_message){
console.error(json.error_message);
return;
}

var sites = [];
for (var i = 0; i 5){
console.log("reached page 5 of tags for site " + site);
// It's impossible to query all tags -- Stack Overflow alone already has close to 1500 pages with tags.
// Possible workaround: get all the tags from a data dump and update it daily with the newest tags of that day?
return;
}

var url = 'https://api.stackexchange.com/2.2/tags?order=desc&sort=popular&site=' + site + '&page=' + page;
gzip_get(url, function (err, data) {
if (err) {

Solution

Use map/reduce

Whenever you can, you should consider using map functions to transform data. or example, you transform a JSON items array, to a sites array. This is currently done like:

var sites = [];
for (var i = 0; i < json.items.length; i++) {
    var site = json.items[i];
    sites.push({
        title: site.name,
        icon: site.icon_url,
        site: site.api_site_parameter
    });
}


but could be done as:

var sites = json.items.map(function(site) {
    return {
        title: site.name,
        icon: site.icon_url,
        site: site.api_site_parameter
    };
});


The same can be done in the tags code block.
Arrow Code

JavaScript is particularly vulnerable to arrow code, especially given the regular callback functions embedded in the code. You need to consider each opportunity you have to prevent nesting statements deeply, otherwise your readability suffers.

Consider this code:

var retrieveSites = function (page) {
    gzip_get('http://api.stackexchange.com/2.2/sites?page=' + page, function (err, data) {
        if (err) {
            console.error(err);
        } else {
            var json = JSON.parse(data);
            if(json.error_message){
                console.error(json.error_message);
                return;
            }

            var sites = [];
            for (var i = 0; i < json.items.length; i++) {
                var site = json.items[i];
                sites.push({
                    title: site.name,
                    icon: site.icon_url,
                    site: site.api_site_parameter
                });
            }
            console.log("found " + sites.length + " sites");

            var siteCollection = db.collection('sites', { strict: true });
            sites.forEach(function (site) {
                siteCollection.findOne({ title: site.title }, function (err, doc) {
                    if (!doc) {
                        siteCollection.insert(site, { safe: true }, function (err, result) {
                            if (err) {
                                console.error("could not insert site " + site.title);
                            }
                            if (result) {
                                console.log("inserted site " + site.title);
                            }
                        });
                    } else {
                        console.log("skipping site " + site.title + " because it already exists");
                    }
                });
            });

            if (json.has_more === true) {
                retrieveSites(page + 1);
            }
        }
    });
};


Just by rearranging the conditions, and returning early, you can prevent nesting:

var retrieveSites = function (page) {
    gzip_get('http://api.stackexchange.com/2.2/sites?page=' + page, function (err, data) {
        if (err) {
            console.error(err);
            return;
        }

        var json = JSON.parse(data);
        if(json.error_message){
            console.error(json.error_message);
            return;
        }

        var sites = [];
        for (var i = 0; i < json.items.length; i++) {
            var site = json.items[i];
            sites.push({
                title: site.name,
                icon: site.icon_url,
                site: site.api_site_parameter
            });
        }
        console.log("found " + sites.length + " sites");

        var siteCollection = db.collection('sites', { strict: true });

        sites.forEach(function (site) {
            siteCollection.findOne({ title: site.title }, function (err, doc) {
                if (doc) {
                    console.log("skipping site " + site.title + " because it already exists");
                    return;
                }

                siteCollection.insert(site, { safe: true }, function (err, result) {
                    if (err) {
                        console.error("could not insert site " + site.title);
                        return;
                    }
                    console.log("inserted site " + site.title);
                });

            });

            if (json.has_more === true) {
                retrieveSites(page + 1);
            }
        }
    });
};

Code Snippets

var sites = [];
for (var i = 0; i < json.items.length; i++) {
    var site = json.items[i];
    sites.push({
        title: site.name,
        icon: site.icon_url,
        site: site.api_site_parameter
    });
}
var sites = json.items.map(function(site) {
    return {
        title: site.name,
        icon: site.icon_url,
        site: site.api_site_parameter
    };
});
var retrieveSites = function (page) {
    gzip_get('http://api.stackexchange.com/2.2/sites?page=' + page, function (err, data) {
        if (err) {
            console.error(err);
        } else {
            var json = JSON.parse(data);
            if(json.error_message){
                console.error(json.error_message);
                return;
            }

            var sites = [];
            for (var i = 0; i < json.items.length; i++) {
                var site = json.items[i];
                sites.push({
                    title: site.name,
                    icon: site.icon_url,
                    site: site.api_site_parameter
                });
            }
            console.log("found " + sites.length + " sites");

            var siteCollection = db.collection('sites', { strict: true });
            sites.forEach(function (site) {
                siteCollection.findOne({ title: site.title }, function (err, doc) {
                    if (!doc) {
                        siteCollection.insert(site, { safe: true }, function (err, result) {
                            if (err) {
                                console.error("could not insert site " + site.title);
                            }
                            if (result) {
                                console.log("inserted site " + site.title);
                            }
                        });
                    } else {
                        console.log("skipping site " + site.title + " because it already exists");
                    }
                });
            });

            if (json.has_more === true) {
                retrieveSites(page + 1);
            }
        }
    });
};
var retrieveSites = function (page) {
    gzip_get('http://api.stackexchange.com/2.2/sites?page=' + page, function (err, data) {
        if (err) {
            console.error(err);
            return;
        }

        var json = JSON.parse(data);
        if(json.error_message){
            console.error(json.error_message);
            return;
        }

        var sites = [];
        for (var i = 0; i < json.items.length; i++) {
            var site = json.items[i];
            sites.push({
                title: site.name,
                icon: site.icon_url,
                site: site.api_site_parameter
            });
        }
        console.log("found " + sites.length + " sites");

        var siteCollection = db.collection('sites', { strict: true });

        sites.forEach(function (site) {
            siteCollection.findOne({ title: site.title }, function (err, doc) {
                if (doc) {
                    console.log("skipping site " + site.title + " because it already exists");
                    return;
                }

                siteCollection.insert(site, { safe: true }, function (err, result) {
                    if (err) {
                        console.error("could not insert site " + site.title);
                        return;
                    }
                    console.log("inserted site " + site.title);
                });

            });

            if (json.has_more === true) {
                retrieveSites(page + 1);
            }
        }
    });
};

Context

StackExchange Code Review Q#104935, answer score: 3

Revisions (0)

No revisions yet.