Abstract

Collaborative filtering is an important personalization method in recommender systems for internet commerce. Traditional collaborative filtering, which relies on absolute ratings for items, is often impractical: users find it difficult to assign accurate absolute ratings, and different users produce different rating distributions. This tutorial shows how to use Apache Hama to calculate TCF.

Implementation

...

Get the pairs of all row key combinations w/o repetition

We don't have to recalculate the same value pair with reversed order.
ex) similar(UserA, UserB) == similar(UserB, UserA).

In this case, duplicate pairs are discarded from the full set of possible combinations before the result is returned. Since there will be mC2 combinations (m : num keys), one can optimize it so that each reducer handles about mC2 / N values (N : num-reducers). Something like:

```// Pseudocode: maps the unordered pair (i, j), i < j, of row keys to a reducer
// so that each of the N reducers receives roughly mC2 / N pairs.
partition(index i, key key_j, int N) { // N is num reducers
// target number of pairs handled by each reducer
int dataPerRed = mC2 / N; // assuming m (the total number of keys) is known
int prev_sum = 0;
// count the pairs contributed by every first-index smaller than i
for (int k = 1; k < i; k++) {
prev_sum += m - k + 1; // pairs whose first index is k
// NOTE(review): with 1-based indexes, index k contributes m - k pairs
// ((k,k+1)..(k,m)); the "+ 1" here and below looks off by one — verify.
}
prev_sum += j - i + 1; // rank of (i, j) among the pairs starting at i
// NOTE(review): prev_sum % dataPerRed yields 0..dataPerRed-1, not a reducer
// id in 0..N-1; prev_sum / dataPerRed may be what was intended — verify.
return prev_sum % dataPerRed;
}```

...

...

Pseudo code for TCF with Hama

```import java.math.BigInteger;

import org.apache.hama.HamaConfiguration;
import org.apache.hama.Matrix;
import org.apache.hama.Vector;

// Raw user-by-item rating data: data[user][item], 3 users x 4 items.
// main() copies these values into the Hama "userByItem" matrix.
public static double[][] data = {
{ 2, 5, 1, 4 },
{ 4, 1, 3, 3 },
{ 3, 4, 2, 4 }
};

/**
 * Demonstrates the TCF flow: load the ratings into a Hama matrix, enumerate
 * every unordered pair of users, and print the dot product of their rating
 * rows as the pairwise similarity.
 */
public static void main(String[] args) {
  HamaConfiguration config = new HamaConfiguration();

  // 1. Build a user-by-item matrix and fill its entries from the raw data.
  Matrix userByItem = new Matrix(config, "userByItem");
  for (int row = 0; row < 3; row++) {
    for (int col = 0; col < 4; col++) {
      userByItem.set(row, col, data[row][col]);
    }
  }

  // 2. Enumerate every unordered pair of row keys.
  Combination pairs = new Combination(data.length, 2);

  // 3. |a|·|b|cos(q) calculation via the dot product of the two rows.
  while (pairs.hasMore()) {
    int[] current = pairs.getNext();
    System.out.print("Similarity: (" + current[0] + ", " + current[1] + ") = ");

    Vector first = userByItem.getRow(current[0]);
    Vector second = userByItem.getRow(current[1]);

    // 4. Collect the similarity result of the two users.
    System.out.println(first.dot(second));
  }

  // Each part can be applied to large-scale job scheduling using Map/Reduce.
}

/**
 * Generates all C(n, r) combinations of the integers 0..n-1 in lexicographic
 * order.
 *
 * NOTE: getNext() returns its internal array in place; callers that need to
 * keep a combination across iterations must copy it. Not thread-safe.
 */
static class Combination {

  private int[] a;            // current combination; reused by getNext()
  private int n;              // size of the source set
  private int r;              // size of each combination
  private BigInteger numLeft; // combinations not yet returned
  private BigInteger total;   // total number of combinations, C(n, r)

  /**
   * @param n number of elements to choose from; must be >= 1
   * @param r number of elements per combination; must not exceed n
   * @throws IllegalArgumentException if r > n or n < 1
   */
  public Combination(int n, int r) {
    if (r > n) {
      throw new IllegalArgumentException();
    }
    if (n < 1) {
      throw new IllegalArgumentException();
    }
    this.n = n;
    this.r = r;
    a = new int[r];
    // total = n! / (r! * (n - r)!)
    BigInteger nFact = getFactorial(n);
    BigInteger rFact = getFactorial(r);
    BigInteger nminusrFact = getFactorial(n - r);
    total = nFact.divide(rFact.multiply(nminusrFact));
    reset();
  }

  /** Restarts the iteration at the first combination {0, 1, .., r-1}. */
  public void reset() {
    for (int i = 0; i < a.length; i++) {
      a[i] = i;
    }
    numLeft = total;
  }

  /** @return the number of combinations not yet returned by getNext(). */
  public BigInteger getNumLeft() {
    return numLeft;
  }

  /** @return true while getNext() has at least one combination left. */
  public boolean hasMore() {
    return numLeft.compareTo(BigInteger.ZERO) > 0;
  }

  /** @return the total number of combinations, C(n, r). */
  public BigInteger getTotal() {
    return total; // FIX: body was empty — missing return did not compile
  }

  /** @return value! as a BigInteger (1 for value <= 1). */
  private BigInteger getFactorial(int value) { // renamed from n: shadowed field
    BigInteger fact = BigInteger.ONE;
    for (int i = value; i > 1; i--) {
      fact = fact.multiply(BigInteger.valueOf(i));
    }
    return fact;
  }

  /**
   * Advances to and returns the next combination in lexicographic order.
   * The returned array is internal state and is overwritten by later calls.
   */
  public int[] getNext() {
    if (numLeft.equals(total)) {
      // First call: return the initial combination prepared by reset().
      numLeft = numLeft.subtract(BigInteger.ONE);
      return a;
    }

    // Find the rightmost element that has not reached its maximum value...
    int i = r - 1;
    while (a[i] == n - r + i) {
      i--;
    }
    // ...increment it, and restart each element to its right just above it.
    a[i]++;
    for (int j = i + 1; j < r; j++) {
      a[j] = a[i] + j - i;
    }

    numLeft = numLeft.subtract(BigInteger.ONE);
    return a;
  }
}
}```